summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorphilip.liard@gmail.com <philip.liard@gmail.com@ee073f10-1060-11df-b6a4-87a95322a99c>2011-09-21 17:43:54 +0000
committerphilip.liard@gmail.com <philip.liard@gmail.com@ee073f10-1060-11df-b6a4-87a95322a99c>2011-09-21 17:43:54 +0000
commit6a0a07f4f0066eba2dc9bb81465f4e67d670c7b6 (patch)
tree0ae8d76ac1c43f0aada6fcf6b2c4449b8340e1da
parent70942011411fc60b15cddf0a3032b1614777dbc4 (diff)
downloadphonenumbers-6a0a07f4f0066eba2dc9bb81465f4e67d670c7b6.tar.gz
CPP: Add phonenumbermatcher.
git-svn-id: http://libphonenumber.googlecode.com/svn/trunk/cpp/src/phonenumbers@356 ee073f10-1060-11df-b6a4-87a95322a99c
-rw-r--r--encoding_utils.h11
-rw-r--r--phonenumbermatch.cc91
-rw-r--r--phonenumbermatch.h125
-rw-r--r--phonenumbermatcher.cc626
-rw-r--r--phonenumbermatcher.h158
-rw-r--r--phonenumberutil.h3
-rw-r--r--stringutil.cc58
-rw-r--r--stringutil.h19
8 files changed, 1091 insertions, 0 deletions
diff --git a/encoding_utils.h b/encoding_utils.h
index 85819a5..415ce62 100644
--- a/encoding_utils.h
+++ b/encoding_utils.h
@@ -16,6 +16,7 @@
#define I18N_PHONENUMBERS_ENCODING_UTILS_H_
#include "base/basictypes.h"
+#include "phonenumbers/utf/unilib.h"
#include "phonenumbers/utf/utf.h"
namespace i18n {
@@ -32,6 +33,16 @@ class EncodingUtils {
*out = r;
return len;
}
+
+ // Returns a pointer positioned just past the character that starts at
+ // buf_utf8, using UniLib::OneCharLen() to obtain that character's length in
+ // bytes.
+ static const char* AdvanceOneUTF8Character(const char* buf_utf8) {
+ return buf_utf8 + UniLib::OneCharLen(buf_utf8);
+ }
+
+ // Steps backwards from end, one byte at a time, until reaching either start
+ // or a byte for which UniLib::IsTrailByte() is false, and returns that
+ // position. When end is not after start, end is returned unchanged.
+ static const char* BackUpOneUTF8Character(const char* start,
+                                           const char* end) {
+   while (start < end) {
+     --end;
+     if (!UniLib::IsTrailByte(*end)) {
+       break;
+     }
+   }
+   return end;
+ }
};
} // namespace phonenumbers
diff --git a/phonenumbermatch.cc b/phonenumbermatch.cc
new file mode 100644
index 0000000..bcf5efb
--- /dev/null
+++ b/phonenumbermatch.cc
@@ -0,0 +1,91 @@
+// Copyright (C) 2011 The Libphonenumber Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Tao Huang
+//
+// Implementation of a mutable match of a phone number within a piece of
+// text. Matches may be found using PhoneNumberUtil::FindNumbers.
+
+#include "phonenumbers/phonenumbermatch.h"
+
+#include <string>
+
+#include "phonenumbers/phonenumber.h"
+#include "phonenumbers/phonenumber.pb.h"
+#include "phonenumbers/stringutil.h"
+
+namespace i18n {
+namespace phonenumbers {
+
+// Constructs a match from a start index, the raw matched text and the parsed
+// number. Each argument is copied into the corresponding member.
+PhoneNumberMatch::PhoneNumberMatch(int start,
+ const string& raw_string,
+ const PhoneNumber& number)
+ : start_(start), raw_string_(raw_string), number_(number) {
+}
+
+// Constructs an empty match: the start index is -1, the raw string is empty
+// and the number is a copy of PhoneNumber::default_instance().
+PhoneNumberMatch::PhoneNumberMatch()
+ : start_(-1), raw_string_(""), number_(PhoneNumber::default_instance()) {
+}
+
+// Returns a reference to the stored phone number.
+const PhoneNumber& PhoneNumberMatch::number() const {
+ return number_;
+}
+
+// Returns the stored start index.
+int PhoneNumberMatch::start() const {
+ return start_;
+}
+
+// Returns the exclusive end index: the start index plus the length of the raw
+// string.
+int PhoneNumberMatch::end() const {
+  // The sum is computed in the string's unsigned size type, exactly as the
+  // mixed int/size_t expression would be, and then converted back to int.
+  const string::size_type raw_length = raw_string_.length();
+  return static_cast<int>(start_ + raw_length);
+}
+
+// Returns the length of the raw string, converted to int.
+int PhoneNumberMatch::length() const {
+  const string::size_type raw_length = raw_string_.length();
+  return static_cast<int>(raw_length);
+}
+
+// Returns a reference to the stored raw string.
+const string& PhoneNumberMatch::raw_string() const {
+ return raw_string_;
+}
+
+// Replaces the stored start index.
+void PhoneNumberMatch::set_start(int start) {
+ start_ = start;
+}
+
+// Replaces the stored raw string with a copy of raw_string. Because end() and
+// length() are derived from the raw string, their results change as well.
+void PhoneNumberMatch::set_raw_string(const string& raw_string) {
+ raw_string_ = raw_string;
+}
+
+// Replaces the stored number by calling CopyFrom() on it with the given
+// number.
+void PhoneNumberMatch::set_number(const PhoneNumber& number) {
+ number_.CopyFrom(number);
+}
+
+// Returns a debug string of the form "PhoneNumberMatch [start,end) raw",
+// built from start(), end() and the raw string.
+string PhoneNumberMatch::ToString() const {
+ return StrCat("PhoneNumberMatch [", start(), ",", end(), ") ",
+ raw_string_.c_str());
+}
+
+// Returns true when the given match has the same number (as judged by
+// ExactlySameAs), the same raw string and the same start index as the
+// receiver.
+bool PhoneNumberMatch::Equals(const PhoneNumberMatch& match) const {
+  if (!ExactlySameAs(match.number_, number_)) {
+    return false;
+  }
+  if (match.raw_string_.compare(raw_string_) != 0) {
+    return false;
+  }
+  return match.start_ == start_;
+}
+
+// Overwrites the receiver's raw string, start index and number with those of
+// the given match.
+void PhoneNumberMatch::CopyFrom(const PhoneNumberMatch& match) {
+  const PhoneNumberMatch& source = match;
+  raw_string_ = source.raw_string_;
+  start_ = source.start_;
+  number_ = source.number_;
+}
+
+} // namespace phonenumbers
+} // namespace i18n
diff --git a/phonenumbermatch.h b/phonenumbermatch.h
new file mode 100644
index 0000000..5ebfd9d
--- /dev/null
+++ b/phonenumbermatch.h
@@ -0,0 +1,125 @@
+// Copyright (C) 2011 The Libphonenumber Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Tao Huang
+//
+// A mutable match of a phone number within a piece of text.
+// Matches may be found using PhoneNumberUtil::FindNumbers.
+//
+// A match consists of the phone number as well as the start and end offsets of
+// the corresponding subsequence of the searched text. Use raw_string() to
+// obtain a copy of the matched subsequence.
+//
+// The following annotated example clarifies the relationship between the
+// searched text, the match offsets, and the parsed number:
+//
+// string text = "Call me at +1 425 882-8080 for details.";
+// const string country = "US";
+//
+// // Find the first phone number match:
+// PhoneNumberMatcher matcher(text, country);
+// if (matcher.HasNext()) {
+// PhoneNumberMatch match;
+// matcher.Next(&match);
+// }
+//
+// // raw_string() contains the phone number as it appears in the text.
+// "+1 425 882-8080" == match.raw_string();
+//
+// // start() and end() define the range of the matched subsequence.
+// string subsequence = text.substr(match.start(), match.length());
+// "+1 425 882-8080" == subsequence;
+//
+// // number() returns the same result as PhoneNumberUtil::Parse()
+// // invoked on raw_string().
+// const PhoneNumberUtil& util = *PhoneNumberUtil::GetInstance();
+// util.Parse(match.raw_string(), country).Equals(match.number());
+//
+// This class is a port of PhoneNumberMatch.java
+
+#ifndef I18N_PHONENUMBERS_PHONENUMBERMATCH_H_
+#define I18N_PHONENUMBERS_PHONENUMBERMATCH_H_
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "phonenumbers/phonenumber.pb.h"
+
+namespace i18n {
+namespace phonenumbers {
+
+using std::string;
+
+// Holds one phone number found in a piece of text: the start index of the
+// match, the raw matched substring and the parsed PhoneNumber.
+class PhoneNumberMatch {
+ public:
+ // Creates a new match.
+ // - start is the index into the target text.
+ // - raw_string is the matched string of the target text.
+ // - number is the matched phone number.
+ PhoneNumberMatch(int start,
+ const string& raw_string,
+ const PhoneNumber& number);
+
+ // Default constructor. Creates an empty match whose start index is -1 and
+ // whose raw string is empty.
+ PhoneNumberMatch();
+
+ ~PhoneNumberMatch() {}
+
+ // Returns the phone number matched by the receiver.
+ const PhoneNumber& number() const;
+
+ // Returns the start index of the matched phone number within the searched
+ // text.
+ int start() const;
+
+ // Returns the exclusive end index of the matched phone number within the
+ // searched text, i.e. start() plus the length of the raw string.
+ int end() const;
+
+ // Returns the length of the text matched in the searched text.
+ int length() const;
+
+ // Returns the raw string matched as a phone number in the searched text.
+ const string& raw_string() const;
+
+ // Returns a string containing debug information.
+ string ToString() const;
+
+ // Sets the start index of the match.
+ void set_start(int start);
+
+ // Sets the raw matched string. This also changes what end() and length()
+ // return, since both are derived from the raw string.
+ void set_raw_string(const string& raw_string);
+
+ // Sets the matched phone number.
+ void set_number(const PhoneNumber& number);
+
+ // Returns true if the given match has the same number, raw string and start
+ // index as the receiver.
+ bool Equals(const PhoneNumberMatch& number) const;
+
+ // Copies the raw string, start index and number of the given match into the
+ // receiver.
+ void CopyFrom(const PhoneNumberMatch& number);
+
+ private:
+ // The start index into the text.
+ int start_;
+
+ // The raw substring matched.
+ string raw_string_;
+
+ // The matched phone number.
+ PhoneNumber number_;
+
+ DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatch);
+};
+
+} // namespace phonenumbers
+} // namespace i18n
+
+#endif // I18N_PHONENUMBERS_PHONENUMBERMATCH_H_
diff --git a/phonenumbermatcher.cc b/phonenumbermatcher.cc
new file mode 100644
index 0000000..a3d528f
--- /dev/null
+++ b/phonenumbermatcher.cc
@@ -0,0 +1,626 @@
+// Copyright (C) 2011 The Libphonenumber Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Lara Rennie
+// Author: Tao Huang
+//
+// Implementation of a stateful class that finds and extracts telephone numbers
+// from text.
+
+#include "phonenumbers/phonenumbermatcher.h"
+
+#ifndef USE_ICU_REGEXP
+#error phonenumbermatcher depends on ICU (i.e. USE_ICU_REGEXP must be set)
+#endif // USE_ICU_REGEXP
+
+#include <limits>
+#include <string>
+#include <unicode/uchar.h>
+
+#include "base/logging.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/memory/singleton.h"
+#include "phonenumbers/default_logger.h"
+#include "phonenumbers/encoding_utils.h"
+#include "phonenumbers/normalize_utf8.h"
+#include "phonenumbers/phonenumber.pb.h"
+#include "phonenumbers/phonenumbermatch.h"
+#include "phonenumbers/phonenumberutil.h"
+#include "phonenumbers/regexp_adapter.h"
+#include "phonenumbers/regexp_adapter_icu.h"
+#include "phonenumbers/stringutil.h"
+
+#ifdef USE_RE2
+#include "phonenumbers/regexp_adapter_re2.h"
+#endif // USE_RE2
+
+using std::numeric_limits;
+using std::string;
+using std::vector;
+
+namespace i18n {
+namespace phonenumbers {
+
+namespace {
+// Returns a regular expression quantifier with an upper and lower limit, in
+// the form "{lower,upper}". The DCHECKs assert that lower >= 0, upper > 0 and
+// lower < upper.
+string Limit(int lower, int upper) {
+ DCHECK_GE(lower, 0);
+ DCHECK_GT(upper, 0);
+ DCHECK_LT(lower, upper);
+ return StrCat("{", lower, ",", upper, "}");
+}
+
+// Returns true if ICU's u_charType() reports U_CURRENCY_SYMBOL for the given
+// code point.
+bool IsCurrencySymbol(char32 character) {
+ return (u_charType(character) == U_CURRENCY_SYMBOL);
+}
+
+// Formats the number as RFC3966, discards the country code and any extension,
+// and splits the remaining national-number part on '-' into digit_blocks, so
+// that each entry is a group of digits that would be formatted together.
+void GetNationalNumberGroups(const PhoneNumberUtil& util,
+                             const PhoneNumber& number,
+                             vector<string>* digit_blocks) {
+  // The formatted string has the shape +CC-DG;ext=EXT, where DG stands for
+  // the groups of digits.
+  string formatted;
+  util.Format(number, PhoneNumberUtil::RFC3966, &formatted);
+  // Anything from the first ';' onwards is the extension and is left out.
+  const size_t semicolon_pos = formatted.find(';');
+  const size_t groups_end =
+      (semicolon_pos == string::npos) ? formatted.length() : semicolon_pos;
+  // The digit groups begin right after the '-' that follows the country code.
+  const size_t groups_begin = formatted.find('-') + 1;
+  const string national_part =
+      formatted.substr(groups_begin, groups_end - groups_begin);
+  SplitStringUsing(national_part, "-", digit_blocks);
+}
+
+// Checks that every ASCII 'x' or 'X' in candidate (other than one in the last
+// position, which is ignored) is explained by the parsed number. Returns
+// false as soon as one occurrence cannot be explained, true otherwise.
+bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate,
+                             const PhoneNumberUtil& util) {
+  // An 'x'/'X' can be (1) part of a carrier code, in which case it precedes
+  // the national significant number, or (2) an extension sign, in which case
+  // it precedes the extension number. A carrier code is assumed to be longer
+  // than one character, so case (1) shows up as two or more consecutive
+  // 'x'/'X' while case (2) is exactly one.
+  const size_t last_index = candidate.length() - 1;
+  for (size_t pos = candidate.find_first_of("xX");
+       pos != string::npos && pos < last_index;
+       pos = candidate.find_first_of("xX", pos + 1)) {
+    // Only the ASCII forms of 'x' and 'X' are considered.
+    const char following = candidate[pos + 1];
+    const bool is_carrier_code = (following == 'x' || following == 'X');
+    if (is_carrier_code) {
+      // Carrier code case: the text from the second 'x' onwards must match
+      // the number at national-significant-number level.
+      ++pos;
+      if (util.IsNumberMatchWithOneString(number, candidate.substr(pos)) !=
+          PhoneNumberUtil::NSN_MATCH) {
+        return false;
+      }
+      continue;
+    }
+    // Extension case: the digits from the 'x' onwards must be exactly the
+    // parsed extension.
+    string extension_digits(candidate.substr(pos));
+    util.NormalizeDigitsOnly(&extension_digits);
+    if (extension_digits != number.extension()) {
+      return false;
+    }
+  }
+  return true;
+}
+} // namespace
+
+// Holds the regular-expression fragments and the compiled patterns used by
+// PhoneNumberMatcher. A single shared instance is obtained through
+// GetInstance(); the constructor builds every fragment and compiles every
+// pattern, relying on the members being initialised in declaration order.
+#ifdef USE_GOOGLE_BASE
+class PhoneNumberMatcherRegExps {
+  friend struct DefaultSingletonTraits<PhoneNumberMatcherRegExps>;
+#else
+class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
+  friend class Singleton<PhoneNumberMatcherRegExps>;
+#endif  // USE_GOOGLE_BASE
+ private:
+  string opening_parens_;
+  string closing_parens_;
+  string non_parens_;
+  // Limit on the number of pairs of brackets in a phone number.
+  string bracket_pair_limit_;
+  // Helper strings for the matching_brackets_ pattern.
+  // An opening bracket at the beginning may not be closed, but subsequent ones
+  // should be. It's also possible that the leading bracket was dropped, so we
+  // shouldn't be surprised if we see a closing bracket first.
+  string leading_maybe_matched_bracket_;
+  string bracket_pairs_;
+  // Limit on the number of leading (plus) characters.
+  string lead_limit_;
+  // Limit on the number of consecutive punctuation characters.
+  string punctuation_limit_;
+  // The maximum number of digits allowed in a digit-separated block. As we
+  // allow all digits in a single block, this should be set high enough to
+  // accommodate the entire national number and the international country code.
+  int digit_block_limit_;
+  // Limit on the number of blocks separated by punctuation. Uses
+  // digit_block_limit_ since some formats use spaces to separate each digit.
+  string block_limit_;
+  // A punctuation sequence allowing white space.
+  string punctuation_;
+  // A digits block without punctuation.
+  string digit_sequence_;
+  // Punctuation that may be at the start of a phone number - brackets and plus
+  // signs.
+  string lead_class_chars_;
+  // Same as lead_class_chars_, but enclosed as a character class.
+  string lead_class_;
+  // Extra helper strings that form part of pattern_. These are stored
+  // separately since StrCat has a limit of 12 args.
+  string opening_punctuation_;
+  string optional_extn_pattern_;
+
+ public:
+  // We use two different reg-ex factories here for performance reasons. RE2 is
+  // much faster for smaller reg-ex patterns, but the main pattern cannot be
+  // handled by RE2 in an efficient way.
+  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
+  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;
+
+  // Matches strings that look like publication pages. Example:
+  // Computing Complete Answers to Queries in the Presence of Limited Access
+  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
+  //
+  // The string "211-227 (2003)" is not a telephone number.
+  scoped_ptr<const RegExp> pub_pages_;
+  // Matches strings that look like dates using "/" as a separator. Examples:
+  // 3/10/2011, 31/10/96 or 08/31/95.
+  scoped_ptr<const RegExp> slash_separated_dates_;
+  // Pattern to check that brackets match. Opening brackets should be closed
+  // within a phone number. This also checks that there is something inside the
+  // brackets. Having no brackets at all is also fine.
+  scoped_ptr<const RegExp> matching_brackets_;
+  // Matches white-space, which may indicate the end of a phone number and the
+  // start of something else (such as a neighbouring zip-code). If white-space
+  // is found, continues to match all characters that are not typically used to
+  // start a phone number.
+  scoped_ptr<const RegExp> group_separator_;
+  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
+  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
+  // Compiled reg-ex representing lead_class_;
+  scoped_ptr<const RegExp> lead_class_pattern_;
+  // Phone number pattern allowing optional punctuation.
+  scoped_ptr<const RegExp> pattern_;
+
+#ifdef USE_GOOGLE_BASE
+  // Returns the shared instance. This must be a static member declared
+  // without a class qualifier: callers invoke it as
+  // PhoneNumberMatcherRegExps::GetInstance() with no object, and a qualified
+  // name is not permitted on an in-class member declaration.
+  static PhoneNumberMatcherRegExps* GetInstance() {
+    return Singleton<PhoneNumberMatcherRegExps>::get();
+  }
+#endif  // USE_GOOGLE_BASE
+
+  // Builds all helper strings and compiles all patterns. The initialisers
+  // below depend on earlier members, so their order must follow the member
+  // declaration order above.
+  PhoneNumberMatcherRegExps()
+      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */),
+        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */),
+        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
+        bracket_pair_limit_(Limit(0, 3)),
+        leading_maybe_matched_bracket_(StrCat(
+            "(?:[", opening_parens_, "])?",
+            "(?:", non_parens_, "+[", closing_parens_, "])?")),
+        bracket_pairs_(StrCat(
+            "(?:[", opening_parens_, "]", non_parens_, "+",
+            "[", closing_parens_, "])", bracket_pair_limit_)),
+        lead_limit_(Limit(0, 2)),
+        punctuation_limit_(Limit(0, 4)),
+        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
+                           PhoneNumberUtil::kMaxLengthCountryCode),
+        block_limit_(Limit(0, digit_block_limit_)),
+        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
+                            punctuation_limit_)),
+        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
+        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
+        lead_class_(StrCat("[", lead_class_chars_, "]")),
+        opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
+        optional_extn_pattern_(StrCat(
+            "(?i)(?:",
+            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
+            ")?")),
+        regexp_factory_for_pattern_(new ICURegExpFactory()),
+#ifdef USE_RE2
+        regexp_factory_(new RE2RegExpFactory()),
+#else
+        regexp_factory_(new ICURegExpFactory()),
+#endif  // USE_RE2
+        pub_pages_(regexp_factory_->CreateRegExp(
+            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
+        slash_separated_dates_(regexp_factory_->CreateRegExp(
+            "(?:(?:[0-3]?\\d/[01]?\\d)|"
+            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
+        matching_brackets_(regexp_factory_->CreateRegExp(
+            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
+                   bracket_pairs_, non_parens_, "*"))),
+        group_separator_(regexp_factory_->CreateRegExp(
+            StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
+        capture_up_to_second_number_start_pattern_(
+            regexp_factory_->CreateRegExp(
+                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
+        capturing_ascii_digits_pattern_(
+            regexp_factory_->CreateRegExp("(\\d+)")),
+        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
+        pattern_(regexp_factory_for_pattern_->CreateRegExp(
+            StrCat("(", opening_punctuation_, lead_limit_,
+                   digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
+                   block_limit_, optional_extn_pattern_, ")"))) {
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
+};
+
+// Creates a matcher that searches text using the given util, preferred
+// region, leniency and maximum number of tries. The matcher starts in the
+// NOT_READY state with no cached match and a search index of 0.
+PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
+ const string& text,
+ const string& region_code,
+ PhoneNumberMatcher::Leniency leniency,
+ int max_tries)
+ : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
+ phone_util_(util),
+ text_(text),
+ preferred_region_(region_code),
+ leniency_(leniency),
+ max_tries_(max_tries),
+ state_(NOT_READY),
+ last_match_(NULL),
+ search_index_(0) {
+}
+
+// Convenience constructor: uses PhoneNumberUtil::GetInstance(), VALID
+// leniency and numeric_limits<int>::max() as the maximum number of tries.
+// The matcher starts in the NOT_READY state with a search index of 0.
+PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
+ const string& region_code)
+ : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
+ phone_util_(*PhoneNumberUtil::GetInstance()),
+ text_(text),
+ preferred_region_(region_code),
+ leniency_(VALID),
+ max_tries_(numeric_limits<int>::max()),
+ state_(NOT_READY),
+ last_match_(NULL),
+ search_index_(0) {
+}
+
+// Destructor. The body is empty; no explicit cleanup is performed here.
+PhoneNumberMatcher::~PhoneNumberMatcher() {
+}
+
+// static
+// Returns true if letter is alphabetic or a non-spacing mark and belongs to
+// one of the Latin Unicode blocks listed below or to the combining
+// diacritical marks block.
+bool PhoneNumberMatcher::IsLatinLetter(char32 letter) {
+  // Combining marks are a subset of non-spacing-mark.
+  const bool is_letter_or_mark =
+      u_isalpha(letter) || (u_charType(letter) == U_NON_SPACING_MARK);
+  if (!is_letter_or_mark) {
+    return false;
+  }
+  switch (ublock_getCode(letter)) {
+    case UBLOCK_BASIC_LATIN:
+    case UBLOCK_LATIN_1_SUPPLEMENT:
+    case UBLOCK_LATIN_EXTENDED_A:
+    case UBLOCK_LATIN_EXTENDED_ADDITIONAL:
+    case UBLOCK_LATIN_EXTENDED_B:
+    case UBLOCK_COMBINING_DIACRITICAL_MARKS:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Parses candidate, which was found at byte offset `offset` of text_, and
+// checks it against the configured leniency. On success, fills match with the
+// offset, the candidate and the parsed number and returns true. Returns false
+// if the brackets do not match up, if (at VALID leniency or stricter) the
+// candidate is adjacent to a Latin letter or currency symbol, if parsing
+// fails, or if the leniency check rejects the number.
+bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset,
+ PhoneNumberMatch* match) {
+ DCHECK(match);
+ // Check the candidate doesn't contain any formatting which would indicate
+ // that it really isn't a phone number.
+ if (!reg_exps_->matching_brackets_->FullMatch(candidate)) {
+ return false;
+ }
+
+ // If leniency is set to VALID or stricter, we also want to skip numbers that
+ // are surrounded by Latin alphabetic characters, to skip cases like
+ // abc8005001234 or 8005001234def.
+ if (leniency_ >= VALID) {
+ // If the candidate is not at the start of the text, and does not start with
+ // phone-number punctuation, check the previous character.
+ scoped_ptr<RegExpInput> candidate_input(
+ reg_exps_->regexp_factory_->CreateInput(candidate));
+ if (offset > 0 &&
+ !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) {
+ char32 previous_char;
+ // Back up from the candidate's first byte to the start of the character
+ // that precedes it in text_.
+ const char* previous_char_ptr =
+ EncodingUtils::BackUpOneUTF8Character(text_.c_str(),
+ text_.c_str() + offset);
+ EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char);
+ // We return false if it is a latin letter or a currency symbol.
+ if (IsCurrencySymbol(previous_char) || IsLatinLetter(previous_char)) {
+ return false;
+ }
+ }
+ // lastCharIndex is the byte offset in text_ just past the candidate.
+ size_t lastCharIndex = offset + candidate.length();
+ if (lastCharIndex < text_.length()) {
+ char32 next_char;
+ // Advance from the candidate's final byte to reach the character that
+ // follows the candidate in text_.
+ const char* next_char_ptr =
+ EncodingUtils::AdvanceOneUTF8Character(
+ text_.c_str() + lastCharIndex - 1);
+ EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char);
+ // As above, a following latin letter or currency symbol rules the
+ // candidate out.
+ if (IsCurrencySymbol(next_char) || IsLatinLetter(next_char)) {
+ return false;
+ }
+ }
+ }
+
+ PhoneNumber number;
+ if (phone_util_.Parse(candidate, preferred_region_, &number) !=
+ PhoneNumberUtil::NO_PARSING_ERROR) {
+ return false;
+ }
+ // match is only written to once the number has passed verification.
+ if (VerifyAccordingToLeniency(leniency_, number, candidate)) {
+ match->set_start(offset);
+ match->set_raw_string(candidate);
+ match->set_number(number);
+ return true;
+ }
+ return false;
+}
+
+// Helper method to replace the verification method for each enum in the Java
+// version.
+//
+// Returns true if number, which was parsed from candidate, is acceptable at
+// the given leniency level:
+// - POSSIBLE: the number is possible.
+// - VALID: the number is valid and every 'x'/'X' in candidate is accounted
+//   for by ContainsOnlyValidXChars().
+// - STRICT_GROUPING: as VALID, and candidate contains fewer than two '/' and
+//   does not split any group of digits that the formatted number keeps
+//   together.
+// - EXACT_GROUPING: as VALID, and candidate contains fewer than two '/' and
+//   its digit groups line up with the groups of the formatted number.
+// Any other leniency value is logged as an error and rejected.
+bool PhoneNumberMatcher::VerifyAccordingToLeniency(
+    Leniency leniency, const PhoneNumber& number,
+    const string& candidate) const {
+  switch (leniency) {
+    case PhoneNumberMatcher::POSSIBLE:
+      return phone_util_.IsPossibleNumber(number);
+    case PhoneNumberMatcher::VALID:
+      if (!phone_util_.IsValidNumber(number)) {
+        return false;
+      }
+      return ContainsOnlyValidXChars(number, candidate, phone_util_);
+    case PhoneNumberMatcher::STRICT_GROUPING: {
+      if (!phone_util_.IsValidNumber(number) ||
+          !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
+          // Two or more slashes were present.
+          FindNth(candidate, '/', 2) != string::npos) {
+        return false;
+      }
+      // TODO(lararennie,shaopengjia): Evaluate how this works for other locales
+      // (testing has been limited to NANPA regions) and optimise if necessary.
+      string normalized_candidate =
+          NormalizeUTF8::NormalizeDecimalDigits(candidate);
+      vector<string> formatted_number_groups;
+      GetNationalNumberGroups(phone_util_, number, &formatted_number_groups);
+      size_t from_index = 0;
+      // Check each group of consecutive digits are not broken into separate
+      // groups in the normalized_candidate string.
+      for (size_t i = 0; i < formatted_number_groups.size(); ++i) {
+        // Fails if the substring of normalized_candidate starting from
+        // from_index doesn't contain the consecutive digits in digit_group.
+        from_index = normalized_candidate.find(formatted_number_groups.at(i),
+                                               from_index);
+        if (from_index == string::npos) {
+          return false;
+        }
+        // Moves from_index forward.
+        from_index += formatted_number_groups.at(i).length();
+        if (i == 0 && from_index < normalized_candidate.length()) {
+          // We are at the position right after the NDC. normalized_candidate
+          // may contain non-ASCII formatting characters, whose UTF-8 bytes are
+          // negative when char is signed. isdigit() requires a value
+          // representable as unsigned char, so the byte is cast before the
+          // call; such bytes are then simply reported as non-digits.
+          if (isdigit(static_cast<unsigned char>(
+                  normalized_candidate.at(from_index)))) {
+            // This means there is no formatting symbol after the NDC. In this
+            // case, we only accept the number if there is no formatting
+            // symbol at all in the number, except for extensions.
+            string national_significant_number;
+            phone_util_.GetNationalSignificantNumber(
+                number, &national_significant_number);
+            return HasPrefixString(
+                normalized_candidate.substr(
+                    from_index - formatted_number_groups.at(i).length()),
+                national_significant_number);
+          }
+        }
+      }
+      // The check here makes sure that we haven't mistakenly already used the
+      // extension to match the last group of the subscriber number. Note the
+      // extension cannot have formatting in-between digits.
+      return
+          normalized_candidate.substr(from_index).find(number.extension()) !=
+          string::npos;
+    }
+    case PhoneNumberMatcher::EXACT_GROUPING: {
+      if (!phone_util_.IsValidNumber(number) ||
+          !ContainsOnlyValidXChars(number, candidate, phone_util_) ||
+          // Two or more slashes were present.
+          FindNth(candidate, '/', 2) != string::npos) {
+        return false;
+      }
+      // TODO(lararennie,shaopengjia): Evaluate how this works for other locales
+      // (testing has been limited to NANPA regions) and optimise if necessary.
+      vector<string> candidate_groups;
+      string normalized_candidate =
+          NormalizeUTF8::NormalizeDecimalDigits(candidate);
+      const scoped_ptr<RegExpInput> candidate_number(
+          reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
+      string digit_block;
+      // Collect every run of digits in the candidate, in order.
+      while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
+                 candidate_number.get(),
+                 &digit_block)) {
+        candidate_groups.push_back(digit_block);
+      }
+
+      // Set this to the last group, skipping it if the number has an extension.
+      int candidate_number_group_index =
+          number.has_extension() ? candidate_groups.size() - 2
+                                 : candidate_groups.size() - 1;
+      // First we check if the national significant number is formatted as a
+      // block. We use contains and not equals, since the national significant
+      // number may be present with a prefix such as a national number prefix,
+      // or the country code itself.
+      string national_significant_number;
+      phone_util_.GetNationalSignificantNumber(number,
+                                               &national_significant_number);
+      if (candidate_groups.size() == 1 ||
+          candidate_groups.at(candidate_number_group_index).find(
+              national_significant_number) != string::npos) {
+        return true;
+      }
+      vector<string> formatted_number_groups;
+      GetNationalNumberGroups(phone_util_, number, &formatted_number_groups);
+      // Starting from the end, go through in reverse, excluding the first
+      // group, and check the candidate and number groups are the same.
+      for (int formatted_number_group_index =
+               (formatted_number_groups.size() - 1);
+           formatted_number_group_index > 0 &&
+           candidate_number_group_index >= 0;
+           --formatted_number_group_index, --candidate_number_group_index) {
+        if (candidate_groups.at(candidate_number_group_index) !=
+            formatted_number_groups.at(formatted_number_group_index)) {
+          return false;
+        }
+      }
+      // Now check the first group. There may be a national prefix at the start,
+      // so we only check that the candidate group ends with the formatted
+      // number group.
+      return (candidate_number_group_index >= 0 &&
+              HasSuffixString(candidate_groups.at(candidate_number_group_index),
+                              formatted_number_groups.at(0)));
+    }
+    default:
+      LOG(ERROR) << "No implementation defined for verification for leniency "
+                 << static_cast<int>(leniency);
+      return false;
+  }
+}
+
+// Looks for a phone number inside candidate by dropping its first or last
+// group, where groups are delimited by the group_separator_ pattern. The
+// attempts are made in this order: the first group alone, everything after
+// the first group, and everything before the last group. max_tries_ is
+// decremented after each failed attempt. Returns true and fills match on the
+// first attempt that succeeds, false otherwise.
+bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
+ PhoneNumberMatch* match) {
+ DCHECK(match);
+ // Try removing either the first or last "group" in the number and see if this
+ // gives a result. We consider white space to be a possible indication of
+ // the start or end of the phone number.
+ scoped_ptr<RegExpInput> candidate_input(
+ reg_exps_->regexp_factory_->CreateInput(candidate));
+ if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
+ NULL)) {
+ // Try the first group by itself.
+ // group_start_index is the number of bytes consumed so far, i.e. the
+ // position in candidate just past the first separator.
+ int group_start_index =
+ candidate.length() - candidate_input->ToString().length();
+ string first_group_only = candidate.substr(0, group_start_index);
+ phone_util_.TrimUnwantedEndChars(&first_group_only);
+ bool success = ParseAndVerify(first_group_only, offset, match);
+ if (success) {
+ return true;
+ }
+ --max_tries_;
+
+ // Try the rest of the candidate without the first group.
+ string without_first_group(candidate_input->ToString());
+ phone_util_.TrimUnwantedEndChars(&without_first_group);
+ success =
+ ParseAndVerify(without_first_group, offset + group_start_index, match);
+ if (success) {
+ return true;
+ }
+ --max_tries_;
+
+ // The third attempt is only made if the try budget is not yet used up.
+ if (max_tries_ > 0) {
+ while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(),
+ NULL)) {
+ // Find the last group.
+ }
+ int last_group_start =
+ candidate.length() - candidate_input->ToString().length();
+ string without_last_group = candidate.substr(0, last_group_start);
+ phone_util_.TrimUnwantedEndChars(&without_last_group);
+ if (without_last_group == first_group_only) {
+ // If there are only two groups, then the group "without the last group"
+ // is the same as the first group. In these cases, we don't want to
+ // re-check the number group, so we exit already.
+ return false;
+ }
+ success = ParseAndVerify(without_last_group, offset, match);
+ if (success) {
+ return true;
+ }
+ --max_tries_;
+ }
+ }
+ return false;
+}
+
+// Tries to turn candidate, found at the given offset, into a match. Text that
+// looks like a publication page reference or a slash-separated date is
+// rejected outright. Otherwise the whole candidate is tried first and, if
+// that fails, an inner match is searched for. Returns true and fills match on
+// success.
+bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset,
+                                      PhoneNumberMatch* match) {
+  DCHECK(match);
+  // A publication page reference is not a phone number.
+  if (reg_exps_->pub_pages_->PartialMatch(candidate)) {
+    return false;
+  }
+  // Neither is a date written with slashes.
+  if (reg_exps_->slash_separated_dates_->PartialMatch(candidate)) {
+    return false;
+  }
+  // Prefer a match on the entire candidate; only when that fails look for a
+  // phone number within the candidate.
+  return ParseAndVerify(candidate, offset, match) ||
+         ExtractInnerMatch(candidate, offset, match);
+}
+
+// Returns true if another match is available. When the matcher is in the
+// NOT_READY state this searches from search_index_; a hit is cached in
+// last_match_ and moves the state to READY, a miss moves the state to DONE.
+bool PhoneNumberMatcher::HasNext() {
+  if (state_ != NOT_READY) {
+    // A previous call already settled the outcome.
+    return state_ == READY;
+  }
+  PhoneNumberMatch found;
+  if (!Find(search_index_, &found)) {
+    state_ = DONE;
+    return false;
+  }
+  last_match_.reset(new PhoneNumberMatch(found.start(),
+                                         found.raw_string(),
+                                         found.number()));
+  // The next search resumes right after this match.
+  search_index_ = last_match_->end();
+  state_ = READY;
+  return true;
+}
+
+// Copies the next match into match and returns true, or returns false and
+// leaves match untouched when HasNext() reports that there is none. After a
+// successful call the cached match is released and the state goes back to
+// NOT_READY, so the following HasNext() searches again.
+bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) {
+ DCHECK(match);
+ // Check the state and find the next match as a side-effect if necessary.
+ if (!HasNext()) {
+ return false;
+ }
+ match->CopyFrom(*last_match_);
+ state_ = NOT_READY;
+ last_match_.reset(NULL);
+ return true;
+}
+
+// Searches text_ from byte offset index for the next candidate accepted by
+// ExtractMatch(). Returns true and fills match on success. Returns false when
+// pattern_ finds no further candidate or max_tries_ reaches zero; max_tries_
+// is decremented once for every candidate that is rejected here.
+bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) {
+ DCHECK(match);
+
+ scoped_ptr<RegExpInput> text(
+ reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index)));
+ string candidate;
+ while ((max_tries_ > 0) &&
+ reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) {
+ // The candidate's start within text_ is the total length minus what is
+ // still unconsumed minus the candidate itself. This must be computed
+ // before the candidate is possibly shortened below.
+ int start = text_.length() - text->ToString().length() - candidate.length();
+ // Check for extra numbers at the end.
+ reg_exps_->capture_up_to_second_number_start_pattern_->
+ PartialMatch(candidate, &candidate);
+ if (ExtractMatch(candidate, start, match)) {
+ return true;
+ }
+
+ // NOTE(review): index is a by-value parameter that is not read again in
+ // this function, so this assignment appears to have no effect; the search
+ // continues from wherever the input object was consumed to. Confirm
+ // whether resuming from the end of the (possibly shortened) candidate was
+ // intended.
+ index = start + candidate.length();
+ --max_tries_;
+ }
+ return false;
+}
+
+} // namespace phonenumbers
+} // namespace i18n
diff --git a/phonenumbermatcher.h b/phonenumbermatcher.h
new file mode 100644
index 0000000..0eb013d
--- /dev/null
+++ b/phonenumbermatcher.h
@@ -0,0 +1,158 @@
+// Copyright (C) 2011 The Libphonenumber Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Author: Lara Rennie
+// Author: Tao Huang
+//
+// This is a direct port from PhoneNumberMatcher.java.
+// Changes to this class should also happen to the Java version, whenever it
+// makes sense.
+
+#ifndef I18N_PHONENUMBERS_PHONENUMBERMATCHER_H_
+#define I18N_PHONENUMBERS_PHONENUMBERMATCHER_H_
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/memory/scoped_ptr.h"
+#include "phonenumbers/regexp_adapter.h"
+
+namespace i18n {
+namespace phonenumbers {
+
+using std::string;
+
+class PhoneNumber;
+class PhoneNumberMatch;
+class PhoneNumberMatcherRegExps;
+class PhoneNumberUtil;
+
+class PhoneNumberMatcher {  // Iterates over the phone numbers found in a text; see HasNext() and Next().
+ friend class PhoneNumberMatcherTest;
+ public:
+ // Leniency when finding potential phone numbers in text segments. The levels
+ // here are ordered in increasing strictness.
+ enum Leniency {
+ // Phone numbers accepted are possible, but not necessarily valid.
+ POSSIBLE,
+ // Phone numbers accepted are possible and valid.
+ VALID,
+ // Phone numbers accepted are valid and are grouped in a possible way for
+ // this locale. For example, a US number written as "65 02 53 00 00" is not
+ // accepted at this leniency level, whereas "650 253 0000" or "6502530000"
+ // are. Numbers with more than one '/' symbol are also dropped at this
+ // level.
+ // Warning: This and the next level might result in lower coverage
+ // especially for regions outside of country code "+1".
+ STRICT_GROUPING,
+ // Phone numbers accepted are valid and are grouped in the same way that we
+ // would have formatted it, or as a single block. For example, a US number
+ // written as "650 2530000" is not accepted at this leniency level, whereas
+ // "650 253 0000" or "6502530000" are.
+ EXACT_GROUPING,
+ };
+
+ // Constructs a phone number matcher.
+ PhoneNumberMatcher(const PhoneNumberUtil& util,
+ const string& text,
+ const string& region_code,
+ Leniency leniency,
+ int max_tries);
+
+ // Wrapper to construct a phone number matcher, with no limitation on the
+ // number of retries and VALID Leniency.
+ PhoneNumberMatcher(const string& text,
+ const string& region_code);
+
+ ~PhoneNumberMatcher();
+
+ // Returns true if the text sequence has another match.
+ bool HasNext();
+
+ // Copies the next match into match; returns false if there is none left.
+ bool Next(PhoneNumberMatch* match);
+
+ private:
+ // The potential states of a PhoneNumberMatcher.
+ enum State {
+ NOT_READY,
+ READY,
+ DONE,
+ };
+
+ // Attempts to find the next match in the text being searched, starting at
+ // position "index". Returns true if a match is found, otherwise returns
+ // false.
+ bool Find(int index, PhoneNumberMatch* match);
+
+ // Attempts to extract a match from candidate, which starts at index "offset"
+ // of the overall text. Returns true if a match was found, false otherwise.
+ bool ExtractMatch(const string& candidate, int offset,
+ PhoneNumberMatch* match);
+
+ // Attempts to extract a match from a candidate string if the whole candidate
+ // does not qualify as a match. Returns true if a match is found, otherwise
+ // returns false.
+ bool ExtractInnerMatch(const string& candidate, int offset,
+ PhoneNumberMatch* match);
+
+ // Parses a phone number from the candidate using PhoneNumberUtil::Parse() and
+ // verifies it matches the requested leniency. If parsing and verification
+ // succeed, returns true, otherwise this method returns false.
+ bool ParseAndVerify(const string& candidate, int offset,
+ PhoneNumberMatch* match);
+
+ bool VerifyAccordingToLeniency(Leniency leniency, const PhoneNumber& number,
+ const string& candidate) const;
+
+ // Helper method to determine if a character is a Latin-script letter or not.
+ // For our purposes, combining marks should also return true since we assume
+ // they have been added to a preceding Latin character.
+ static bool IsLatinLetter(char32 letter);
+
+ // Helper class holding useful regular expressions.
+ const PhoneNumberMatcherRegExps* reg_exps_;
+
+ // The phone number utility.
+ const PhoneNumberUtil& phone_util_;
+
+ // The text searched for phone numbers.
+ const string text_;
+
+ // The region(country) to assume for phone numbers without an international
+ // prefix.
+ const string preferred_region_;
+
+ // The degree of validation requested.
+ Leniency leniency_;
+
+ // The maximum number of retries after matching an invalid number.
+ int max_tries_;
+
+ // The iteration tristate.
+ State state_;
+
+ // The last successful match, NULL unless state_ is READY.
+ scoped_ptr<PhoneNumberMatch> last_match_;
+
+ // The next index to start searching at. Undefined when state_ is DONE.
+ int search_index_;
+
+ DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcher);
+};
+
+} // namespace phonenumbers
+} // namespace i18n
+
+#endif // I18N_PHONENUMBERS_PHONENUMBERMATCHER_H_
diff --git a/phonenumberutil.h b/phonenumberutil.h
index bd9cfb7..6bc8022 100644
--- a/phonenumberutil.h
+++ b/phonenumberutil.h
@@ -67,6 +67,9 @@ class PhoneNumberUtil : public Singleton<PhoneNumberUtil> {
friend class Singleton<PhoneNumberUtil>;
#endif
friend class AsYouTypeFormatter;
+ friend class PhoneNumberMatcher;
+ friend class PhoneNumberMatcherRegExps;
+ friend class PhoneNumberMatcherTest;
friend class PhoneNumberUtilTest;
public:
~PhoneNumberUtil();
diff --git a/stringutil.cc b/stringutil.cc
index e9d7a88..8d021a4 100644
--- a/stringutil.cc
+++ b/stringutil.cc
@@ -14,6 +14,7 @@
// Author: Philippe Liard
+#include <algorithm>
#include <cassert>
#include <cstring>
#include <sstream>
@@ -23,6 +24,7 @@
namespace i18n {
namespace phonenumbers {
+using std::equal;
using std::stringstream;
string operator+(const string& s, int n) {
@@ -54,6 +56,43 @@ string SimpleItoa(uint64 n) {
return GenericSimpleItoa(n);
}
+bool HasPrefixString(const string& s, const string& prefix) {  // True iff s begins with prefix; an empty prefix always matches.
+ return s.size() >= prefix.size() &&  // The size check keeps the compared range inside s.
+ equal(s.begin(), s.begin() + prefix.size(), prefix.begin());
+}
+
+size_t FindNth(const string& s, char c, int n) {  // Index of the n-th (counting from 1) c in s, or string::npos.
+ size_t pos = string::npos;  // Also the result when n <= 0, since the loop then never runs.
+
+ for (int i = 0; i < n; ++i) {
+ pos = s.find_first_of(c, pos + 1);  // npos + 1 wraps around to 0, so the first search starts at the beginning of s.
+ if (pos == string::npos) {
+ break;  // Fewer than n occurrences of c.
+ }
+ }
+ return pos;
+}
+
+void SplitStringUsing(const string& s, const string& delimiter,  // Appends the non-empty pieces of s separated by delimiter to *result.
+ vector<string>* result) {
+ assert(result);
+ size_t start_pos = 0;
+ size_t find_pos = string::npos;
+ if (delimiter.empty()) {
+ return;  // Nothing is appended for an empty delimiter.
+ }
+ while ((find_pos = s.find(delimiter, start_pos)) != string::npos) {
+ const string substring = s.substr(start_pos, find_pos - start_pos);
+ if (!substring.empty()) {  // Skip empty tokens (leading or adjacent delimiters).
+ result->push_back(substring);
+ }
+ start_pos = find_pos + delimiter.length();
+ }
+ if (start_pos != s.length()) {  // Append what follows the last delimiter, if anything.
+ result->push_back(s.substr(start_pos));
+ }
+}
+
void StripString(string* s, const char* remove, char replacewith) {
const char* str_start = s->c_str();
const char* str = str_start;
@@ -255,6 +294,25 @@ string StrCat(const StringHolder& s1, const StringHolder& s2,
string StrCat(const StringHolder& s1, const StringHolder& s2,
const StringHolder& s3, const StringHolder& s4,
const StringHolder& s5, const StringHolder& s6,
+ const StringHolder& s7, const StringHolder& s8) {  // Eight-argument overload: appends s1..s8 to a new string in order.
+ string result;
+ result.reserve(s1.Length() + s2.Length() + s3.Length() + s4.Length() +  // Reserve the summed lengths plus one before appending.
+ s5.Length() + s6.Length() + s7.Length() + s8.Length() + 1);
+ result += s1;
+ result += s2;
+ result += s3;
+ result += s4;
+ result += s5;
+ result += s6;
+ result += s7;
+ result += s8;
+
+ return result;
+}
+
+string StrCat(const StringHolder& s1, const StringHolder& s2,
+ const StringHolder& s3, const StringHolder& s4,
+ const StringHolder& s5, const StringHolder& s6,
const StringHolder& s7, const StringHolder& s8,
const StringHolder& s9) {
string result;
diff --git a/stringutil.h b/stringutil.h
index ee46bb2..6d90d00 100644
--- a/stringutil.h
+++ b/stringutil.h
@@ -19,6 +19,7 @@
#include <cstddef>
#include <string>
+#include <vector>
#include "base/basictypes.h"
@@ -26,6 +27,7 @@ namespace i18n {
namespace phonenumbers {
using std::string;
+using std::vector;
// Supports string("hello") + 10.
string operator+(const string& s, int n);
@@ -34,6 +36,18 @@ string operator+(const string& s, int n);
string SimpleItoa(uint64 n);
string SimpleItoa(int n);
+// Returns whether the provided string starts with the supplied prefix.
+bool HasPrefixString(const string& s, const string& prefix);
+
+// Returns the index of the nth occurrence of c in s or string::npos if fewer
+// than n occurrences are present.
+size_t FindNth(const string& s, char c, int n);
+
+// Splits a string at each occurrence of the delimiter string. Appends the
+// components to the provided vector. Note that empty tokens are ignored.
+void SplitStringUsing(const string& s, const string& delimiter,
+ vector<string>* result);
+
// Replaces any occurrence of the character 'remove' (or the characters
// in 'remove') with the character 'replacewith'.
void StripString(string* s, const char* remove, char replacewith);
@@ -116,6 +130,11 @@ string StrCat(const StringHolder& s1, const StringHolder& s2,
string StrCat(const StringHolder& s1, const StringHolder& s2,
const StringHolder& s3, const StringHolder& s4,
const StringHolder& s5, const StringHolder& s6,
+ const StringHolder& s7, const StringHolder& s8);
+
+string StrCat(const StringHolder& s1, const StringHolder& s2,
+ const StringHolder& s3, const StringHolder& s4,
+ const StringHolder& s5, const StringHolder& s6,
const StringHolder& s7, const StringHolder& s8,
const StringHolder& s9);