diff options
author | philip.liard@gmail.com <philip.liard@gmail.com@ee073f10-1060-11df-b6a4-87a95322a99c> | 2011-07-04 15:09:31 +0000 |
---|---|---|
committer | philip.liard@gmail.com <philip.liard@gmail.com@ee073f10-1060-11df-b6a4-87a95322a99c> | 2011-07-04 15:09:31 +0000 |
commit | ee8bec183175cdd7acde5c49d7d8fa82886bc432 (patch) | |
tree | e732751dcd0722a34e803427f11601e11a4d6cb5 | |
parent | 6d168d33b72f2207591580adf0fdb77333287ee7 (diff) | |
download | phonenumbers-ee8bec183175cdd7acde5c49d7d8fa82886bc432.tar.gz |
CPP: Replace direct UTF-8 with escape sequences.
git-svn-id: http://libphonenumber.googlecode.com/svn/trunk/cpp/src/phonenumbers@289 ee073f10-1060-11df-b6a4-87a95322a99c
-rw-r--r-- | phonenumberutil.cc | 46 | ||||
-rw-r--r-- | phonenumberutil_test.cc | 63 | ||||
-rw-r--r-- | regexp_adapter_test.cc | 14 |
3 files changed, 81 insertions, 42 deletions
diff --git a/phonenumberutil.cc b/phonenumberutil.cc index 00893d1..690147c 100644 --- a/phonenumberutil.cc +++ b/phonenumberutil.cc @@ -62,7 +62,7 @@ using std::stringstream; using google::protobuf::RepeatedPtrField; // static -const char PhoneNumberUtil::kPlusChars[] = "++"; +const char PhoneNumberUtil::kPlusChars[] = "+\xEF\xBC\x8B"; /* "++" */ // To find out the unicode code-point of the characters below in vim, highlight // the character and type 'ga'. Note that the - is used to express ranges of // full-width punctuation below, as well as being present in the expression @@ -70,7 +70,10 @@ const char PhoneNumberUtil::kPlusChars[] = "++"; // unicode character. // static const char PhoneNumberUtil::kValidPunctuation[] = - "-x‐-―−ー--/ ()()[].\\[\\]/~⁓∼"; + /* "-x‐-―−ー--/ <U+200B><U+2060> ()()[].\\[\\]/~⁓∼" */ + "-x\xE2\x80\x90-\xE2\x80\x95\xE2\x88\x92\xE3\x83\xBC\xEF\xBC\x8D-\xEF\xBC" + "\x8F \xC2\xA0\xE2\x80\x8B\xE2\x81\xA0\xE3\x80\x80()\xEF\xBC\x88\xEF\xBC" + "\x89\xEF\xBC\xBB\xEF\xBC\xBD.\\[\\]/~\xE2\x81\x93\xE2\x88\xBC"; namespace { @@ -450,35 +453,35 @@ void InitializeStaticMapsAndSets() { all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint("-"), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("-"), '-')); + make_pair(ToUnicodeCodepoint("\xEF\xBC\x8D" /* "-" */), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("‐"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x90" /* "‐" */), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("‑"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x91" /* "‑" */), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("‒"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x92" /* "‒" */), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("–"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x93" /* "–" */), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("—"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x94" /* "—" */), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("―"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x80\x95" /* "―" */), '-')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("−"), '-')); + make_pair(ToUnicodeCodepoint("\xE2\x88\x92" /* "−" */), '-')); all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint("/"), '/')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("/"), '/')); + make_pair(ToUnicodeCodepoint("\xEF\xBC\x8F" /* "/" */), '/')); all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint(" "), ' ')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint(" "), ' ')); + make_pair(ToUnicodeCodepoint("\xE3\x80\x80" /* " " */), ' ')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint(""), ' ')); + make_pair(ToUnicodeCodepoint("\xE2\x81\xA0"), ' ')); all_plus_number_grouping_symbols->insert( make_pair(ToUnicodeCodepoint("."), '.')); all_plus_number_grouping_symbols->insert( - make_pair(ToUnicodeCodepoint("."), '.')); + make_pair(ToUnicodeCodepoint("\xEF\xBC\x8E" /* "." */), '.')); // Only the upper-case letters are added here - the lower-case versions are // added programmatically. alpha_mappings->insert(make_pair(ToUnicodeCodepoint("A"), '2')); @@ -692,7 +695,8 @@ PhoneNumberUtil* PhoneNumberUtil::GetInstance() { void PhoneNumberUtil::CreateRegularExpressions() const { unique_international_prefix.reset(RegExp::Create( - "[\\d]+(?:[~⁓∼~][\\d]+)?")); + /* "[\\d]+(?:[~⁓∼~][\\d]+)?" */ + "[\\d]+(?:[~\xE2\x81\x93\xE2\x88\xBC\xEF\xBD\x9E][\\d]+)?")); // The first_group_capturing_pattern was originally set to $1 but there are // some countries for which the first group is not used in the national // pattern (e.g. Argentina) so the $1 group does not match correctly. @@ -720,10 +724,16 @@ void PhoneNumberUtil::CreateRegularExpressions() const { const string capturing_extn_digits = StrCat("([", kDigits, "]{1,7})"); known_extn_patterns.reset(new string( StrCat(kRfc3966ExtnPrefix, capturing_extn_digits, "|" - "[ \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|extn?|[,xx##~~]|" + /* "[ \\t,]*(?:ext(?:ensi(?:ó?|ó))?n?|extn?|[,xx##~~]|" "int|int|anexo)" - "[:\\..]?[ \\t,-]*", capturing_extn_digits, "#?|" - "[- ]+([", kDigits, "]{1,5})#"))); + "[:\\..]?[ \\t,-]*", capturing_extn_digits, "#?|" */ + "[ \xC2\xA0\\t,]*(?:ext(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|\xEF\xBD" + "\x85\xEF\xBD\x98\xEF\xBD\x94\xEF\xBD\x8E?|[,x\xEF\xBD\x98#\xEF" + "\xBC\x83~\xEF\xBD\x9E]|" + "int|\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)" + "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits, + "#?|[- ]+([", kDigits, "]{1,5})#"))); + extn_pattern.reset(RegExp::Create( StrCat("(?i)(?:", *known_extn_patterns, ")$"))); valid_phone_number_pattern.reset(RegExp::Create( diff --git a/phonenumberutil_test.cc b/phonenumberutil_test.cc index 2badb96..ce206ce 100644 --- a/phonenumberutil_test.cc +++ b/phonenumberutil_test.cc @@ -1145,11 +1145,15 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) { ExtractPossibleNumber("Tel:+800-345-600", &extracted_number); EXPECT_EQ("+800-345-600", extracted_number); // Should recognise wide digits as possible start values. - ExtractPossibleNumber("023", &extracted_number); - EXPECT_EQ("023", extracted_number); + ExtractPossibleNumber("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93" /* "023" */, + &extracted_number); + EXPECT_EQ("\xEF\xBC\x90\xEF\xBC\x92\xEF\xBC\x93" /* "023" */, + extracted_number); // Dashes are not possible start values and should be removed. - ExtractPossibleNumber("Num-123", &extracted_number); - EXPECT_EQ("123", extracted_number); + ExtractPossibleNumber("Num-\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93" + /* "Num-123" */, &extracted_number); + EXPECT_EQ("\xEF\xBC\x91\xEF\xBC\x92\xEF\xBC\x93" /* "123" */, + extracted_number); // If not possible number present, return empty string. ExtractPossibleNumber("Num-....", &extracted_number); EXPECT_EQ("", extracted_number); @@ -1163,7 +1167,8 @@ TEST_F(PhoneNumberUtilTest, ExtractPossibleNumber) { ExtractPossibleNumber("(650) 253-0000.", &extracted_number); EXPECT_EQ("650) 253-0000", extracted_number); // This case has a trailing RTL char. - ExtractPossibleNumber("(650) 253-0000", &extracted_number); + ExtractPossibleNumber("(650) 253-0000\xE2\x80\x8F" + /* "(650) 253-0000" */, &extracted_number); EXPECT_EQ("650) 253-0000", extracted_number); } @@ -1675,13 +1680,15 @@ TEST_F(PhoneNumberUtilTest, IsViablePhoneNumber) { EXPECT_TRUE(IsViablePhoneNumber("0800-4-PIZZA")); // Only one or two digits before possible punctuation followed by more digits. // The punctuation used here is the unicode character u+3000. - EXPECT_TRUE(IsViablePhoneNumber("1 34")); - EXPECT_FALSE(IsViablePhoneNumber("1 3+4")); + EXPECT_TRUE(IsViablePhoneNumber("1\xE3\x80\x80" "34" /* "1 34" */)); + EXPECT_FALSE(IsViablePhoneNumber("1\xE3\x80\x80" "3+4" /* "1 3+4" */)); // Unicode variants of possible starting character and other allowed // punctuation/digits. - EXPECT_TRUE(IsViablePhoneNumber("(1) 3456789")); + EXPECT_TRUE(IsViablePhoneNumber("\xEF\xBC\x88" "1\xEF\xBC\x89\xE3\x80\x80" + "3456789" /* "(1) 3456789" */ )); // Testing a leading + is okay. - EXPECT_TRUE(IsViablePhoneNumber("+1) 3456789")); + EXPECT_TRUE(IsViablePhoneNumber("+1\xEF\xBC\x89\xE3\x80\x80" + "3456789" /* "+1) 3456789" */)); } TEST_F(PhoneNumberUtilTest, ConvertAlphaCharactersInNumber) { @@ -1692,8 +1699,10 @@ TEST_F(PhoneNumberUtilTest, ConvertAlphaCharactersInNumber) { EXPECT_EQ(kExpectedOutput, input); // Try with some non-ASCII characters. - input.assign("1 (800) ABC-DEF"); - static const string kExpectedFullwidthOutput = "1 (800) 222-333"; + input.assign("1\xE3\x80\x80\xEF\xBC\x88" "800) ABC-DEF" + /* "1 (800) ABCD-DEF" */); + static const string kExpectedFullwidthOutput = + "1\xE3\x80\x80\xEF\xBC\x88" "800) 222-333" /* "1 (800) 222-333" */; phone_util_.ConvertAlphaCharactersInNumber(&input); EXPECT_EQ(kExpectedFullwidthOutput, input); } @@ -1717,13 +1726,13 @@ TEST_F(PhoneNumberUtilTest, NormaliseReplaceAlphaCharacters) { TEST_F(PhoneNumberUtilTest, NormaliseOtherDigits) { // The first digit is a full-width 2, the last digit is an Arabic-indic digit // 5. - string input_number("25٥"); + string input_number("\xEF\xBC\x92" "5\xD9\xA5" /* "25٥" */); Normalize(&input_number); static const string kExpectedOutput("255"); EXPECT_EQ(kExpectedOutput, input_number) << "Conversion did not correctly replace non-latin digits"; // The first digit is an Eastern-Arabic 5, the latter an Eastern-Arabic 0. - string eastern_arabic_input_number("۵2۰"); + string eastern_arabic_input_number("\xDB\xB5" "2\xDB\xB0" /* "۵2۰" */); Normalize(&eastern_arabic_input_number); static const string kExpectedOutput2("520"); EXPECT_EQ(kExpectedOutput2, eastern_arabic_input_number) @@ -2379,21 +2388,32 @@ TEST_F(PhoneNumberUtilTest, ParseWithInternationalPrefixes) { // Using a full-width plus sign. test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+1 (650) 333-6000", + phone_util_.Parse("\xEF\xBC\x8B" "1 (650) 333-6000", + /* "+1 (650) 333-6000" */ RegionCode::SG(), &test_number)); EXPECT_EQ(us_number, test_number); // The whole number, including punctuation, is here represented in full-width // form. test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+1 (650) 333-6000", + phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88" + "\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89" + "\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93" + "\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90" + "\xEF\xBC\x90", + /* "+1 (650) 333-6000" */ RegionCode::SG(), &test_number)); EXPECT_EQ(us_number, test_number); // Using the U+30FC dash. test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+1 (650) 333ー6000", + phone_util_.Parse("\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88" + "\xEF\xBC\x96\xEF\xBC\x95\xEF\xBC\x90\xEF\xBC\x89" + "\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93" + "\xE3\x83\xBC\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90" + "\xEF\xBC\x90", + /* "+1 (650) 333ー6000" */ RegionCode::SG(), &test_number)); EXPECT_EQ(us_number, test_number); } @@ -2633,8 +2653,9 @@ TEST_F(PhoneNumberUtilTest, ParseNumbersWithPlusWithNoRegion) { // Test with full-width plus. result_proto.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("+64 3 331 6005", RegionCode::GetUnknown(), - &result_proto)); + phone_util_.Parse("\xEF\xBC\x8B" "64 3 331 6005", + /* "+64 3 331 6005" */ + RegionCode::GetUnknown(), &result_proto)); EXPECT_EQ(nz_number, result_proto); // Test with normal plus but leading characters that need to be stripped. EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, @@ -2792,7 +2813,8 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) { EXPECT_EQ(us_with_extension, test_number); test_number.Clear(); EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("(800) 901-3355 ,extensión 7246433", + phone_util_.Parse("(800) 901-3355 ,extensi\xC3\xB3n 7246433", + /* "(800) 901-3355 ,extensión 7246433" */ RegionCode::US(), &test_number)); EXPECT_EQ(us_with_extension, test_number); @@ -2800,7 +2822,8 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) { // Repeat with the small letter o with acute accent created by combining // characters. EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, - phone_util_.Parse("(800) 901-3355 ,extensión 7246433", + phone_util_.Parse("(800) 901-3355 ,extensio\xCC\x81n 7246433", + /* "(800) 901-3355 ,extensión 7246433" */ RegionCode::US(), &test_number)); EXPECT_EQ(us_with_extension, test_number); diff --git a/regexp_adapter_test.cc b/regexp_adapter_test.cc index 14fbd40..2b03342 100644 --- a/regexp_adapter_test.cc +++ b/regexp_adapter_test.cc @@ -186,12 +186,18 @@ TEST_F(RegExpAdapterTest, TestGlobalReplace) { } TEST(RegExpAdapter, TestUtf8) { - const scoped_ptr<const RegExp> reg_exp(RegExp::Create("℡⊏([α-ω]*)⊐")); + const scoped_ptr<const RegExp> reg_exp(RegExp::Create( + "\xE2\x84\xA1\xE2\x8A\x8F([\xCE\xB1-\xCF\x89]*)\xE2\x8A\x90" + /* "℡⊏([α-ω]*)⊐" */)); string matched; - EXPECT_FALSE(reg_exp->Match("℡⊏123⊐", true, &matched)); - EXPECT_TRUE(reg_exp->Match("℡⊏αβ⊐", true, &matched)); - EXPECT_EQ("αβ", matched); + EXPECT_FALSE(reg_exp->Match( + "\xE2\x84\xA1\xE2\x8A\x8F" "123\xE2\x8A\x90" /* "℡⊏123⊐" */, true, + &matched)); + EXPECT_TRUE(reg_exp->Match( + "\xE2\x84\xA1\xE2\x8A\x8F\xCE\xB1\xCE\xB2\xE2\x8A\x90" + /* "℡⊏αβ⊐" */, true, &matched)); + EXPECT_EQ("\xCE\xB1\xCE\xB2" /* "αβ" */, matched); } } // namespace phonenumbers |