diff options
author | roubert@google.com <roubert@google.com@38ededc0-08b8-5190-f2ac-b31f878777ad> | 2014-03-21 12:22:07 +0000 |
---|---|---|
committer | roubert@google.com <roubert@google.com@38ededc0-08b8-5190-f2ac-b31f878777ad> | 2014-03-21 12:22:07 +0000 |
commit | c3c546166647eeda301711d0ee83d4f7187f3a9c (patch) | |
tree | c6612d70ea8bdf37d083c1a8bd6c60961a4eceac | |
parent | aa04d02df8665c81b05c057ddfd22d6ef8606405 (diff) | |
download | src-c3c546166647eeda301711d0ee83d4f7187f3a9c.tar.gz |
Add the PostBoxMatchers helper class.
This class contains regular expressions to match post office boxes, and
a function to get the appropriate list of these to use for a particular
country.
git-svn-id: http://libaddressinput.googlecode.com/svn/trunk@201 38ededc0-08b8-5190-f2ac-b31f878777ad
-rw-r--r-- | cpp/libaddressinput.gyp | 2 | ||||
-rw-r--r-- | cpp/src/post_box_matchers.cc | 140 | ||||
-rw-r--r-- | cpp/src/post_box_matchers.h | 38 | ||||
-rw-r--r-- | cpp/test/post_box_matchers_test.cc | 87 |
4 files changed, 267 insertions, 0 deletions
diff --git a/cpp/libaddressinput.gyp b/cpp/libaddressinput.gyp index 4b9304c..bd64cc7 100644 --- a/cpp/libaddressinput.gyp +++ b/cpp/libaddressinput.gyp @@ -41,6 +41,7 @@ 'src/lookup_key_util.cc', 'src/metadata_loader.cc', 'src/null_storage.cc', + 'src/post_box_matchers.cc', 'src/region_data_constants.cc', 'src/retriever.cc', 'src/rule.cc', @@ -75,6 +76,7 @@ 'test/lookup_key_util_test.cc', 'test/metadata_loader_test.cc', 'test/null_storage_test.cc', + 'test/post_box_matchers_test.cc', 'test/region_data_constants_test.cc', 'test/retriever_test.cc', 'test/rule_retriever_test.cc', diff --git a/cpp/src/post_box_matchers.cc b/cpp/src/post_box_matchers.cc new file mode 100644 index 0000000..fd0602a --- /dev/null +++ b/cpp/src/post_box_matchers.cc @@ -0,0 +1,140 @@ +// Copyright (C) 2014 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "util/re2ptr.h" // Must be the first #include statement! + +#include "post_box_matchers.h" + +#include <algorithm> +#include <cctype> +#include <cstddef> +#include <map> +#include <string> +#include <utility> +#include <vector> + +#include "rule.h" + +namespace i18n { +namespace addressinput { + +namespace { + +std::map<std::string, const RE2ptr*> InitMatchers() { + static const struct { + const char* const language; + const RE2ptr ptr; + } kMatchers[] = { + { "ar", + /* "صندوق بريد|ص[-. ]ب" */ + new RE2("\xD8\xB5\xD9\x86\xD8\xAF\xD9\x88\xD9\x82 " + "\xD8\xA8\xD8\xB1\xD9\x8A\xD8\xAF|\xD8\xB5[-. 
]\xD8\xA8") }, + + { "cs", new RE2("(?i)p\\.? ?p\\.? \\d") }, + { "da", new RE2("(?i)Postboks") }, + { "de", new RE2("(?i)Postfach") }, + + { "el", + /* "T\\.? ?Θ\\.? \\d{2}" */ + new RE2("(?i)T\\.? ?\xCE\x98\\.? \\d{2}") }, + + { "en", new RE2("Private Bag|Post(?:al)? Box") }, + { "es", new RE2("(?i)(?:Apartado|Casillas) de correos?") }, + { "fi", new RE2("(?i)Postilokero|P\\.?L\\.? \\d") }, + { "hr", new RE2("(?i)p\\.? ?p\\.? \\d") }, + + { "hu", + /* "Postafi(?:[oó]|ó)k|Pf\\.? \\d" */ + new RE2("(?i)Postafi(?:[o\xC3\xB3]|o\xCC\x81)k|Pf\\.? \\d") }, + + { "fr", + /* "Bo(?:[iî]|î)te Postale|BP \\d|CEDEX \\d" */ + new RE2("(?i)Bo(?:[i\xC3\xAE]|i\xCC\x82)te Postale|BP \\d|CEDEX \\d") }, + + { "ja", + /* "私書箱\\d{1,5}号" */ + new RE2("(?i)\xE7\xA7\x81\xE6\x9B\xB8\xE7\xAE\xB1\\d{1,5}\xE5\x8F\xB7") }, + + { "nl", new RE2("(?i)Postbus") }, + { "no", new RE2("(?i)Postboks") }, + { "pl", new RE2("(?i)Skr(?:\\.?|ytka) poczt(?:\\.?|owa)") }, + { "pt", new RE2("(?i)Apartado") }, + + { "ru", + /* "абонентский ящик|[аa]\\\" */ + new RE2("(?i)\xD0\xB0\xD0\xB1\xD0\xBE\xD0\xBD\xD0\xB5\xD0\xBD\xD1\x82\xD1" + "\x81\xD0\xBA\xD0\xB8\xD0\xB9 \xD1\x8F\xD1\x89\xD0\xB8\xD0\xBA|" + "[\xD0\xB0""a]\\\"\xD1\x8F (?:(?:\xE2\x84\x96|#|N) ?)?\\d") }, + + { "sv", new RE2("(?i)Box \\d") }, + + { "zh", + /* "郵政信箱.{1,5}號|郵局第.{1,10}號信箱" */ + new RE2("(?i)\xE9\x83\xB5\xE6\x94\xBF\xE4\xBF\xA1\xE7\xAE\xB1.{1,5}" + "\xE8\x99\x9F|\xE9\x83\xB5\xE5\xB1\x80\xE7\xAC\xAC.{1,10}" + "\xE8\x99\x9F\xE4\xBF\xA1\xE7\xAE\xB1") }, + + { "und", new RE2("P\\.? ?O\\.? Box") } + }; + + std::map<std::string, const RE2ptr*> matchers; + + for (size_t i = 0; i < sizeof kMatchers / sizeof *kMatchers; ++i) { + matchers.insert(std::make_pair(kMatchers[i].language, &kMatchers[i].ptr)); + } + + return matchers; +} + +std::string GetBaseLanguage(const std::string& language) { + // Be lenient in parsing, allow underscore separators and uppercase letters. 
+ std::string::size_type end = language.find_first_of("-_"); + std::string base( + end == std::string::npos ? language : language.substr(0, end)); + std::transform(base.begin(), base.end(), base.begin(), tolower); + return base; +} + +} // namespace + +// static +std::vector<const RE2ptr*> PostBoxMatchers::GetMatchers( + const Rule& country_rule) { + static const std::map<std::string, const RE2ptr*> kMatchers(InitMatchers()); + + // Always add any expressions defined for "und" (English-like defaults). + std::vector<std::string> languages(1, "und"); + for (std::vector<std::string>::const_iterator + it = country_rule.GetLanguages().begin(); + it != country_rule.GetLanguages().end(); ++it) { + languages.push_back(GetBaseLanguage(*it)); + } + + std::vector<const RE2ptr*> result; + + for (std::vector<std::string>::const_iterator + it = languages.begin(); + it != languages.end(); ++it) { + std::map<std::string, const RE2ptr*>::const_iterator + jt = kMatchers.find(*it); + if (jt != kMatchers.end()) { + result.push_back(jt->second); + } + } + + return result; +} + +} // namespace addressinput +} // namespace i18n diff --git a/cpp/src/post_box_matchers.h b/cpp/src/post_box_matchers.h new file mode 100644 index 0000000..8f9f44b --- /dev/null +++ b/cpp/src/post_box_matchers.h @@ -0,0 +1,38 @@ +// Copyright (C) 2014 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Post office box regular expressions. 

#ifndef I18N_ADDRESSINPUT_POST_BOX_MATCHERS_H_
#define I18N_ADDRESSINPUT_POST_BOX_MATCHERS_H_

#include <vector>

namespace i18n {
namespace addressinput {

class RE2ptr;
class Rule;

// Static helper giving access to the regular expressions that recognise a
// post office box in an address line.
class PostBoxMatchers {
 public:
  // Selects the RE2 regular expression objects that are relevant for the
  // languages of |country_rule| and returns pointers to them, for use in
  // testing address lines.
  static std::vector<const RE2ptr*> GetMatchers(const Rule& country_rule);
};

}  // namespace addressinput
}  // namespace i18n

#endif  // I18N_ADDRESSINPUT_POST_BOX_MATCHERS_H_

// diff --git a/cpp/test/post_box_matchers_test.cc b/cpp/test/post_box_matchers_test.cc
// new file mode 100644
// index 0000000..d89ee0b
// --- /dev/null
// +++ b/cpp/test/post_box_matchers_test.cc
// @@ -0,0 +1,87 @@

// Copyright (C) 2014 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +#include "post_box_matchers.h" + +#include <cstddef> +#include <vector> + +#include "rule.h" + +#include <gtest/gtest.h> + +namespace i18n { +namespace addressinput { +class RE2ptr; +} // namespace addressinput +} // namespace i18n + +namespace { + +using i18n::addressinput::PostBoxMatchers; +using i18n::addressinput::RE2ptr; +using i18n::addressinput::Rule; + +TEST(PostBoxMatchersTest, AlwaysGetMatcherForLanguageUnd) { + Rule rule; + std::vector<const RE2ptr*> matchers = PostBoxMatchers::GetMatchers(rule); + EXPECT_EQ(1, matchers.size()); + EXPECT_TRUE(matchers[0] != NULL); +} + +TEST(PostBoxMatchersTest, NoMatcherForInvalidLanguage) { + Rule rule; + ASSERT_TRUE(rule.ParseSerializedRule("{\"languages\":\"xx\"}")); + std::vector<const RE2ptr*> matchers = PostBoxMatchers::GetMatchers(rule); + EXPECT_EQ(1, matchers.size()); + EXPECT_TRUE(matchers[0] != NULL); +} + +TEST(PostBoxMatchersTest, HasMatcherForValidLanguage) { + Rule rule; + ASSERT_TRUE(rule.ParseSerializedRule("{\"languages\":\"sv\"}")); + std::vector<const RE2ptr*> matchers = PostBoxMatchers::GetMatchers(rule); + EXPECT_EQ(2, matchers.size()); + EXPECT_TRUE(matchers[0] != NULL); + EXPECT_TRUE(matchers[1] != NULL); +} + +TEST(PostBoxMatchersTest, MixValidAndInvalidLanguage) { + Rule rule; + ASSERT_TRUE(rule.ParseSerializedRule("{\"languages\":\"xx~sv\"}")); + std::vector<const RE2ptr*> matchers = PostBoxMatchers::GetMatchers(rule); + EXPECT_EQ(2, matchers.size()); + EXPECT_TRUE(matchers[0] != NULL); + EXPECT_TRUE(matchers[1] != NULL); +} + +TEST(PostBoxMatchersTest, UseBaseLanguageForMatching) { + Rule rule; + ASSERT_TRUE(rule.ParseSerializedRule("{\"languages\":\"sv-SE\"}")); + std::vector<const RE2ptr*> matchers = PostBoxMatchers::GetMatchers(rule); + EXPECT_EQ(2, matchers.size()); + EXPECT_TRUE(matchers[0] != NULL); + EXPECT_TRUE(matchers[1] != NULL); +} + +TEST(PostBoxMatchersTest, LenientLanguageTagParsing) { + Rule rule; + ASSERT_TRUE(rule.ParseSerializedRule("{\"languages\":\"SV_SE\"}")); + 
std::vector<const RE2ptr*> matchers = PostBoxMatchers::GetMatchers(rule); + EXPECT_EQ(2, matchers.size()); + EXPECT_TRUE(matchers[0] != NULL); + EXPECT_TRUE(matchers[1] != NULL); +} + +} // namespace |