Determine language tag and order of UI components.

This patch changes the BuildComponents() function to read the UI language tag and determine the language tag that should be stored with the address data. The function also chooses between the default and the Latin order of UI components in the address form. A language tag is a sequence of one or more subtags separated by hyphens ("-"). All characters are ASCII and case-insensitive. The first subtag is the base language, for example "zh". The first four-character subtag is the script, for example "Hans".

git-svn-id: http://libaddressinput.googlecode.com/svn/trunk@208 38ededc0-08b8-5190-f2ac-b31f878777ad
author: rouslan@chromium.org <rouslan@chromium.org@38ededc0-08b8-5190-f2ac-b31f878777ad> 2014-05-14 07:47:13 +0000
committer: rouslan@chromium.org <rouslan@chromium.org@38ededc0-08b8-5190-f2ac-b31f878777ad> 2014-05-14 07:47:13 +0000
commit: 947ce24396bbb66ec5fcd14a73f85f4e32c3dcc0 (patch)
tree: 5690ec7daed768b42524fe15b0193afdc4353b67 /cpp
parent: c9cd8929fc3b9ff6918b9246220ea645cae78185 (diff)
download: src-947ce24396bbb66ec5fcd14a73f85f4e32c3dcc0.tar.gz
14 files changed, 440 insertions, 41 deletions
diff --git a/cpp/include/libaddressinput/address_ui.h b/cpp/include/libaddressinput/address_ui.h
index 35d2ede..8939501 100644
--- a/cpp/include/libaddressinput/address_ui.h
+++ b/cpp/include/libaddressinput/address_ui.h
@@ -28,10 +28,19 @@ struct AddressUiComponent;
 const std::vector<std::string>& GetRegionCodes();
 
 // Returns the UI components for the CLDR |region_code|. Uses the strings from
-// |localization|. Returns an empty vector on error.
+// |localization|. The components can be in default or Latin order, depending on
+// the language of |localization|.
+//
+// Sets the |best_address_language_tag| to the BCP 47 language tag that should
+// be saved with this address. This language will be used to get drop-downs to
+// help users fill in their address, and to format the address that the user
+// entered. The parameter should not be NULL.
+//
+// Returns an empty vector on error.
 std::vector<AddressUiComponent> BuildComponents(
     const std::string& region_code,
-    const Localization& localization);
+    const Localization& localization,
+    std::string* best_address_language_tag);
 
 }  // namespace addressinput
 }  // namespace i18n
diff --git a/cpp/include/libaddressinput/localization.h b/cpp/include/libaddressinput/localization.h
index 6b599c3..77a1917 100644
--- a/cpp/include/libaddressinput/localization.h
+++ b/cpp/include/libaddressinput/localization.h
@@ -23,12 +23,14 @@ namespace addressinput {
 // The object to retrieve localized strings based on message IDs. Sample usage:
 //    Localization localization;
 //    localization.SetLanguage("en");
-//    Process(BuildComponents("CA", localization));
+//    std::string best_language_tag;
+//    Process(BuildComponents("CA", localization, &best_language_tag));
 //
 // Alternative usage:
 //    Localization localization;
-//    localization.SetGetter(&MyStringGetter);
-//    Process(BuildComponents("CA", localization));
+//    localization.SetGetter(&MyStringGetter, "fr");
+//    std::string best_language_tag;
+//    Process(BuildComponents("CA", localization, &best_language_tag));
 class Localization {
  public:
   // Initializes with English messages by default.
@@ -41,15 +43,22 @@ class Localization {
 
   // Sets the language for the strings. The only supported language is "en"
   // until we have translations.
-  void SetLanguage(const std::string& language_code);
+  void SetLanguage(const std::string& language_tag);
 
   // Sets the string getter that takes a message identifier and returns the
-  // corresponding localized string.
-  void SetGetter(std::string (*getter)(int));
+  // corresponding localized string. The |language_tag| parameter is used only
+  // for information purposes here.
+  void SetGetter(std::string (*getter)(int), const std::string& language_tag);
+
+  // Returns the current language tag.
+  const std::string& GetLanguage() const { return language_tag_; }
 
  private:
   // The string getter.
   std::string (*get_string_)(int);
+
+  // The current language tag.
+  std::string language_tag_;
 };
 
 }  // namespace addressinput
diff --git a/cpp/libaddressinput.gyp b/cpp/libaddressinput.gyp
index e57bda0..b4ab884 100644
--- a/cpp/libaddressinput.gyp
+++ b/cpp/libaddressinput.gyp
@@ -37,6 +37,7 @@
         'src/address_problem.cc',
         'src/address_ui.cc',
         'src/address_validator.cc',
+        'src/language.cc',
         'src/localization.cc',
         'src/lookup_key.cc',
         'src/lookup_key_util.cc',
@@ -75,6 +76,7 @@
         'test/fake_downloader_test.cc',
         'test/fake_storage.cc',
         'test/fake_storage_test.cc',
+        'test/language_test.cc',
         'test/localization_test.cc',
         'test/lookup_key_test.cc',
         'test/lookup_key_util_test.cc',
diff --git a/cpp/src/address_ui.cc b/cpp/src/address_ui.cc
index 15f3c61..0eb28fc 100644
--- a/cpp/src/address_ui.cc
+++ b/cpp/src/address_ui.cc
@@ -18,12 +18,15 @@
 #include <libaddressinput/address_ui_component.h>
 #include <libaddressinput/localization.h>
 
+#include <cassert>
+#include <cstddef>
 #include <set>
 #include <string>
 #include <vector>
 
 #include "address_field_util.h"
 #include "grit.h"
+#include "language.h"
 #include "messages.h"
 #include "region_data_constants.h"
 #include "rule.h"
@@ -33,6 +36,41 @@ namespace addressinput {
 
 namespace {
 
+Language ChooseBestAddressLanguage(
+    const std::vector<Language>& available_languages,
+    bool has_latin_format,
+    const Language& ui_language) {
+  if (available_languages.empty()) {
+    return ui_language;
+  }
+
+  if (ui_language.tag.empty()) {
+    return available_languages.front();
+  }
+
+  // The conventionally formatted BCP 47 Latin script with a preceding subtag
+  // separator.
+  static const char kLatinScriptSuffix[] = "-Latn";
+  Language latin_script_language(
+      available_languages.front().base + kLatinScriptSuffix);
+  if (has_latin_format && ui_language.has_latin_script) {
+    return latin_script_language;
+  }
+
+  for (std::vector<Language>::const_iterator
+       available_lang_it = available_languages.begin();
+       available_lang_it != available_languages.end(); ++available_lang_it) {
+    // Base language comparison works because no region supports the same base
+    // language with different scripts, for now. For example, no region supports
+    // "zh-Hant" and "zh-Hans" at the same time.
+    if (ui_language.base == available_lang_it->base) {
+      return *available_lang_it;
+    }
+  }
+
+  return has_latin_format ? latin_script_language : available_languages.front();
+}
+
 int GetMessageIdForField(AddressField field,
                          int admin_area_name_message_id,
                          int postal_code_name_message_id) {
@@ -71,7 +109,9 @@ const std::vector<std::string>& GetRegionCodes() {
 
 std::vector<AddressUiComponent> BuildComponents(
     const std::string& region_code,
-    const Localization& localization) {
+    const Localization& localization,
+    std::string* best_address_language_tag) {
+  assert(best_address_language_tag != NULL);
   std::vector<AddressUiComponent> result;
 
   Rule rule;
@@ -81,15 +121,31 @@ std::vector<AddressUiComponent> BuildComponents(
     return result;
   }
 
+  std::vector<Language> available_languages;
+  for (std::vector<std::string>::const_iterator language_tag_it =
+       rule.GetLanguages().begin();
+       language_tag_it != rule.GetLanguages().end(); ++language_tag_it) {
+    available_languages.push_back(Language(*language_tag_it));
+  }
+
+  const Language& best_address_language = ChooseBestAddressLanguage(
+      available_languages, !rule.GetLatinFormat().empty(),
+      Language(localization.GetLanguage()));
+  *best_address_language_tag = best_address_language.tag;
+
+  const std::vector<AddressField>& format =
+      !rule.GetLatinFormat().empty() &&
+      best_address_language.has_latin_script
+          ? rule.GetLatinFormat() : rule.GetFormat();
+
   // For avoiding showing an input field twice, when the field is displayed
   // twice on an envelope.
   std::set<AddressField> fields;
 
   bool previous_field_is_newline = true;
   bool next_field_is_newline = true;
-  for (std::vector<AddressField>::const_iterator field_it =
-       rule.GetFormat().begin();
-       field_it != rule.GetFormat().end(); ++field_it) {
+  for (std::vector<AddressField>::const_iterator field_it = format.begin();
+       field_it != format.end(); ++field_it) {
     if (IsNewline(*field_it)) {
       previous_field_is_newline = true;
       continue;
diff --git a/cpp/src/language.cc b/cpp/src/language.cc
new file mode 100644
index 0000000..5c212dd
--- /dev/null
+++ b/cpp/src/language.cc
@@ -0,0 +1,58 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "language.h"
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+#include <vector>
+
+#include "util/string_split.h"
+
+namespace i18n {
+namespace addressinput {
+
+Language::Language(const std::string& language_tag) : tag(language_tag),
+                                                      base(),
+                                                      has_latin_script(false) {
+  // Character '-' is the separator for subtags in the BCP 47. However, some
+  // legacy code generates tags with '_' instead of '-'.
+  static const char kSubtagsSeparator = '-';
+  static const char kAlternativeSubtagsSeparator = '_';
+  std::replace(tag.begin(), tag.end(), kAlternativeSubtagsSeparator,
+               kSubtagsSeparator);
+
+  // OK to use 'tolower' because BCP 47 tags are always in ASCII.
+  std::string lowercase = tag;
+  std::transform(lowercase.begin(), lowercase.end(), lowercase.begin(),
+                 tolower);
+
+  base = lowercase.substr(0, lowercase.find(kSubtagsSeparator));
+
+  // The lowercase BCP 47 subtag for Latin script.
+  static const char kLowercaseLatinScript[] = "latn";
+  std::vector<std::string> subtags;
+  SplitString(lowercase, kSubtagsSeparator, &subtags);
+
+  // Support only the second and third position for the script.
+  has_latin_script =
+      (subtags.size() > 1 && subtags[1] == kLowercaseLatinScript) ||
+      (subtags.size() > 2 && subtags[2] == kLowercaseLatinScript);
+}
+
+Language::~Language() {}
+
+}  // namespace addressinput
+}  // namespace i18n
diff --git a/cpp/src/language.h b/cpp/src/language.h
new file mode 100644
index 0000000..f2cc447
--- /dev/null
+++ b/cpp/src/language.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef I18N_ADDRESSINPUT_LANGUAGE_H_
+#define I18N_ADDRESSINPUT_LANGUAGE_H_
+
+#include <string>
+
+namespace i18n {
+namespace addressinput {
+
+// Helper for working with a BCP 47 language tag.
+// http://tools.ietf.org/html/bcp47
+struct Language {
+  explicit Language(const std::string& language_tag);
+  ~Language();
+
+  // The language tag (with '_' replaced with '-'), for example "zh-Latn-CN".
+  std::string tag;
+
+  // The base language, for example "zh". Always lowercase.
+  std::string base;
+
+  // True if the language tag explicitly has a Latin script. For example, this
+  // is true for "zh-Latn", but false for "zh". Only the second and third subtag
+  // positions are supported for script.
+  bool has_latin_script;
+};
+
+}  // namespace addressinput
+}  // namespace i18n
+
+#endif  // I18N_ADDRESSINPUT_LANGUAGE_H_
diff --git a/cpp/src/localization.cc b/cpp/src/localization.cc
index 3fd32af..1558b23 100644
--- a/cpp/src/localization.cc
+++ b/cpp/src/localization.cc
@@ -23,7 +23,9 @@ namespace addressinput {
 
 namespace {
 
-// For each language code XX with translations:
+static const char kDefaultLanguage[] = "en";
+
+// For each language XX with translations:
 //    (1) Add a namespace XX here with an include of "XX_messages.cc".
 //    (2) Add a wrapper that converts the char pointer to std::string. (GRIT
 //        generated functions return char pointers.)
@@ -41,7 +43,8 @@ std::string GetStdString(int message_id) {
 
 }  // namespace
 
-Localization::Localization() : get_string_(&en::GetStdString) {}
+Localization::Localization() : get_string_(&en::GetStdString),
+                               language_tag_(kDefaultLanguage) {}
 
 Localization::~Localization() {}
 
@@ -49,17 +52,20 @@ std::string Localization::GetString(int message_id) const {
   return get_string_(message_id);
 }
 
-void Localization::SetLanguage(const std::string& language_code) {
-  if (language_code == "en") {
+void Localization::SetLanguage(const std::string& language_tag) {
+  if (language_tag == kDefaultLanguage) {
     get_string_ = &en::GetStdString;
   } else {
     assert(false);
   }
+  language_tag_ = language_tag;
 }
 
-void Localization::SetGetter(std::string (*getter)(int)) {
+void Localization::SetGetter(std::string (*getter)(int),
+                             const std::string& language_tag) {
   assert(getter != NULL);
   get_string_ = getter;
+  language_tag_ = language_tag;
 }
 
 }  // namespace addressinput
diff --git a/cpp/src/post_box_matchers.cc b/cpp/src/post_box_matchers.cc
index fd0602a..95ee375 100644
--- a/cpp/src/post_box_matchers.cc
+++ b/cpp/src/post_box_matchers.cc
@@ -16,14 +16,13 @@
 
 #include "post_box_matchers.h"
 
-#include <algorithm>
-#include <cctype>
 #include <cstddef>
 #include <map>
 #include <string>
 #include <utility>
 #include <vector>
 
+#include "language.h"
 #include "rule.h"
 
 namespace i18n {
@@ -97,15 +96,6 @@ std::map<std::string, const RE2ptr*> InitMatchers() {
   return matchers;
 }
 
-std::string GetBaseLanguage(const std::string& language) {
-  // Be lenient in parsing, allow underscore separators and uppercase letters.
-  std::string::size_type end = language.find_first_of("-_");
-  std::string base(
-      end == std::string::npos ? language : language.substr(0, end));
-  std::transform(base.begin(), base.end(), base.begin(), tolower);
-  return base;
-}
-
 } // namespace
 
 // static
@@ -118,7 +108,8 @@ std::vector<const RE2ptr*> PostBoxMatchers::GetMatchers(
   for (std::vector<std::string>::const_iterator
        it = country_rule.GetLanguages().begin();
        it != country_rule.GetLanguages().end(); ++it) {
-    languages.push_back(GetBaseLanguage(*it));
+    Language language(*it);
+    languages.push_back(language.base);
   }
 
   std::vector<const RE2ptr*> result;
diff --git a/cpp/src/rule.cc b/cpp/src/rule.cc
index ebd5be5..48ec3dc 100644
--- a/cpp/src/rule.cc
+++ b/cpp/src/rule.cc
@@ -40,10 +40,11 @@ typedef std::map<std::string, int> NameMessageIdMap;
 const char kAdminAreaNameTypeKey[] = "state_name_type";
 const char kFormatKey[] = "fmt";
 const char kIdKey[] = "id";
+const char kLanguagesKey[] = "languages";
+const char kLatinFormatKey[] = "lfmt";
 const char kPostalCodeNameTypeKey[] = "zip_name_type";
 const char kRequireKey[] = "require";
 const char kSubKeysKey[] = "sub_keys";
-const char kLanguagesKey[] = "languages";
 const char kZipKey[] = "zip";
 
 // Used as a separator in a list of items. For example, the list of supported
@@ -108,6 +109,7 @@ int GetMessageIdFromName(const std::string& name,
 Rule::Rule()
     : id_(),
       format_(),
+      latin_format_(),
       required_(),
       sub_keys_(),
       languages_(),
@@ -132,6 +134,7 @@ const Rule& Rule::GetDefault() {
 void Rule::CopyFrom(const Rule& rule) {
   id_ = rule.id_;
   format_ = rule.format_;
+  latin_format_ = rule.latin_format_;
   required_ = rule.required_;
   sub_keys_ = rule.sub_keys_;
   languages_ = rule.languages_;
@@ -158,6 +161,11 @@ bool Rule::ParseSerializedRule(const std::string& serialized_rule) {
     ParseAddressFieldsFormat(json.GetStringValueForKey(kFormatKey), &format_);
   }
 
+  if (json.HasStringValueForKey(kLatinFormatKey)) {
+    ParseAddressFieldsFormat(
+        json.GetStringValueForKey(kLatinFormatKey), &latin_format_);
+  }
+
   if (json.HasStringValueForKey(kRequireKey)) {
     ParseAddressFieldsRequired(
         json.GetStringValueForKey(kRequireKey), &required_);
diff --git a/cpp/src/rule.h b/cpp/src/rule.h
index dbba9f1..bd74784 100644
--- a/cpp/src/rule.h
+++ b/cpp/src/rule.h
@@ -58,10 +58,16 @@ class Rule {
   // Returns the ID string for this rule.
   const std::string& GetId() const { return id_; };
 
-  // Returns the address format for this rule. The format can include the
-  // NEWLINE extension for AddressField enum.
+  // Returns the approximate address format with the default order of fields.
+  // The format can include the NEWLINE extension for AddressField enum.
   const std::vector<AddressField>& GetFormat() const { return format_; }
 
+  // Returns the approximate address format with the Latin order of fields. The
+  // format can include the NEWLINE extension for AddressField enum.
+  const std::vector<AddressField>& GetLatinFormat() const {
+    return latin_format_;
+  }
+
   // Returns the required fields for this rule.
   const std::vector<AddressField>& GetRequired() const { return required_; }
 
@@ -71,8 +77,8 @@ class Rule {
   // "CA", "NY", "TX", etc.
   const std::vector<std::string>& GetSubKeys() const { return sub_keys_; }
 
-  // Returns all of the language codes for which this rule has custom rules, for
-  // example ["de", "fr", "it"].
+  // Returns all of the language tags supported by this rule, for example ["de",
+  // "fr", "it"].
   const std::vector<std::string>& GetLanguages() const { return languages_; }
 
   // Returns a pointer to a RE2 regular expression object created from the
@@ -97,6 +103,7 @@ class Rule {
  private:
   std::string id_;
   std::vector<AddressField> format_;
+  std::vector<AddressField> latin_format_;
   std::vector<AddressField> required_;
   std::vector<std::string> sub_keys_;
   std::vector<std::string> languages_;
diff --git a/cpp/test/address_ui_test.cc b/cpp/test/address_ui_test.cc
index b8cb834..b36ea8c 100644
--- a/cpp/test/address_ui_test.cc
+++ b/cpp/test/address_ui_test.cc
@@ -26,13 +26,16 @@
 
 namespace {
 
+using i18n::addressinput::ADMIN_AREA;
 using i18n::addressinput::AddressField;
 using i18n::addressinput::AddressUiComponent;
 using i18n::addressinput::BuildComponents;
 using i18n::addressinput::COUNTRY;
 using i18n::addressinput::GetRegionCodes;
 using i18n::addressinput::Localization;
+using i18n::addressinput::POSTAL_CODE;
 using i18n::addressinput::RECIPIENT;
+using i18n::addressinput::STREET_ADDRESS;
 
 // Returns testing::AssertionSuccess if the |components| are valid. Uses
 // |region_code| in test failure messages.
@@ -66,6 +69,7 @@ testing::AssertionResult ComponentsAreValid(
 class AddressUiTest : public testing::TestWithParam<std::string> {
  protected:
   Localization localization_;
+  std::string best_address_language_tag_;
 };
 
 // Verifies that a region code consists of two characters, for example "TW".
@@ -76,14 +80,15 @@ TEST_P(AddressUiTest, RegionCodeHasTwoCharacters) {
 // Verifies that BuildComponents() returns valid UI components for a region
 // code.
 TEST_P(AddressUiTest, ComponentsAreValid) {
-  EXPECT_TRUE(ComponentsAreValid(BuildComponents(GetParam(), localization_)));
+  EXPECT_TRUE(ComponentsAreValid(BuildComponents(
+      GetParam(), localization_, &best_address_language_tag_)));
 }
 
 // Verifies that BuildComponents() returns at most one input field of each type.
 TEST_P(AddressUiTest, UniqueFieldTypes) {
   std::set<AddressField> fields;
   const std::vector<AddressUiComponent>& components =
-      BuildComponents(GetParam(), localization_);
+      BuildComponents(GetParam(), localization_, &best_address_language_tag_);
   for (std::vector<AddressUiComponent>::const_iterator it = components.begin();
        it != components.end(); ++it) {
     EXPECT_TRUE(fields.insert(it->field).second);
@@ -98,7 +103,130 @@ INSTANTIATE_TEST_CASE_P(
 // Verifies that BuildComponents() returns an empty vector for an invalid region
 // code.
 TEST_F(AddressUiTest, InvalidRegionCodeReturnsEmptyVector) {
-  EXPECT_TRUE(BuildComponents("INVALID-REGION-CODE", localization_).empty());
+  EXPECT_TRUE(BuildComponents("INVALID-REGION-CODE", localization_,
+                              &best_address_language_tag_).empty());
 }
 
+// Test data for determining the best language tag and whether the right format
+// pattern was used (fmt vs lfmt).
+struct LanguageTestCase {
+  LanguageTestCase(const std::string& region_code,
+                   const std::string& ui_language_tag,
+                   const std::string& expected_best_address_language_tag,
+                   AddressField expected_first_field)
+      : region_code(region_code),
+        ui_language_tag(ui_language_tag),
+        expected_best_address_language_tag(expected_best_address_language_tag),
+        expected_first_field(expected_first_field) {}
+
+  ~LanguageTestCase() {}
+
+  // The CLDR region code to test.
+  const std::string region_code;
+
+  // The BCP 47 language tag to test.
+  const std::string ui_language_tag;
+
+  // The expected value for the best language tag returned by BuildComponents().
+  const std::string expected_best_address_language_tag;
+
+  // The first field expected to be returned from BuildComponents(). Useful for
+  // determining whether the returned format is in Latin or default order.
+  const AddressField expected_first_field;
+};
+
+class BestAddressLanguageTagTest
+    : public testing::TestWithParam<LanguageTestCase> {
+ protected:
+  Localization localization_;
+  std::string best_address_language_tag_;
+};
+
+std::string GetterStub(int) { return std::string(); }
+
+TEST_P(BestAddressLanguageTagTest, CorrectBestAddressLanguageTag) {
+  localization_.SetGetter(&GetterStub, GetParam().ui_language_tag);
+  const std::vector<AddressUiComponent>& components = BuildComponents(
+      GetParam().region_code, localization_, &best_address_language_tag_);
+  EXPECT_EQ(GetParam().expected_best_address_language_tag,
+            best_address_language_tag_);
+  ASSERT_FALSE(components.empty());
+  EXPECT_EQ(GetParam().expected_first_field, components.front().field);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    LanguageTestCases, BestAddressLanguageTagTest,
+    testing::Values(
+        // Armenia supports hy and has a Latin format.
+        LanguageTestCase("AM", "", "hy", RECIPIENT),
+        LanguageTestCase("AM", "hy", "hy", RECIPIENT),
+        LanguageTestCase("AM", "en", "hy-Latn", RECIPIENT),
+
+        // P.R. China supports zh-Hans and has a Latin format.
+        LanguageTestCase("CN", "zh-hans", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-hant", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-hans-CN", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "ZH_HANS", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-cmn-Hans-CN", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-Latn", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "zh-latn-CN", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "en", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "ja", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "ko", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "ZH_LATN", "zh-Latn", RECIPIENT),
+        // Libaddressinput does not have information about extended language
+        // subtags, so it uses the zh-Latn language tag for all base languages
+        // that are not zh, even if it's effectively the same language.
+        // Mandarin Chinese, Simplified script, as used in China:
+        LanguageTestCase("CN", "cmn-Hans-CN", "zh-Latn", RECIPIENT),
+
+        // Hong Kong supports zh-Hant and en. It has a Latin format.
+        LanguageTestCase("HK", "zh", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-hans", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-hant", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-yue-HK", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "en", "en", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-latn", "zh-Latn", RECIPIENT),
+        LanguageTestCase("HK", "fr", "zh-Latn", RECIPIENT),
+        LanguageTestCase("HK", "ja", "zh-Latn", RECIPIENT),
+        LanguageTestCase("HK", "ko", "zh-Latn", RECIPIENT),
+        // Libaddressinput does not have information about extended language
+        // subtags, so it uses the zh-Latn language tag for all base languages
+        // that are not zh or en, even if it's effectively the same language.
+        // Cantonese Chinese, as used in Hong Kong:
+        LanguageTestCase("HK", "yue-HK", "zh-Latn", RECIPIENT),
+
+        // Macao supports zh-Hant and pt. It has a Latin format.
+        LanguageTestCase("MO", "zh", "zh-Hant", STREET_ADDRESS),
+        LanguageTestCase("MO", "zh-Hant", "zh-Hant", STREET_ADDRESS),
+        LanguageTestCase("MO", "pt", "pt", STREET_ADDRESS),
+        LanguageTestCase("MO", "zh-Latn", "zh-Latn", RECIPIENT),
+        LanguageTestCase("MO", "en", "zh-Latn", RECIPIENT),
+
+        // Switzerland supports de, fr, and it.
+        LanguageTestCase("CH", "de", "de", RECIPIENT),
+        LanguageTestCase("CH", "de-DE", "de", RECIPIENT),
+        LanguageTestCase("CH", "de-Latn-DE", "de", RECIPIENT),
+        LanguageTestCase("CH", "fr", "fr", RECIPIENT),
+        LanguageTestCase("CH", "it", "it", RECIPIENT),
+        LanguageTestCase("CH", "en", "de", RECIPIENT),
+
+        // Antarctica does not have language information.
+        LanguageTestCase("AQ", "en", "en", RECIPIENT),
+        LanguageTestCase("AQ", "fr", "fr", RECIPIENT),
+        LanguageTestCase("AQ", "es", "es", RECIPIENT),
+        LanguageTestCase("AQ", "zh-Hans", "zh-Hans", RECIPIENT),
+
+        // Egypt supports ar and has a Latin format.
+        LanguageTestCase("EG", "ar", "ar", RECIPIENT),
+        LanguageTestCase("EG", "ar-Arab", "ar", RECIPIENT),
+        LanguageTestCase("EG", "ar-Latn", "ar-Latn", RECIPIENT),
+        LanguageTestCase("EG", "fr", "ar-Latn", RECIPIENT),
+        LanguageTestCase("EG", "fa", "ar-Latn", RECIPIENT),
+        // Libaddressinput does not have language-to-script mapping, so it uses
+        // the ar-Latn language tag for all base languages that are not ar, even
+        // if the script is the same.
+        LanguageTestCase("EG", "fa-Arab", "ar-Latn", RECIPIENT)));
+
 }  // namespace
diff --git a/cpp/test/language_test.cc b/cpp/test/language_test.cc
new file mode 100644
index 0000000..197459e
--- /dev/null
+++ b/cpp/test/language_test.cc
@@ -0,0 +1,62 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "language.h"
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+namespace {
+
+using i18n::addressinput::Language;
+
+struct LanguageTestCase {
+  LanguageTestCase(const std::string& input_language_tag,
+                   const std::string& expected_language_tag,
+                   const std::string& expected_base_language,
+                   bool expected_has_latin_script)
+      : input_language_tag(input_language_tag),
+        expected_language_tag(expected_language_tag),
+        expected_base_language(expected_base_language),
+        expected_has_latin_script(expected_has_latin_script) {}
+
+  ~LanguageTestCase() {}
+
+  const std::string input_language_tag;
+  const std::string expected_language_tag;
+  const std::string expected_base_language;
+  const bool expected_has_latin_script;
+};
+
+class LanguageTest : public testing::TestWithParam<LanguageTestCase> {};
+
+TEST_P(LanguageTest, ExtractedDataIsCorrect) {
+  Language language(GetParam().input_language_tag);
+  EXPECT_EQ(GetParam().expected_language_tag, language.tag);
+  EXPECT_EQ(GetParam().expected_base_language, language.base);
+  EXPECT_EQ(GetParam().expected_has_latin_script, language.has_latin_script);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    LanguageTestCases, LanguageTest,
+    testing::Values(
+        LanguageTestCase("", "", "", false),
+        LanguageTestCase("en", "en", "en", false),
+        LanguageTestCase("zh-Latn-CN", "zh-Latn-CN", "zh", true),
+        LanguageTestCase("zh-cmn-Latn-CN", "zh-cmn-Latn-CN", "zh", true),
+        LanguageTestCase("zh-Hans", "zh-Hans", "zh", false),
+        LanguageTestCase("en_GB", "en-GB", "en", false)));
+
+}  // namespace
diff --git a/cpp/test/localization_test.cc b/cpp/test/localization_test.cc
index 6c8b3be..334068f 100644
--- a/cpp/test/localization_test.cc
+++ b/cpp/test/localization_test.cc
@@ -33,11 +33,13 @@ class LocalizationTest : public testing::TestWithParam<int> {
 };
 
 // Verifies that a custom message getter can be used.
-const char kValidMessage[] = "Data";
+static const char kValidMessage[] = "Data";
+static const char kValidLanguageTag[] = "tlh";
 std::string GetValidMessage(int message_id) { return kValidMessage; }
 TEST_P(LocalizationTest, ValidStringGetterCanBeUsed) {
-  localization_.SetGetter(&GetValidMessage);
+  localization_.SetGetter(&GetValidMessage, kValidLanguageTag);
   EXPECT_EQ(kValidMessage, localization_.GetString(GetParam()));
+  EXPECT_EQ(kValidLanguageTag, localization_.GetLanguage());
 }
 
 // Verifies that the default language for messages does not have empty strings.
@@ -45,8 +47,8 @@ TEST_P(LocalizationTest, DefaultStringIsNotEmpty) {
   EXPECT_FALSE(localization_.GetString(GetParam()).empty());
 }
 
-// Verifies that English is the default language.
-TEST_P(LocalizationTest, EnglishIsDefaultLanguage) {
+// Verifies that the default string is English.
+TEST_P(LocalizationTest, DefaultStringIsEnglish) {
   std::string default_string = localization_.GetString(GetParam());
   localization_.SetLanguage("en");
   EXPECT_EQ(default_string, localization_.GetString(GetParam()));
@@ -80,4 +82,9 @@ TEST_F(LocalizationTest, InvalidMessageIsEmptyString) {
   EXPECT_TRUE(localization_.GetString(INVALID_MESSAGE_ID).empty());
 }
 
+// Verifies that the default language is English.
+TEST_F(LocalizationTest, DefaultLanguageIsEnglish) {
+  EXPECT_EQ("en", localization_.GetLanguage());
+}
+
 }  // namespace
diff --git a/cpp/test/rule_test.cc b/cpp/test/rule_test.cc
index b9e671a..ca54006 100644
--- a/cpp/test/rule_test.cc
+++ b/cpp/test/rule_test.cc
@@ -43,6 +43,7 @@ TEST(RuleTest, CopyOverwritesRule) {
   Rule rule;
   ASSERT_TRUE(rule.ParseSerializedRule("{"
                                        "\"fmt\":\"%S%Z\","
+                                       "\"lfmt\":\"%Z%S\","
                                        "\"id\":\"data/XA\","
                                        "\"lname\":\"Testistan\","
                                        "\"require\":\"AC\","
@@ -55,6 +56,7 @@ TEST(RuleTest, CopyOverwritesRule) {
 
   Rule copy;
   EXPECT_NE(rule.GetFormat(), copy.GetFormat());
+  EXPECT_NE(rule.GetLatinFormat(), copy.GetLatinFormat());
   EXPECT_NE(rule.GetId(), copy.GetId());
   EXPECT_NE(rule.GetRequired(), copy.GetRequired());
   EXPECT_NE(rule.GetSubKeys(), copy.GetSubKeys());
@@ -69,6 +71,7 @@ TEST(RuleTest, CopyOverwritesRule) {
 
   copy.CopyFrom(rule);
   EXPECT_EQ(rule.GetFormat(), copy.GetFormat());
+  EXPECT_EQ(rule.GetLatinFormat(), copy.GetLatinFormat());
   EXPECT_EQ(rule.GetId(), copy.GetId());
   EXPECT_EQ(rule.GetRequired(), copy.GetRequired());
   EXPECT_EQ(rule.GetSubKeys(), copy.GetSubKeys());
@@ -115,6 +118,15 @@ TEST(RuleTest, ParsesFormatCorrectly) {
   EXPECT_EQ(expected, rule.GetFormat());
 }
 
+TEST(RuleTest, ParsesLatinFormatCorrectly) {
+  std::vector<AddressField> expected;
+  expected.push_back(LOCALITY);
+  expected.push_back(ADMIN_AREA);
+  Rule rule;
+  ASSERT_TRUE(rule.ParseSerializedRule("{\"lfmt\":\"%C%S\"}"));
+  EXPECT_EQ(expected, rule.GetLatinFormat());
+}
+
 TEST(RuleTest, ParsesRequiredCorrectly) {
   std::vector<AddressField> expected;
   expected.push_back(STREET_ADDRESS);
author	rouslan@chromium.org <rouslan@chromium.org@38ededc0-08b8-5190-f2ac-b31f878777ad>	2014-05-14 07:47:13 +0000
committer	rouslan@chromium.org <rouslan@chromium.org@38ededc0-08b8-5190-f2ac-b31f878777ad>	2014-05-14 07:47:13 +0000
commit	947ce24396bbb66ec5fcd14a73f85f4e32c3dcc0 (patch)
tree	5690ec7daed768b42524fe15b0193afdc4353b67 /cpp
parent	c9cd8929fc3b9ff6918b9246220ea645cae78185 (diff)
download	src-947ce24396bbb66ec5fcd14a73f85f4e32c3dcc0.tar.gz