14 files changed, 440 insertions, 41 deletions
diff --git a/cpp/include/libaddressinput/address_ui.h b/cpp/include/libaddressinput/address_ui.h
index 35d2ede..8939501 100644
--- a/cpp/include/libaddressinput/address_ui.h
+++ b/cpp/include/libaddressinput/address_ui.h
@@ -28,10 +28,19 @@ struct AddressUiComponent;
 const std::vector<std::string>& GetRegionCodes();
 
 // Returns the UI components for the CLDR |region_code|. Uses the strings from
-// |localization|. Returns an empty vector on error.
+// |localization|. The components can be in default or Latin order, depending on
+// the language of |localization|.
+//
+// Sets |best_address_language_tag| to the BCP 47 language tag that should be
+// saved with this address. This language will be used to get drop-downs to
+// help users fill in their address, and to format the address that the user
+// entered. The parameter must not be NULL.
+//
+// Returns an empty vector on error.
 std::vector<AddressUiComponent> BuildComponents(
     const std::string& region_code,
-    const Localization& localization);
+    const Localization& localization,
+    std::string* best_address_language_tag);
 
 }  // namespace addressinput
 }  // namespace i18n
diff --git a/cpp/include/libaddressinput/localization.h b/cpp/include/libaddressinput/localization.h
index 6b599c3..77a1917 100644
--- a/cpp/include/libaddressinput/localization.h
+++ b/cpp/include/libaddressinput/localization.h
@@ -23,12 +23,14 @@ namespace addressinput {
 // The object to retrieve localized strings based on message IDs. Sample usage:
 //    Localization localization;
 //    localization.SetLanguage("en");
-//    Process(BuildComponents("CA", localization));
+//    std::string best_language_tag;
+//    Process(BuildComponents("CA", localization, &best_language_tag));
 //
 // Alternative usage:
 //    Localization localization;
-//    localization.SetGetter(&MyStringGetter);
-//    Process(BuildComponents("CA", localization));
+//    localization.SetGetter(&MyStringGetter, "fr");
+//    std::string best_language_tag;
+//    Process(BuildComponents("CA", localization, &best_language_tag));
 class Localization {
  public:
   // Initializes with English messages by default.
@@ -41,15 +43,22 @@ class Localization {
 
   // Sets the language for the strings. The only supported language is "en"
   // until we have translations.
-  void SetLanguage(const std::string& language_code);
+  void SetLanguage(const std::string& language_tag);
 
   // Sets the string getter that takes a message identifier and returns the
-  // corresponding localized string.
-  void SetGetter(std::string (*getter)(int));
+  // corresponding localized string. The |language_tag| is only recorded for
+  // GetLanguage(); it does not change which strings |getter| returns.
+  void SetGetter(std::string (*getter)(int), const std::string& language_tag);
+
+  // Returns the current language tag.
+  const std::string& GetLanguage() const { return language_tag_; }
 
  private:
   // The string getter.
   std::string (*get_string_)(int);
+
+  // The current language tag.
+  std::string language_tag_;
 };
 
 }  // namespace addressinput
diff --git a/cpp/libaddressinput.gyp b/cpp/libaddressinput.gyp
index e57bda0..b4ab884 100644
--- a/cpp/libaddressinput.gyp
+++ b/cpp/libaddressinput.gyp
@@ -37,6 +37,7 @@
         'src/address_problem.cc',
         'src/address_ui.cc',
         'src/address_validator.cc',
+        'src/language.cc',
         'src/localization.cc',
         'src/lookup_key.cc',
         'src/lookup_key_util.cc',
@@ -75,6 +76,7 @@
         'test/fake_downloader_test.cc',
         'test/fake_storage.cc',
         'test/fake_storage_test.cc',
+        'test/language_test.cc',
         'test/localization_test.cc',
         'test/lookup_key_test.cc',
         'test/lookup_key_util_test.cc',
diff --git a/cpp/src/address_ui.cc b/cpp/src/address_ui.cc
index 15f3c61..0eb28fc 100644
--- a/cpp/src/address_ui.cc
+++ b/cpp/src/address_ui.cc
@@ -18,12 +18,15 @@
 #include <libaddressinput/address_ui_component.h>
 #include <libaddressinput/localization.h>
 
+#include <cassert>
+#include <cstddef>
 #include <set>
 #include <string>
 #include <vector>
 
 #include "address_field_util.h"
 #include "grit.h"
+#include "language.h"
 #include "messages.h"
 #include "region_data_constants.h"
 #include "rule.h"
@@ -33,6 +36,41 @@ namespace addressinput {
 
 namespace {
 
+// Picks the language to use for an address, given the |available_languages|,
+// whether a Latin format exists (|has_latin_format|), and the |ui_language|.
+// Falls back to |ui_language| when |available_languages| is empty.
+Language ChooseBestAddressLanguage(
+    const std::vector<Language>& available_languages,
+    bool has_latin_format,
+    const Language& ui_language) {
+  // Nothing to choose from: fall back to the UI language itself.
+  if (available_languages.empty()) {
+    return ui_language;
+  }
+
+  const Language& default_language = available_languages.front();
+  if (ui_language.tag.empty()) {
+    return default_language;
+  }
+
+  // The first available language combined with the conventionally formatted
+  // BCP 47 Latin script subtag (including the preceding separator).
+  const Language latin_variant(default_language.base + "-Latn");
+  if (has_latin_format && ui_language.has_latin_script) return latin_variant;
+
+  // Base language comparison works because no region supports the same base
+  // language with different scripts, for now. For example, no region supports
+  // "zh-Hant" and "zh-Hans" at the same time.
+  for (std::size_t i = 0; i < available_languages.size(); ++i) {
+    const Language& candidate = available_languages[i];
+    if (candidate.base == ui_language.base) return candidate;
+  }
+
+  // No available language shares the UI language's base.
+  if (has_latin_format) return latin_variant;
+  return default_language;
+}
+
 int GetMessageIdForField(AddressField field,
                          int admin_area_name_message_id,
                          int postal_code_name_message_id) {
@@ -71,7 +109,9 @@ const std::vector<std::string>& GetRegionCodes() {
 
 std::vector<AddressUiComponent> BuildComponents(
     const std::string& region_code,
-    const Localization& localization) {
+    const Localization& localization,
+    std::string* best_address_language_tag) {
+  assert(best_address_language_tag != NULL);
   std::vector<AddressUiComponent> result;
 
   Rule rule;
@@ -81,15 +121,31 @@ std::vector<AddressUiComponent> BuildComponents(
     return result;
   }
 
+  std::vector<Language> available_languages;
+  for (std::vector<std::string>::const_iterator language_tag_it =
+       rule.GetLanguages().begin();
+       language_tag_it != rule.GetLanguages().end(); ++language_tag_it) {
+    available_languages.push_back(Language(*language_tag_it));
+  }
+
+  const Language& best_address_language = ChooseBestAddressLanguage(
+      available_languages, !rule.GetLatinFormat().empty(),
+      Language(localization.GetLanguage()));
+  *best_address_language_tag = best_address_language.tag;
+
+  const std::vector<AddressField>& format =
+      !rule.GetLatinFormat().empty() &&
+      best_address_language.has_latin_script
+          ? rule.GetLatinFormat() : rule.GetFormat();
+
   // For avoiding showing an input field twice, when the field is displayed
   // twice on an envelope.
   std::set<AddressField> fields;
 
   bool previous_field_is_newline = true;
   bool next_field_is_newline = true;
-  for (std::vector<AddressField>::const_iterator field_it =
-       rule.GetFormat().begin();
-       field_it != rule.GetFormat().end(); ++field_it) {
+  for (std::vector<AddressField>::const_iterator field_it = format.begin();
+       field_it != format.end(); ++field_it) {
     if (IsNewline(*field_it)) {
       previous_field_is_newline = true;
       continue;
diff --git a/cpp/src/language.cc b/cpp/src/language.cc
new file mode 100644
index 0000000..5c212dd
--- /dev/null
+++ b/cpp/src/language.cc
@@ -0,0 +1,58 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "language.h"
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+#include <vector>
+
+#include "util/string_split.h"
+
+namespace i18n {
+namespace addressinput {
+
+Language::Language(const std::string& language_tag) : tag(language_tag),
+                                                      base(),
+                                                      has_latin_script(false) {
+  // Character '-' is the separator for subtags in the BCP 47. However, some
+  // legacy code generates tags with '_' instead of '-'.
+  static const char kSubtagsSeparator = '-';
+  static const char kAlternativeSubtagsSeparator = '_';
+  std::replace(tag.begin(), tag.end(), kAlternativeSubtagsSeparator,
+               kSubtagsSeparator);
+
+  // Fold only ASCII letters: BCP 47 tags are ASCII, and unlike 'tolower' this
+  // is locale-independent and well defined for bytes outside the ASCII range.
+  std::string lowercase = tag;
+  for (std::string::size_type i = 0; i < lowercase.size(); ++i) {
+    if (lowercase[i] >= 'A' && lowercase[i] <= 'Z') lowercase[i] += 'a' - 'A';
+  }
+  base = lowercase.substr(0, lowercase.find(kSubtagsSeparator));
+
+  // The lowercase BCP 47 subtag for Latin script.
+  static const char kLowercaseLatinScript[] = "latn";
+  std::vector<std::string> subtags;
+  SplitString(lowercase, kSubtagsSeparator, &subtags);
+  // Support only the second and third position for the script.
+  has_latin_script =
+      (subtags.size() > 1 && subtags[1] == kLowercaseLatinScript) ||
+      (subtags.size() > 2 && subtags[2] == kLowercaseLatinScript);
+}
+
+Language::~Language() {}
+
+}  // namespace addressinput
+}  // namespace i18n
diff --git a/cpp/src/language.h b/cpp/src/language.h
new file mode 100644
index 0000000..f2cc447
--- /dev/null
+++ b/cpp/src/language.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef I18N_ADDRESSINPUT_LANGUAGE_H_
+#define I18N_ADDRESSINPUT_LANGUAGE_H_
+
+#include <string>
+
+namespace i18n {
+namespace addressinput {
+
+// Helper for working with a BCP 47 language tag.
+// http://tools.ietf.org/html/bcp47
+struct Language {
+  // Parses |language_tag|; '_' is accepted as a subtag separator besides '-'.
+  explicit Language(const std::string& language_tag);
+  ~Language();
+
+  // The language tag (with '_' replaced with '-'), for example "zh-Latn-CN".
+  std::string tag;
+
+  // The base language (the first subtag), for example "zh". Always lowercase.
+  std::string base;
+
+  // True if the second or third subtag is the Latin script, ignoring case. For
+  // example, this is true for "zh-Latn", but false for "zh".
+  bool has_latin_script;
+};
+
+}  // namespace addressinput
+}  // namespace i18n
+
+#endif  // I18N_ADDRESSINPUT_LANGUAGE_H_
diff --git a/cpp/src/localization.cc b/cpp/src/localization.cc
index 3fd32af..1558b23 100644
--- a/cpp/src/localization.cc
+++ b/cpp/src/localization.cc
@@ -23,7 +23,9 @@ namespace addressinput {
 
 namespace {
 
-// For each language code XX with translations:
+static const char kDefaultLanguage[] = "en";
+
+// For each language XX with translations:
 //    (1) Add a namespace XX here with an include of "XX_messages.cc".
 //    (2) Add a wrapper that converts the char pointer to std::string. (GRIT
 //        generated functions return char pointers.)
@@ -41,7 +43,8 @@ std::string GetStdString(int message_id) {
 
 }  // namespace
 
-Localization::Localization() : get_string_(&en::GetStdString) {}
+Localization::Localization() : get_string_(&en::GetStdString),
+                               language_tag_(kDefaultLanguage) {}
 
 Localization::~Localization() {}
 
@@ -49,17 +52,20 @@ std::string Localization::GetString(int message_id) const {
   return get_string_(message_id);
 }
 
-void Localization::SetLanguage(const std::string& language_code) {
-  if (language_code == "en") {
+void Localization::SetLanguage(const std::string& language_tag) {
+  if (language_tag == kDefaultLanguage) {
     get_string_ = &en::GetStdString;
   } else {
     assert(false);
   }
+  language_tag_ = language_tag;
 }
 
-void Localization::SetGetter(std::string (*getter)(int)) {
+void Localization::SetGetter(std::string (*getter)(int),
+                             const std::string& language_tag) {
   assert(getter != NULL);
   get_string_ = getter;
+  language_tag_ = language_tag;
 }
 
 }  // namespace addressinput
diff --git a/cpp/src/post_box_matchers.cc b/cpp/src/post_box_matchers.cc
index fd0602a..95ee375 100644
--- a/cpp/src/post_box_matchers.cc
+++ b/cpp/src/post_box_matchers.cc
@@ -16,14 +16,13 @@
 
 #include "post_box_matchers.h"
 
-#include <algorithm>
-#include <cctype>
 #include <cstddef>
 #include <map>
 #include <string>
 #include <utility>
 #include <vector>
 
+#include "language.h"
 #include "rule.h"
 
 namespace i18n {
@@ -97,15 +96,6 @@ std::map<std::string, const RE2ptr*> InitMatchers() {
   return matchers;
 }
 
-std::string GetBaseLanguage(const std::string& language) {
-  // Be lenient in parsing, allow underscore separators and uppercase letters.
-  std::string::size_type end = language.find_first_of("-_");
-  std::string base(
-      end == std::string::npos ? language : language.substr(0, end));
-  std::transform(base.begin(), base.end(), base.begin(), tolower);
-  return base;
-}
-
 } // namespace
 
 // static
@@ -118,7 +108,8 @@ std::vector<const RE2ptr*> PostBoxMatchers::GetMatchers(
   for (std::vector<std::string>::const_iterator
        it = country_rule.GetLanguages().begin();
        it != country_rule.GetLanguages().end(); ++it) {
-    languages.push_back(GetBaseLanguage(*it));
+    Language language(*it);
+    languages.push_back(language.base);
   }
 
   std::vector<const RE2ptr*> result;
diff --git a/cpp/src/rule.cc b/cpp/src/rule.cc
index ebd5be5..48ec3dc 100644
--- a/cpp/src/rule.cc
+++ b/cpp/src/rule.cc
@@ -40,10 +40,11 @@ typedef std::map<std::string, int> NameMessageIdMap;
 const char kAdminAreaNameTypeKey[] = "state_name_type";
 const char kFormatKey[] = "fmt";
 const char kIdKey[] = "id";
+const char kLanguagesKey[] = "languages";
+const char kLatinFormatKey[] = "lfmt";
 const char kPostalCodeNameTypeKey[] = "zip_name_type";
 const char kRequireKey[] = "require";
 const char kSubKeysKey[] = "sub_keys";
-const char kLanguagesKey[] = "languages";
 const char kZipKey[] = "zip";
 
 // Used as a separator in a list of items. For example, the list of supported
@@ -108,6 +109,7 @@ int GetMessageIdFromName(const std::string& name,
 Rule::Rule()
     : id_(),
       format_(),
+      latin_format_(),
       required_(),
       sub_keys_(),
       languages_(),
@@ -132,6 +134,7 @@ const Rule& Rule::GetDefault() {
 void Rule::CopyFrom(const Rule& rule) {
   id_ = rule.id_;
   format_ = rule.format_;
+  latin_format_ = rule.latin_format_;
   required_ = rule.required_;
   sub_keys_ = rule.sub_keys_;
   languages_ = rule.languages_;
@@ -158,6 +161,11 @@ bool Rule::ParseSerializedRule(const std::string& serialized_rule) {
     ParseAddressFieldsFormat(json.GetStringValueForKey(kFormatKey), &format_);
   }
 
+  if (json.HasStringValueForKey(kLatinFormatKey)) {
+    ParseAddressFieldsFormat(
+        json.GetStringValueForKey(kLatinFormatKey), &latin_format_);
+  }
+
   if (json.HasStringValueForKey(kRequireKey)) {
     ParseAddressFieldsRequired(
         json.GetStringValueForKey(kRequireKey), &required_);
diff --git a/cpp/src/rule.h b/cpp/src/rule.h
index dbba9f1..bd74784 100644
--- a/cpp/src/rule.h
+++ b/cpp/src/rule.h
@@ -58,10 +58,16 @@ class Rule {
   // Returns the ID string for this rule.
   const std::string& GetId() const { return id_; };
 
-  // Returns the address format for this rule. The format can include the
-  // NEWLINE extension for AddressField enum.
+  // Returns the approximate address format with the default order of fields.
+  // The format can include the NEWLINE extension for AddressField enum.
   const std::vector<AddressField>& GetFormat() const { return format_; }
 
+  // Returns the approximate address format with the Latin order of fields. The
+  // format can include the NEWLINE extension for AddressField enum.
+  const std::vector<AddressField>& GetLatinFormat() const {
+    return latin_format_;
+  }
+
   // Returns the required fields for this rule.
   const std::vector<AddressField>& GetRequired() const { return required_; }
 
@@ -71,8 +77,8 @@ class Rule {
   // "CA", "NY", "TX", etc.
   const std::vector<std::string>& GetSubKeys() const { return sub_keys_; }
 
-  // Returns all of the language codes for which this rule has custom rules, for
-  // example ["de", "fr", "it"].
+  // Returns all of the language tags supported by this rule, for example ["de",
+  // "fr", "it"].
   const std::vector<std::string>& GetLanguages() const { return languages_; }
 
   // Returns a pointer to a RE2 regular expression object created from the
@@ -97,6 +103,7 @@ class Rule {
  private:
   std::string id_;
   std::vector<AddressField> format_;
+  std::vector<AddressField> latin_format_;
   std::vector<AddressField> required_;
   std::vector<std::string> sub_keys_;
   std::vector<std::string> languages_;
diff --git a/cpp/test/address_ui_test.cc b/cpp/test/address_ui_test.cc
index b8cb834..b36ea8c 100644
--- a/cpp/test/address_ui_test.cc
+++ b/cpp/test/address_ui_test.cc
@@ -26,13 +26,16 @@
 
 namespace {
 
+using i18n::addressinput::ADMIN_AREA;
 using i18n::addressinput::AddressField;
 using i18n::addressinput::AddressUiComponent;
 using i18n::addressinput::BuildComponents;
 using i18n::addressinput::COUNTRY;
 using i18n::addressinput::GetRegionCodes;
 using i18n::addressinput::Localization;
+using i18n::addressinput::POSTAL_CODE;
 using i18n::addressinput::RECIPIENT;
+using i18n::addressinput::STREET_ADDRESS;
 
 // Returns testing::AssertionSuccess if the |components| are valid. Uses
 // |region_code| in test failure messages.
@@ -66,6 +69,7 @@ testing::AssertionResult ComponentsAreValid(
 class AddressUiTest : public testing::TestWithParam<std::string> {
  protected:
   Localization localization_;
+  std::string best_address_language_tag_;
 };
 
 // Verifies that a region code consists of two characters, for example "TW".
@@ -76,14 +80,15 @@ TEST_P(AddressUiTest, RegionCodeHasTwoCharacters) {
 // Verifies that BuildComponents() returns valid UI components for a region
 // code.
 TEST_P(AddressUiTest, ComponentsAreValid) {
-  EXPECT_TRUE(ComponentsAreValid(BuildComponents(GetParam(), localization_)));
+  EXPECT_TRUE(ComponentsAreValid(BuildComponents(
+      GetParam(), localization_, &best_address_language_tag_)));
 }
 
 // Verifies that BuildComponents() returns at most one input field of each type.
 TEST_P(AddressUiTest, UniqueFieldTypes) {
   std::set<AddressField> fields;
   const std::vector<AddressUiComponent>& components =
-      BuildComponents(GetParam(), localization_);
+      BuildComponents(GetParam(), localization_, &best_address_language_tag_);
   for (std::vector<AddressUiComponent>::const_iterator it = components.begin();
        it != components.end(); ++it) {
     EXPECT_TRUE(fields.insert(it->field).second);
@@ -98,7 +103,130 @@ INSTANTIATE_TEST_CASE_P(
 // Verifies that BuildComponents() returns an empty vector for an invalid region
 // code.
 TEST_F(AddressUiTest, InvalidRegionCodeReturnsEmptyVector) {
-  EXPECT_TRUE(BuildComponents("INVALID-REGION-CODE", localization_).empty());
+  EXPECT_TRUE(BuildComponents("INVALID-REGION-CODE", localization_,
+                              &best_address_language_tag_).empty());
 }
 
+// Test data for determining the best language tag and whether the right format
+// pattern was used (fmt vs lfmt).
+struct LanguageTestCase {
+  LanguageTestCase(const std::string& region_code,
+                   const std::string& ui_language_tag,
+                   const std::string& expected_best_address_language_tag,
+                   AddressField expected_first_field)
+      : region_code(region_code),
+        ui_language_tag(ui_language_tag),
+        expected_best_address_language_tag(expected_best_address_language_tag),
+        expected_first_field(expected_first_field) {}
+
+  ~LanguageTestCase() {}
+
+  // The CLDR region code to test.
+  const std::string region_code;
+
+  // The BCP 47 language tag to test.
+  const std::string ui_language_tag;
+
+  // The expected value for the best language tag returned by BuildComponents().
+  const std::string expected_best_address_language_tag;
+
+  // The first field expected to be returned from BuildComponents(). Useful for
+  // determining whether the returned format is in Latin or default order.
+  const AddressField expected_first_field;
+};
+
+class BestAddressLanguageTagTest
+    : public testing::TestWithParam<LanguageTestCase> {
+ protected:
+  Localization localization_;
+  std::string best_address_language_tag_;
+};
+
+std::string GetterStub(int) { return std::string(); }
+
+TEST_P(BestAddressLanguageTagTest, CorrectBestAddressLanguageTag) {
+  localization_.SetGetter(&GetterStub, GetParam().ui_language_tag);
+  const std::vector<AddressUiComponent>& components = BuildComponents(
+      GetParam().region_code, localization_, &best_address_language_tag_);
+  EXPECT_EQ(GetParam().expected_best_address_language_tag,
+            best_address_language_tag_);
+  ASSERT_FALSE(components.empty());
+  EXPECT_EQ(GetParam().expected_first_field, components.front().field);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    LanguageTestCases, BestAddressLanguageTagTest,
+    testing::Values(
+        // Armenia supports hy and has a Latin format.
+        LanguageTestCase("AM", "", "hy", RECIPIENT),
+        LanguageTestCase("AM", "hy", "hy", RECIPIENT),
+        LanguageTestCase("AM", "en", "hy-Latn", RECIPIENT),
+
+        // P.R. China supports zh-Hans and has a Latin format.
+        LanguageTestCase("CN", "zh-hans", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-hant", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-hans-CN", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "ZH_HANS", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-cmn-Hans-CN", "zh-Hans", POSTAL_CODE),
+        LanguageTestCase("CN", "zh-Latn", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "zh-latn-CN", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "en", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "ja", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "ko", "zh-Latn", RECIPIENT),
+        LanguageTestCase("CN", "ZH_LATN", "zh-Latn", RECIPIENT),
+        // Libaddressinput does not have information about extended language
+        // subtags, so it uses the zh-Latn language tag for all base languages
+        // that are not zh, even if it's effectively the same language.
+        // Mandarin Chinese, Simplified script, as used in China:
+        LanguageTestCase("CN", "cmn-Hans-CN", "zh-Latn", RECIPIENT),
+
+        // Hong Kong supports zh-Hant and en. It has a Latin format.
+        LanguageTestCase("HK", "zh", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-hans", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-hant", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-yue-HK", "zh-Hant", ADMIN_AREA),
+        LanguageTestCase("HK", "en", "en", ADMIN_AREA),
+        LanguageTestCase("HK", "zh-latn", "zh-Latn", RECIPIENT),
+        LanguageTestCase("HK", "fr", "zh-Latn", RECIPIENT),
+        LanguageTestCase("HK", "ja", "zh-Latn", RECIPIENT),
+        LanguageTestCase("HK", "ko", "zh-Latn", RECIPIENT),
+        // Libaddressinput does not have information about extended language
+        // subtags, so it uses the zh-Latn language tag for all base languages
+        // that are not zh or en, even if it's effectively the same language.
+        // Cantonese Chinese, as used in Hong Kong:
+        LanguageTestCase("HK", "yue-HK", "zh-Latn", RECIPIENT),
+
+        // Macao supports zh-Hant and pt. It has a Latin format.
+        LanguageTestCase("MO", "zh", "zh-Hant", STREET_ADDRESS),
+        LanguageTestCase("MO", "zh-Hant", "zh-Hant", STREET_ADDRESS),
+        LanguageTestCase("MO", "pt", "pt", STREET_ADDRESS),
+        LanguageTestCase("MO", "zh-Latn", "zh-Latn", RECIPIENT),
+        LanguageTestCase("MO", "en", "zh-Latn", RECIPIENT),
+
+        // Switzerland supports de, fr, and it.
+        LanguageTestCase("CH", "de", "de", RECIPIENT),
+        LanguageTestCase("CH", "de-DE", "de", RECIPIENT),
+        LanguageTestCase("CH", "de-Latn-DE", "de", RECIPIENT),
+        LanguageTestCase("CH", "fr", "fr", RECIPIENT),
+        LanguageTestCase("CH", "it", "it", RECIPIENT),
+        LanguageTestCase("CH", "en", "de", RECIPIENT),
+
+        // Antarctica does not have language information.
+        LanguageTestCase("AQ", "en", "en", RECIPIENT),
+        LanguageTestCase("AQ", "fr", "fr", RECIPIENT),
+        LanguageTestCase("AQ", "es", "es", RECIPIENT),
+        LanguageTestCase("AQ", "zh-Hans", "zh-Hans", RECIPIENT),
+
+        // Egypt supports ar and has a Latin format.
+        LanguageTestCase("EG", "ar", "ar", RECIPIENT),
+        LanguageTestCase("EG", "ar-Arab", "ar", RECIPIENT),
+        LanguageTestCase("EG", "ar-Latn", "ar-Latn", RECIPIENT),
+        LanguageTestCase("EG", "fr", "ar-Latn", RECIPIENT),
+        LanguageTestCase("EG", "fa", "ar-Latn", RECIPIENT),
+        // Libaddressinput does not have language-to-script mapping, so it uses
+        // the ar-Latn language tag for all base languages that are not ar, even
+        // if the script is the same.
+        LanguageTestCase("EG", "fa-Arab", "ar-Latn", RECIPIENT)));
+
 }  // namespace
diff --git a/cpp/test/language_test.cc b/cpp/test/language_test.cc
new file mode 100644
index 0000000..197459e
--- /dev/null
+++ b/cpp/test/language_test.cc
@@ -0,0 +1,62 @@
+// Copyright (C) 2014 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "language.h"
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+namespace {
+
+using i18n::addressinput::Language;
+
+struct LanguageTestCase {
+  LanguageTestCase(const std::string& input_language_tag,
+                   const std::string& expected_language_tag,
+                   const std::string& expected_base_language,
+                   bool expected_has_latin_script)
+      : input_language_tag(input_language_tag),
+        expected_language_tag(expected_language_tag),
+        expected_base_language(expected_base_language),
+        expected_has_latin_script(expected_has_latin_script) {}
+
+  ~LanguageTestCase() {}
+
+  const std::string input_language_tag;
+  const std::string expected_language_tag;
+  const std::string expected_base_language;
+  const bool expected_has_latin_script;
+};
+
+class LanguageTest : public testing::TestWithParam<LanguageTestCase> {};
+
+TEST_P(LanguageTest, ExtractedDataIsCorrect) {
+  Language language(GetParam().input_language_tag);
+  EXPECT_EQ(GetParam().expected_language_tag, language.tag);
+  EXPECT_EQ(GetParam().expected_base_language, language.base);
+  EXPECT_EQ(GetParam().expected_has_latin_script, language.has_latin_script);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    LanguageTestCases, LanguageTest,
+    testing::Values(
+        LanguageTestCase("", "", "", false),
+        LanguageTestCase("en", "en", "en", false),
+        LanguageTestCase("zh-Latn-CN", "zh-Latn-CN", "zh", true),
+        LanguageTestCase("zh-cmn-Latn-CN", "zh-cmn-Latn-CN", "zh", true),
+        LanguageTestCase("zh-Hans", "zh-Hans", "zh", false),
+        LanguageTestCase("en_GB", "en-GB", "en", false)));
+
+}  // namespace
diff --git a/cpp/test/localization_test.cc b/cpp/test/localization_test.cc
index 6c8b3be..334068f 100644
--- a/cpp/test/localization_test.cc
+++ b/cpp/test/localization_test.cc
@@ -33,11 +33,13 @@ class LocalizationTest : public testing::TestWithParam<int> {
 };
 
 // Verifies that a custom message getter can be used.
-const char kValidMessage[] = "Data";
+static const char kValidMessage[] = "Data";
+static const char kValidLanguageTag[] = "tlh";
 std::string GetValidMessage(int message_id) { return kValidMessage; }
 TEST_P(LocalizationTest, ValidStringGetterCanBeUsed) {
-  localization_.SetGetter(&GetValidMessage);
+  localization_.SetGetter(&GetValidMessage, kValidLanguageTag);
   EXPECT_EQ(kValidMessage, localization_.GetString(GetParam()));
+  EXPECT_EQ(kValidLanguageTag, localization_.GetLanguage());
 }
 
 // Verifies that the default language for messages does not have empty strings.
@@ -45,8 +47,8 @@ TEST_P(LocalizationTest, DefaultStringIsNotEmpty) {
   EXPECT_FALSE(localization_.GetString(GetParam()).empty());
 }
 
-// Verifies that English is the default language.
-TEST_P(LocalizationTest, EnglishIsDefaultLanguage) {
+// Verifies that the default string is English.
+TEST_P(LocalizationTest, DefaultStringIsEnglish) {
   std::string default_string = localization_.GetString(GetParam());
   localization_.SetLanguage("en");
   EXPECT_EQ(default_string, localization_.GetString(GetParam()));
@@ -80,4 +82,9 @@ TEST_F(LocalizationTest, InvalidMessageIsEmptyString) {
   EXPECT_TRUE(localization_.GetString(INVALID_MESSAGE_ID).empty());
 }
 
+// Verifies that the default language is English.
+TEST_F(LocalizationTest, DefaultLanguageIsEnglish) {
+  EXPECT_EQ("en", localization_.GetLanguage());
+}
+
 }  // namespace
diff --git a/cpp/test/rule_test.cc b/cpp/test/rule_test.cc
index b9e671a..ca54006 100644
--- a/cpp/test/rule_test.cc
+++ b/cpp/test/rule_test.cc
@@ -43,6 +43,7 @@ TEST(RuleTest, CopyOverwritesRule) {
   Rule rule;
   ASSERT_TRUE(rule.ParseSerializedRule("{"
                                        "\"fmt\":\"%S%Z\","
+                                       "\"lfmt\":\"%Z%S\","
                                        "\"id\":\"data/XA\","
                                        "\"lname\":\"Testistan\","
                                        "\"require\":\"AC\","
@@ -55,6 +56,7 @@ TEST(RuleTest, CopyOverwritesRule) {
 
   Rule copy;
   EXPECT_NE(rule.GetFormat(), copy.GetFormat());
+  EXPECT_NE(rule.GetLatinFormat(), copy.GetLatinFormat());
   EXPECT_NE(rule.GetId(), copy.GetId());
   EXPECT_NE(rule.GetRequired(), copy.GetRequired());
   EXPECT_NE(rule.GetSubKeys(), copy.GetSubKeys());
@@ -69,6 +71,7 @@ TEST(RuleTest, CopyOverwritesRule) {
 
   copy.CopyFrom(rule);
   EXPECT_EQ(rule.GetFormat(), copy.GetFormat());
+  EXPECT_EQ(rule.GetLatinFormat(), copy.GetLatinFormat());
   EXPECT_EQ(rule.GetId(), copy.GetId());
   EXPECT_EQ(rule.GetRequired(), copy.GetRequired());
   EXPECT_EQ(rule.GetSubKeys(), copy.GetSubKeys());
@@ -115,6 +118,15 @@ TEST(RuleTest, ParsesFormatCorrectly) {
   EXPECT_EQ(expected, rule.GetFormat());
 }
 
+TEST(RuleTest, ParsesLatinFormatCorrectly) {
+  std::vector<AddressField> expected;
+  expected.push_back(LOCALITY);
+  expected.push_back(ADMIN_AREA);
+  Rule rule;
+  ASSERT_TRUE(rule.ParseSerializedRule("{\"lfmt\":\"%C%S\"}"));
+  EXPECT_EQ(expected, rule.GetLatinFormat());
+}
+
 TEST(RuleTest, ParsesRequiredCorrectly) {
   std::vector<AddressField> expected;
   expected.push_back(STREET_ADDRESS);