diff options
Diffstat (limited to 'cpp/src/language.cc')
-rw-r--r-- | cpp/src/language.cc | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/cpp/src/language.cc b/cpp/src/language.cc new file mode 100644 index 0000000..9e456f0 --- /dev/null +++ b/cpp/src/language.cc @@ -0,0 +1,102 @@ +// Copyright (C) 2014 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "language.h" + +#include <algorithm> +#include <cctype> +#include <string> +#include <vector> + +#include "rule.h" +#include "util/string_split.h" + +namespace i18n { +namespace addressinput { + +Language::Language(const std::string& language_tag) : tag(language_tag), + base(), + has_latin_script(false) { + // Character '-' is the separator for subtags in the BCP 47. However, some + // legacy code generates tags with '_' instead of '-'. + static const char kSubtagsSeparator = '-'; + static const char kAlternativeSubtagsSeparator = '_'; + std::replace(tag.begin(), tag.end(), kAlternativeSubtagsSeparator, + kSubtagsSeparator); + + // OK to use 'tolower' because BCP 47 tags are always in ASCII. + std::string lowercase = tag; + std::transform(lowercase.begin(), lowercase.end(), lowercase.begin(), + tolower); + + base = lowercase.substr(0, lowercase.find(kSubtagsSeparator)); + + // The lowercase BCP 47 subtag for Latin script. + static const char kLowercaseLatinScript[] = "latn"; + std::vector<std::string> subtags; + SplitString(lowercase, kSubtagsSeparator, &subtags); + + // Support only the second and third position for the script. + has_latin_script = + (subtags.size() > 1 && subtags[1] == kLowercaseLatinScript) || + (subtags.size() > 2 && subtags[2] == kLowercaseLatinScript); +} + +Language::~Language() {} + +Language ChooseBestAddressLanguage(const Rule& address_region_rule, + const Language& ui_language) { + if (address_region_rule.GetLanguages().empty()) { + return ui_language; + } + + std::vector<Language> available_languages; + for (std::vector<std::string>::const_iterator + language_tag_it = address_region_rule.GetLanguages().begin(); + language_tag_it != address_region_rule.GetLanguages().end(); + ++language_tag_it) { + available_languages.push_back(Language(*language_tag_it)); + } + + if (ui_language.tag.empty()) { + return available_languages.front(); + } + + bool has_latin_format = !address_region_rule.GetLatinFormat().empty(); + + // The conventionally formatted BCP 47 Latin script with a preceding subtag + // separator. + static const char kLatinScriptSuffix[] = "-Latn"; + Language latin_script_language( + available_languages.front().base + kLatinScriptSuffix); + if (has_latin_format && ui_language.has_latin_script) { + return latin_script_language; + } + + for (std::vector<Language>::const_iterator + available_lang_it = available_languages.begin(); + available_lang_it != available_languages.end(); ++available_lang_it) { + // Base language comparison works because no region supports the same base + // language with different scripts, for now. For example, no region supports + // "zh-Hant" and "zh-Hans" at the same time. + if (ui_language.base == available_lang_it->base) { + return *available_lang_it; + } + } + + return has_latin_format ? latin_script_language : available_languages.front(); +} + +} // namespace addressinput +} // namespace i18n |