diff options
author | Fredrik Roubert <roubert@google.com> | 2014-06-18 03:31:22 +0000 |
---|---|---|
committer | Fredrik Roubert <roubert@google.com> | 2014-09-01 19:20:48 +0200 |
commit | cd1c048abc4516c9e69ee0ce03d3cc6788c8e44a (patch) | |
tree | 1f9b8a53562efeba8172affd7eca31e559a2d455 | |
parent | 9a6398885fdb692c6ff628b767a45a5bb0994107 (diff) | |
download | src-cd1c048abc4516c9e69ee0ce03d3cc6788c8e44a.tar.gz |
More natural language friendly string matching in PreloadSupplier.
This is done by updating the StringCompare helper class to also provide
a comparison method that conforms to the STL requirements for less<>
predicates, and then wrapping this as an STL predicate.
(A portable default implementation of this is done through "clever" use
of RE2 to get UTF-8 capable string comparison. Users of the library can
override this with better/faster implementations.)
This will make PreloadSupplier able to match "ZÜRICH" and "Zürich". Yay!
;-)
R=rouslan@chromium.org
BUG=
Review URL: https://codereview.appspot.com/101340044
-rw-r--r-- | cpp/include/libaddressinput/preload_supplier.h | 4 | ||||
-rw-r--r-- | cpp/src/preload_supplier.cc | 39 | ||||
-rw-r--r-- | cpp/src/util/string_compare.cc | 52 | ||||
-rw-r--r-- | cpp/src/util/string_compare.h | 6 | ||||
-rw-r--r-- | cpp/test/address_validator_test.cc | 2 | ||||
-rw-r--r-- | cpp/test/util/string_compare_test.cc | 33 |
6 files changed, 115 insertions, 21 deletions
diff --git a/cpp/include/libaddressinput/preload_supplier.h b/cpp/include/libaddressinput/preload_supplier.h index 52a5e7c..740b4d0 100644 --- a/cpp/include/libaddressinput/preload_supplier.h +++ b/cpp/include/libaddressinput/preload_supplier.h @@ -20,7 +20,6 @@ #include <libaddressinput/util/basictypes.h> #include <libaddressinput/util/scoped_ptr.h> -#include <map> #include <set> #include <string> #include <vector> @@ -29,6 +28,7 @@ namespace i18n { namespace addressinput { class Downloader; +class IndexMap; class LookupKey; class Retriever; class Rule; @@ -91,7 +91,7 @@ class PreloadSupplier : public Supplier { const scoped_ptr<const Retriever> retriever_; std::set<std::string> pending_; - std::map<std::string, const Rule*> rule_index_; + const scoped_ptr<IndexMap> rule_index_; std::vector<const Rule*> rule_storage_; DISALLOW_COPY_AND_ASSIGN(PreloadSupplier); diff --git a/cpp/src/preload_supplier.cc b/cpp/src/preload_supplier.cc index e4b223f..1a1ecf6 100644 --- a/cpp/src/preload_supplier.cc +++ b/cpp/src/preload_supplier.cc @@ -24,6 +24,7 @@ #include <algorithm> #include <cassert> #include <cstddef> +#include <functional> #include <map> #include <set> #include <stack> @@ -37,12 +38,36 @@ #include "retriever.h" #include "rule.h" #include "util/json.h" +#include "util/string_compare.h" namespace i18n { namespace addressinput { namespace { +// STL predicate less<> that uses StringCompare to match strings that a human +// reader would consider to be "the same". The default implementation just does +// case insensitive string comparison, but StringCompare can be overriden with +// more sophisticated implementations. +class IndexLess : public std::binary_function<std::string, std::string, bool> { + public: + result_type operator()(const first_argument_type& a, + const second_argument_type& b) const { + return kStringCompare.NaturalLess(a, b); + } + + private: + static const StringCompare kStringCompare; +}; + +const StringCompare IndexLess::kStringCompare; + +} // namespace + +class IndexMap : public std::map<std::string, const Rule*, IndexLess> {}; + +namespace { + class Helper { public: // Does not take ownership of its parameters. @@ -51,7 +76,7 @@ class Helper { const PreloadSupplier::Callback& loaded, const Retriever& retriever, std::set<std::string>* pending, - std::map<std::string, const Rule*>* rule_index, + IndexMap* rule_index, std::vector<const Rule*>* rule_storage) : region_code_(region_code), loaded_(loaded), @@ -207,7 +232,7 @@ class Helper { const std::string region_code_; const PreloadSupplier::Callback& loaded_; std::set<std::string>* const pending_; - std::map<std::string, const Rule*>* const rule_index_; + IndexMap* const rule_index_; std::vector<const Rule*>* const rule_storage_; const scoped_ptr<const Retriever::Callback> retrieved_; @@ -229,7 +254,7 @@ PreloadSupplier::PreloadSupplier(const std::string& validation_data_url, Storage* storage) : retriever_(new Retriever(validation_data_url, downloader, storage)), pending_(), - rule_index_(), + rule_index_(new IndexMap), rule_storage_() {} PreloadSupplier::~PreloadSupplier() { @@ -274,7 +299,7 @@ void PreloadSupplier::LoadRules(const std::string& region_code, loaded, *retriever_, &pending_, - &rule_index_, + rule_index_.get(), &rule_storage_); } @@ -298,8 +323,8 @@ bool PreloadSupplier::GetRuleHierarchy(const LookupKey& lookup_key, for (size_t depth = 0; depth <= max_depth; ++depth) { const std::string& key = lookup_key.ToKeyString(depth); std::map<std::string, const Rule*>::const_iterator it = - rule_index_.find(key); - if (it == rule_index_.end()) { + rule_index_->find(key); + if (it == rule_index_->end()) { return depth > 0; // No data on COUNTRY level is failure. } hierarchy->rule[depth] = it->second; @@ -310,7 +335,7 @@ bool PreloadSupplier::GetRuleHierarchy(const LookupKey& lookup_key, } bool PreloadSupplier::IsLoadedKey(const std::string& key) const { - return rule_index_.find(key) != rule_index_.end(); + return rule_index_->find(key) != rule_index_->end(); } bool PreloadSupplier::IsPendingKey(const std::string& key) const { diff --git a/cpp/src/util/string_compare.cc b/cpp/src/util/string_compare.cc index c63b138..31a7534 100644 --- a/cpp/src/util/string_compare.cc +++ b/cpp/src/util/string_compare.cc @@ -12,20 +12,54 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "re2ptr.h" // Must be the first #include statement! - #include "string_compare.h" #include <libaddressinput/util/basictypes.h> +#include <cassert> #include <string> +#include <re2/re2.h> + +#include "lru_cache_using_std.h" + +// RE2 uses type string, which is not necessarily the same as type std::string. +// In order to create objects of the correct type, to be able to pass pointers +// to these objects to RE2, the function that does that is defined inside an +// unnamed namespace inside the re2 namespace. Oh, my ... +namespace re2 { +namespace { + +// In order to (mis-)use RE2 to implement UTF-8 capable less<>, this function +// calls RE2::PossibleMatchRange() to calculate the "lessest" string that would +// be a case-insensitive match to the string. This is far too expensive to do +// repeatedly, so the function is only ever called through an LRU cache. +std::string ComputeMinPossibleMatch(const std::string& str) { + string min, max; // N.B.: RE2 type string! + + RE2::Options options; + options.set_literal(true); + options.set_case_sensitive(false); + RE2 matcher(str, options); + + bool success = matcher.PossibleMatchRange(&min, &max, str.size()); + assert(success); + (void)success; // Prevent unused variable if assert() is optimized away. + + return min; +} + +} // namespace +} // namespace re2 + namespace i18n { namespace addressinput { class StringCompare::Impl { + enum { MAX_CACHE_SIZE = 1 << 15 }; + public: - Impl() { + Impl() : min_possible_match_(&re2::ComputeMinPossibleMatch, MAX_CACHE_SIZE) { options_.set_literal(true); options_.set_case_sensitive(false); } @@ -37,8 +71,15 @@ class StringCompare::Impl { return RE2::FullMatch(a, matcher); } + bool NaturalLess(const std::string& a, const std::string& b) const { + const std::string& min_a(min_possible_match_(a)); + const std::string& min_b(min_possible_match_(b)); + return min_a < min_b; + } + private: RE2::Options options_; + mutable lru_cache_using_std<std::string, std::string> min_possible_match_; DISALLOW_COPY_AND_ASSIGN(Impl); }; @@ -52,5 +93,10 @@ bool StringCompare::NaturalEquals(const std::string& a, return impl_->NaturalEquals(a, b); } +bool StringCompare::NaturalLess(const std::string& a, + const std::string& b) const { + return impl_->NaturalLess(a, b); +} + } // namespace addressinput } // namespace i18n diff --git a/cpp/src/util/string_compare.h b/cpp/src/util/string_compare.h index 9d530fa..ae680dd 100644 --- a/cpp/src/util/string_compare.h +++ b/cpp/src/util/string_compare.h @@ -33,6 +33,12 @@ class StringCompare { // default implementation just does case insensitive string matching. bool NaturalEquals(const std::string& a, const std::string& b) const; + // Comparison function for use with the STL analogous to NaturalEquals(). + // Libaddressinput itself isn't really concerned about how this is done, as + // long as it conforms to the STL requirements on less<> predicates. This + // default implementation is VERY SLOW! Must be replaced if you need speed. + bool NaturalLess(const std::string& a, const std::string& b) const; + private: class Impl; scoped_ptr<Impl> impl_; diff --git a/cpp/test/address_validator_test.cc b/cpp/test/address_validator_test.cc index 201b823..5703a37 100644 --- a/cpp/test/address_validator_test.cc +++ b/cpp/test/address_validator_test.cc @@ -353,7 +353,7 @@ TEST_P(AddressValidatorTest, ValidLatinAddressJP) { if (GetParam() == &OndemandValidatorWrapper::Build) return; address_.region_code = "JP"; - address_.administrative_area = "TOKUSHIMA"; + address_.administrative_area = "Tokushima"; address_.locality = "Tokushima"; address_.postal_code = "770-0847"; address_.address_line.push_back("..."); diff --git a/cpp/test/util/string_compare_test.cc b/cpp/test/util/string_compare_test.cc index d5990d9..8f8d4d5 100644 --- a/cpp/test/util/string_compare_test.cc +++ b/cpp/test/util/string_compare_test.cc @@ -25,14 +25,19 @@ using i18n::addressinput::StringCompare; struct TestCase { TestCase(const std::string& left, const std::string& right, - bool should_be_equal) - : left(left), right(right), should_be_equal(should_be_equal) {} + bool should_be_equal, + bool should_be_less) + : left(left), + right(right), + should_be_equal(should_be_equal), + should_be_less(should_be_less) {} ~TestCase() {} std::string left; std::string right; bool should_be_equal; + bool should_be_less; }; class StringCompareTest : public testing::TestWithParam<TestCase> { @@ -48,13 +53,25 @@ TEST_P(StringCompareTest, CorrectComparison) { } } +TEST_P(StringCompareTest, CorrectLess) { + if (GetParam().should_be_less) { + EXPECT_TRUE(compare_.NaturalLess(GetParam().left, GetParam().right)); + } else { + EXPECT_FALSE(compare_.NaturalLess(GetParam().left, GetParam().right)); + } +} + INSTANTIATE_TEST_CASE_P( Comparisons, StringCompareTest, - testing::Values(TestCase("foo", "foo", true), - TestCase("foo", "FOO", true), - TestCase("bar", "foo", false), - TestCase("강원도", "강원도", true), - TestCase("강원도", "대구광역시", false), - TestCase("ZÜRICH", "zürich", true))); + testing::Values(TestCase("foo", "foo", true, false), + TestCase("foo", "FOO", true, false), + TestCase("bar", "foo", false, true), + TestCase("강원도", "강원도", true, false), + TestCase("강원도", "대구광역시", false, true), + TestCase("ZÜRICH", "zürich", true, false), + TestCase("абв", "где", false, true), + TestCase("абв", "ГДЕ", false, true), + TestCase("где", "абв", false, false), + TestCase("где", "АБВ", false, false))); } // namespace |