aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFredrik Roubert <roubert@google.com>2014-06-18 03:31:22 +0000
committerFredrik Roubert <roubert@google.com>2014-09-01 19:20:48 +0200
commitcd1c048abc4516c9e69ee0ce03d3cc6788c8e44a (patch)
tree1f9b8a53562efeba8172affd7eca31e559a2d455
parent9a6398885fdb692c6ff628b767a45a5bb0994107 (diff)
downloadsrc-cd1c048abc4516c9e69ee0ce03d3cc6788c8e44a.tar.gz
More natural language friendly string matching in PreloadSupplier.
This is done by updating the StringCompare helper class to also provide a comparison method that conforms to the STL requirements for less<> predicates, and then wrapping this as an STL predicate. (A portable default implementation of this is done through "clever" use of RE2 to get UTF-8 capable string comparison. Users of the library can override this with better/faster implementations.) This will make PreloadSupplier able to match "ZÜRICH" and "Zürich". Yay! ;-) R=rouslan@chromium.org BUG= Review URL: https://codereview.appspot.com/101340044
-rw-r--r--cpp/include/libaddressinput/preload_supplier.h4
-rw-r--r--cpp/src/preload_supplier.cc39
-rw-r--r--cpp/src/util/string_compare.cc52
-rw-r--r--cpp/src/util/string_compare.h6
-rw-r--r--cpp/test/address_validator_test.cc2
-rw-r--r--cpp/test/util/string_compare_test.cc33
6 files changed, 115 insertions, 21 deletions
diff --git a/cpp/include/libaddressinput/preload_supplier.h b/cpp/include/libaddressinput/preload_supplier.h
index 52a5e7c..740b4d0 100644
--- a/cpp/include/libaddressinput/preload_supplier.h
+++ b/cpp/include/libaddressinput/preload_supplier.h
@@ -20,7 +20,6 @@
#include <libaddressinput/util/basictypes.h>
#include <libaddressinput/util/scoped_ptr.h>
-#include <map>
#include <set>
#include <string>
#include <vector>
@@ -29,6 +28,7 @@ namespace i18n {
namespace addressinput {
class Downloader;
+class IndexMap;
class LookupKey;
class Retriever;
class Rule;
@@ -91,7 +91,7 @@ class PreloadSupplier : public Supplier {
const scoped_ptr<const Retriever> retriever_;
std::set<std::string> pending_;
- std::map<std::string, const Rule*> rule_index_;
+ const scoped_ptr<IndexMap> rule_index_;
std::vector<const Rule*> rule_storage_;
DISALLOW_COPY_AND_ASSIGN(PreloadSupplier);
diff --git a/cpp/src/preload_supplier.cc b/cpp/src/preload_supplier.cc
index e4b223f..1a1ecf6 100644
--- a/cpp/src/preload_supplier.cc
+++ b/cpp/src/preload_supplier.cc
@@ -24,6 +24,7 @@
#include <algorithm>
#include <cassert>
#include <cstddef>
+#include <functional>
#include <map>
#include <set>
#include <stack>
@@ -37,12 +38,36 @@
#include "retriever.h"
#include "rule.h"
#include "util/json.h"
+#include "util/string_compare.h"
namespace i18n {
namespace addressinput {
namespace {
+// STL predicate less<> that uses StringCompare to match strings that a human
+// reader would consider to be "the same". The default implementation just does
+// case insensitive string comparison, but StringCompare can be overriden with
+// more sophisticated implementations.
+class IndexLess : public std::binary_function<std::string, std::string, bool> {
+ public:
+ result_type operator()(const first_argument_type& a,
+ const second_argument_type& b) const {
+ return kStringCompare.NaturalLess(a, b);
+ }
+
+ private:
+ static const StringCompare kStringCompare;
+};
+
+const StringCompare IndexLess::kStringCompare;
+
+} // namespace
+
+class IndexMap : public std::map<std::string, const Rule*, IndexLess> {};
+
+namespace {
+
class Helper {
public:
// Does not take ownership of its parameters.
@@ -51,7 +76,7 @@ class Helper {
const PreloadSupplier::Callback& loaded,
const Retriever& retriever,
std::set<std::string>* pending,
- std::map<std::string, const Rule*>* rule_index,
+ IndexMap* rule_index,
std::vector<const Rule*>* rule_storage)
: region_code_(region_code),
loaded_(loaded),
@@ -207,7 +232,7 @@ class Helper {
const std::string region_code_;
const PreloadSupplier::Callback& loaded_;
std::set<std::string>* const pending_;
- std::map<std::string, const Rule*>* const rule_index_;
+ IndexMap* const rule_index_;
std::vector<const Rule*>* const rule_storage_;
const scoped_ptr<const Retriever::Callback> retrieved_;
@@ -229,7 +254,7 @@ PreloadSupplier::PreloadSupplier(const std::string& validation_data_url,
Storage* storage)
: retriever_(new Retriever(validation_data_url, downloader, storage)),
pending_(),
- rule_index_(),
+ rule_index_(new IndexMap),
rule_storage_() {}
PreloadSupplier::~PreloadSupplier() {
@@ -274,7 +299,7 @@ void PreloadSupplier::LoadRules(const std::string& region_code,
loaded,
*retriever_,
&pending_,
- &rule_index_,
+ rule_index_.get(),
&rule_storage_);
}
@@ -298,8 +323,8 @@ bool PreloadSupplier::GetRuleHierarchy(const LookupKey& lookup_key,
for (size_t depth = 0; depth <= max_depth; ++depth) {
const std::string& key = lookup_key.ToKeyString(depth);
std::map<std::string, const Rule*>::const_iterator it =
- rule_index_.find(key);
- if (it == rule_index_.end()) {
+ rule_index_->find(key);
+ if (it == rule_index_->end()) {
return depth > 0; // No data on COUNTRY level is failure.
}
hierarchy->rule[depth] = it->second;
@@ -310,7 +335,7 @@ bool PreloadSupplier::GetRuleHierarchy(const LookupKey& lookup_key,
}
bool PreloadSupplier::IsLoadedKey(const std::string& key) const {
- return rule_index_.find(key) != rule_index_.end();
+ return rule_index_->find(key) != rule_index_->end();
}
bool PreloadSupplier::IsPendingKey(const std::string& key) const {
diff --git a/cpp/src/util/string_compare.cc b/cpp/src/util/string_compare.cc
index c63b138..31a7534 100644
--- a/cpp/src/util/string_compare.cc
+++ b/cpp/src/util/string_compare.cc
@@ -12,20 +12,54 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "re2ptr.h" // Must be the first #include statement!
-
#include "string_compare.h"
#include <libaddressinput/util/basictypes.h>
+#include <cassert>
#include <string>
+#include <re2/re2.h>
+
+#include "lru_cache_using_std.h"
+
+// RE2 uses type string, which is not necessarily the same as type std::string.
+// In order to create objects of the correct type, to be able to pass pointers
+// to these objects to RE2, the function that does that is defined inside an
+// unnamed namespace inside the re2 namespace. Oh, my ...
+namespace re2 {
+namespace {
+
+// In order to (mis-)use RE2 to implement UTF-8 capable less<>, this function
+// calls RE2::PossibleMatchRange() to calculate the "lessest" string that would
+// be a case-insensitive match to the string. This is far too expensive to do
+// repeatedly, so the function is only ever called through an LRU cache.
+std::string ComputeMinPossibleMatch(const std::string& str) {
+ string min, max; // N.B.: RE2 type string!
+
+ RE2::Options options;
+ options.set_literal(true);
+ options.set_case_sensitive(false);
+ RE2 matcher(str, options);
+
+ bool success = matcher.PossibleMatchRange(&min, &max, str.size());
+ assert(success);
+ (void)success; // Prevent unused variable if assert() is optimized away.
+
+ return min;
+}
+
+} // namespace
+} // namespace re2
+
namespace i18n {
namespace addressinput {
class StringCompare::Impl {
+ enum { MAX_CACHE_SIZE = 1 << 15 };
+
public:
- Impl() {
+ Impl() : min_possible_match_(&re2::ComputeMinPossibleMatch, MAX_CACHE_SIZE) {
options_.set_literal(true);
options_.set_case_sensitive(false);
}
@@ -37,8 +71,15 @@ class StringCompare::Impl {
return RE2::FullMatch(a, matcher);
}
+ bool NaturalLess(const std::string& a, const std::string& b) const {
+ const std::string& min_a(min_possible_match_(a));
+ const std::string& min_b(min_possible_match_(b));
+ return min_a < min_b;
+ }
+
private:
RE2::Options options_;
+ mutable lru_cache_using_std<std::string, std::string> min_possible_match_;
DISALLOW_COPY_AND_ASSIGN(Impl);
};
@@ -52,5 +93,10 @@ bool StringCompare::NaturalEquals(const std::string& a,
return impl_->NaturalEquals(a, b);
}
+bool StringCompare::NaturalLess(const std::string& a,
+ const std::string& b) const {
+ return impl_->NaturalLess(a, b);
+}
+
} // namespace addressinput
} // namespace i18n
diff --git a/cpp/src/util/string_compare.h b/cpp/src/util/string_compare.h
index 9d530fa..ae680dd 100644
--- a/cpp/src/util/string_compare.h
+++ b/cpp/src/util/string_compare.h
@@ -33,6 +33,12 @@ class StringCompare {
// default implementation just does case insensitive string matching.
bool NaturalEquals(const std::string& a, const std::string& b) const;
+ // Comparison function for use with the STL analogous to NaturalEquals().
+ // Libaddressinput itself isn't really concerned about how this is done, as
+ // long as it conforms to the STL requirements on less<> predicates. This
+ // default implementation is VERY SLOW! Must be replaced if you need speed.
+ bool NaturalLess(const std::string& a, const std::string& b) const;
+
private:
class Impl;
scoped_ptr<Impl> impl_;
diff --git a/cpp/test/address_validator_test.cc b/cpp/test/address_validator_test.cc
index 201b823..5703a37 100644
--- a/cpp/test/address_validator_test.cc
+++ b/cpp/test/address_validator_test.cc
@@ -353,7 +353,7 @@ TEST_P(AddressValidatorTest, ValidLatinAddressJP) {
if (GetParam() == &OndemandValidatorWrapper::Build) return;
address_.region_code = "JP";
- address_.administrative_area = "TOKUSHIMA";
+ address_.administrative_area = "Tokushima";
address_.locality = "Tokushima";
address_.postal_code = "770-0847";
address_.address_line.push_back("...");
diff --git a/cpp/test/util/string_compare_test.cc b/cpp/test/util/string_compare_test.cc
index d5990d9..8f8d4d5 100644
--- a/cpp/test/util/string_compare_test.cc
+++ b/cpp/test/util/string_compare_test.cc
@@ -25,14 +25,19 @@ using i18n::addressinput::StringCompare;
struct TestCase {
TestCase(const std::string& left,
const std::string& right,
- bool should_be_equal)
- : left(left), right(right), should_be_equal(should_be_equal) {}
+ bool should_be_equal,
+ bool should_be_less)
+ : left(left),
+ right(right),
+ should_be_equal(should_be_equal),
+ should_be_less(should_be_less) {}
~TestCase() {}
std::string left;
std::string right;
bool should_be_equal;
+ bool should_be_less;
};
class StringCompareTest : public testing::TestWithParam<TestCase> {
@@ -48,13 +53,25 @@ TEST_P(StringCompareTest, CorrectComparison) {
}
}
+TEST_P(StringCompareTest, CorrectLess) {
+ if (GetParam().should_be_less) {
+ EXPECT_TRUE(compare_.NaturalLess(GetParam().left, GetParam().right));
+ } else {
+ EXPECT_FALSE(compare_.NaturalLess(GetParam().left, GetParam().right));
+ }
+}
+
INSTANTIATE_TEST_CASE_P(
Comparisons, StringCompareTest,
- testing::Values(TestCase("foo", "foo", true),
- TestCase("foo", "FOO", true),
- TestCase("bar", "foo", false),
- TestCase("강원도", "강원도", true),
- TestCase("강원도", "대구광역시", false),
- TestCase("ZÜRICH", "zürich", true)));
+ testing::Values(TestCase("foo", "foo", true, false),
+ TestCase("foo", "FOO", true, false),
+ TestCase("bar", "foo", false, true),
+ TestCase("강원도", "강원도", true, false),
+ TestCase("강원도", "대구광역시", false, true),
+ TestCase("ZÜRICH", "zürich", true, false),
+ TestCase("абв", "где", false, true),
+ TestCase("абв", "ГДЕ", false, true),
+ TestCase("где", "абв", false, false),
+ TestCase("где", "АБВ", false, false)));
} // namespace