// Copyright 2017 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/zucchini/suffix_array.h" #include #include #include #include #include #include #include "testing/gtest/include/gtest/gtest.h" namespace zucchini { namespace { using SLType = InducedSuffixSort::SLType; } // namespace using ustring = std::basic_string; constexpr uint16_t kNumChar = 256; ustring MakeUnsignedString(const std::string& str) { return {str.begin(), str.end()}; } template std::vector MakeVector(const std::initializer_list& ilist) { return {ilist.begin(), ilist.end()}; } void TestSlPartition(std::initializer_list expected_sl_partition, std::initializer_list expected_lms_indices, std::string str) { using SaisImpl = InducedSuffixSort::Implementation; std::vector sl_partition(str.size()); EXPECT_EQ(expected_lms_indices.size(), SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar, sl_partition.rbegin())); EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition); std::vector lms_indices(expected_lms_indices.size()); SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin()); EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices); } TEST(InducedSuffixSortTest, BuildSLPartition) { TestSlPartition({}, {}, ""); TestSlPartition( { SLType::LType, }, {}, "a"); TestSlPartition( { SLType::LType, SLType::LType, }, {}, "ba"); TestSlPartition( { SLType::SType, SLType::LType, }, {}, "ab"); TestSlPartition( { SLType::SType, SLType::SType, SLType::LType, }, {}, "aab"); TestSlPartition( { SLType::LType, SLType::LType, SLType::LType, }, {}, "bba"); TestSlPartition( { SLType::LType, SLType::SType, SLType::LType, }, {1}, "bab"); TestSlPartition( { SLType::LType, SLType::SType, SLType::SType, SLType::LType, }, {1}, "baab"); TestSlPartition( { SLType::LType, // zucchini SLType::LType, // ucchini SLType::SType, // cchini SLType::SType, // chini SLType::SType, // hini SLType::SType, // ini SLType::LType, // ni SLType::LType, // i }, {2}, "zucchini"); } std::vector BucketCount(const std::initializer_list str, uint16_t max_key) { using SaisImpl = InducedSuffixSort::Implementation; return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key); } TEST(InducedSuffixSortTest, BucketCount) { using vec = std::vector; EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4)); EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4)); EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4)); } std::vector InducedSortSubstring(ustring str) { using SaisImpl = InducedSuffixSort::Implementation; std::vector sl_partition(str.size()); size_t lms_count = SaisImpl::BuildSLPartition( str.begin(), str.size(), kNumChar, sl_partition.rbegin()); std::vector lms_indices(lms_count); SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin()); auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar); std::vector suffix_array(str.size()); SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets, suffix_array.begin()); return suffix_array; } TEST(InducedSuffixSortTest, InducedSortSubstring) { using vec = std::vector; auto us = MakeUnsignedString; // L; a$ EXPECT_EQ(vec({0}), InducedSortSubstring(us("a"))); // SL; ab$, b$ EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab"))); // LL; a$, ba$ EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba"))); // SLL; a$, aba$, ba$ EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba"))); // LSL; ab$, b$, ba EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab"))); // SSL; aab$, ab$, b$ EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab"))); // LSSL; aab$, ab$, b$, ba EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab"))); } template void TestSuffixSort(ustring test_str) { std::vector suffix_array = MakeSuffixArray(test_str, kNumChar); EXPECT_EQ(test_str.size(), suffix_array.size()); // Expect that I[] is a permutation of [0, len]. std::vector sorted_suffix(suffix_array.begin(), suffix_array.end()); std::sort(sorted_suffix.begin(), sorted_suffix.end()); for (size_t i = 0; i < test_str.size(); ++i) EXPECT_EQ(i, sorted_suffix[i]); // Expect that all suffixes are strictly ordered. auto end = test_str.end(); for (size_t i = 1; i < test_str.size(); ++i) { auto suf1 = test_str.begin() + suffix_array[i - 1]; auto suf2 = test_str.begin() + suffix_array[i]; bool is_less = std::lexicographical_compare(suf1, end, suf2, end); EXPECT_TRUE(is_less); } } constexpr const char* test_strs[] = { "", "a", "aa", "za", "CACAO", "aaaaa", "banana", "tobeornottobe", "The quick brown fox jumps over the lazy dog.", "elephantelephantelephantelephantelephant", "walawalawashington", "-------------------------", "011010011001011010010110011010010", "3141592653589793238462643383279502884197169399375105", "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba", "0123456789876543210", "9876543210123456789", "aababcabcdabcdeabcdefabcdefg", "asdhklgalksdjghalksdjghalksdjgh", }; TEST(SuffixSortTest, NaiveSuffixSort) { for (const std::string& test_str : test_strs) { TestSuffixSort(MakeUnsignedString(test_str)); } } TEST(SuffixSortTest, InducedSuffixSortSort) { for (const std::string& test_str : test_strs) { TestSuffixSort(MakeUnsignedString(test_str)); } } // Test with sequence that has every character. TEST(SuffixSortTest, AllChar) { std::vector all_char(kNumChar); std::iota(all_char.begin(), all_char.end(), 0); { std::vector suffix_array = MakeSuffixArray(all_char, kNumChar); for (size_t i = 0; i < kNumChar; ++i) EXPECT_EQ(i, suffix_array[i]); } std::vector all_char_reverse(all_char.rbegin(), all_char.rend()); { std::vector suffix_array = MakeSuffixArray(all_char_reverse, kNumChar); for (size_t i = 0; i < kNumChar; ++i) EXPECT_EQ(kNumChar - i - 1, suffix_array[i]); } } void TestSuffixLowerBound(ustring base_str, ustring search_str) { std::vector suffix_array = MakeSuffixArray(base_str, kNumChar); auto pos = SuffixLowerBound(suffix_array, base_str.begin(), search_str.begin(), search_str.end()); auto end = base_str.end(); if (pos != suffix_array.begin()) { // Previous suffix is less than |search_str|. auto suf = base_str.begin() + pos[-1]; bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), search_str.end()); EXPECT_TRUE(is_less); } if (pos != suffix_array.end()) { // Current suffix is greater of equal to |search_str|. auto suf = base_str.begin() + *pos; bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), search_str.end()); EXPECT_FALSE(is_less); } } TEST(SuffixArrayTest, LowerBound) { auto us = MakeUnsignedString; TestSuffixLowerBound(us(""), us("")); TestSuffixLowerBound(us(""), us("a")); TestSuffixLowerBound(us("b"), us("")); TestSuffixLowerBound(us("b"), us("a")); TestSuffixLowerBound(us("b"), us("c")); TestSuffixLowerBound(us("b"), us("bc")); TestSuffixLowerBound(us("aa"), us("a")); TestSuffixLowerBound(us("aa"), us("aa")); ustring sentence = us("the quick brown fox jumps over the lazy dog."); // Entire string: exact and unique. TestSuffixLowerBound(sentence, sentence); // Empty string: exact and non-unique. TestSuffixLowerBound(sentence, us("")); // Exact and unique suffix matches. TestSuffixLowerBound(sentence, us(".")); TestSuffixLowerBound(sentence, us("the lazy dog.")); // Exact and unique non-suffix matches. TestSuffixLowerBound(sentence, us("quick")); TestSuffixLowerBound(sentence, us("the quick")); // Partial and unique matches. TestSuffixLowerBound(sentence, us("fox jumps with the hosps")); TestSuffixLowerBound(sentence, us("xyz")); // Exact and non-unique match: take lexicographical first. TestSuffixLowerBound(sentence, us("the")); TestSuffixLowerBound(sentence, us(" ")); // Partial and non-unique match. // query < "the l"... < "the q"... TestSuffixLowerBound(sentence, us("the apple")); // "the l"... < query < "the q"... TestSuffixLowerBound(sentence, us("the opera")); // "the l"... < "the q"... < query TestSuffixLowerBound(sentence, us("the zebra")); // Prefix match dominates suffix match (unique). TestSuffixLowerBound(sentence, us("over quick brown fox")); // Empty matchs. TestSuffixLowerBound(sentence, us(",")); TestSuffixLowerBound(sentence, us("1234")); TestSuffixLowerBound(sentence, us("THE QUICK BROWN FOX")); TestSuffixLowerBound(sentence, us("(the")); } TEST(SuffixArrayTest, LowerBoundExact) { for (const std::string& test_str : test_strs) { ustring test_ustr = MakeUnsignedString(test_str); std::vector suffix_array = MakeSuffixArray(test_ustr, kNumChar); for (size_t lo = 0; lo < test_str.size(); ++lo) { for (size_t hi = lo + 1; hi <= test_str.size(); ++hi) { ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi); ASSERT_EQ(query.size(), hi - lo); auto pos = SuffixLowerBound(suffix_array, test_ustr.begin(), query.begin(), query.end()); EXPECT_TRUE( std::equal(query.begin(), query.end(), test_ustr.begin() + *pos)); } } } } } // namespace zucchini