diff options
author | Samuel Huang <huangs@chromium.org> | 2018-03-13 18:19:34 +0000 |
---|---|---|
committer | Edward Lesmes <ehmaldonado@google.com> | 2021-07-23 21:50:59 +0000 |
commit | 06f1ae9aaca969ee95ef840f22b6b461c304542d (patch) | |
tree | f1e5c6624e70628e81fbf38d6cd14b974abe5d93 /suffix_array.h | |
download | zucchini-06f1ae9aaca969ee95ef840f22b6b461c304542d.tar.gz |
[Zucchini] Move Zucchini from /chrome/installer/ to /components/.
(Use "git log --follow" to see older revisions of files).
/components/ is the most logical place to put Zucchini, which only
depends on /base and /testing/gtest. This move also enables Zucchini to
be used by the Component Updater. Details:
- Move all files; run the following to change deps and guards:
sed 's/chrome\/installer/components/' *.cc *.h -i
sed 's/CHROME_INSTALLER/COMPONENTS/' *.cc *.h -i
- Sorting works out pretty well!
- Change all 'chrome/installer/zucchini' to 'components/zucchini'
throughout other parts of the repo; sort if necessary.
- Fix 6 'git cl lint' errors.
- Change 1 Bind() usage to BindRepeated().
- Update OWNER.
Bug: 729154
Change-Id: I50c5a7d411ea85f707b5994ab319dfb2a1acccf7
Reviewed-on: https://chromium-review.googlesource.com/954923
Reviewed-by: Greg Thompson <grt@chromium.org>
Reviewed-by: Jochen Eisinger <jochen@chromium.org>
Reviewed-by: Samuel Huang <huangs@chromium.org>
Commit-Queue: Samuel Huang <huangs@chromium.org>
Cr-Commit-Position: refs/heads/master@{#542857}
NOKEYCHECK=True
GitOrigin-RevId: 577ef6c435e8d43be6e3e60ccbcbd1881780f4ec
Diffstat (limited to 'suffix_array.h')
-rw-r--r-- | suffix_array.h | 475 |
1 files changed, 475 insertions, 0 deletions
diff --git a/suffix_array.h b/suffix_array.h new file mode 100644 index 0000000..ea49a43 --- /dev/null +++ b/suffix_array.h @@ -0,0 +1,475 @@ +// Copyright 2017 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ +#define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ + +#include <algorithm> +#include <iterator> +#include <numeric> +#include <vector> + +#include "base/logging.h" +#include "base/macros.h" + +namespace zucchini { + +// A functor class that implements the naive suffix sorting algorithm that uses +// std::sort with lexicographical compare. This is only meant as reference of +// the interface. +class NaiveSuffixSort { + public: + // Type requirements: + // |InputRng| is an input random access range. + // |KeyType| is an unsigned integer type. + // |SAIt| is a random access iterator with mutable references. + template <class InputRng, class KeyType, class SAIt> + // |str| is the input string on which suffix sort is applied. + // Characters found in |str| must be in the range [0, |key_bound|) + // |suffix_array| is the beginning of the destination range, which is at least + // as large as |str|. + void operator()(const InputRng& str, + KeyType key_bound, + SAIt suffix_array) const { + using size_type = typename SAIt::value_type; + + size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); + + // |suffix_array| is first filled with ordered indices of |str|. + // Those indices are then sorted with lexicographical comparisons in |str|. + std::iota(suffix_array, suffix_array + n, 0); + std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { + return std::lexicographical_compare(std::begin(str) + i, std::end(str), + std::begin(str) + j, std::end(str)); + }); + } +}; + +// A functor class that implements suffix array induced sorting (SA-IS) +// algorithm with linear time and memory complexity, +// see http://ieeexplore.ieee.org/abstract/document/5582081/ +class InducedSuffixSort { + public: + // Type requirements: + // |InputRng| is an input random access range. + // |KeyType| is an unsigned integer type. + // |SAIt| is a random access iterator with mutable values. + template <class InputRng, class KeyType, class SAIt> + // |str| is the input string on which suffix sort is applied. + // Characters found in |str| must be in the range [0, |key_bound|) + // |suffix_array| is the beginning of the destination range, which is at least + // as large as |str|. + void operator()(const InputRng& str, + KeyType key_bound, + SAIt suffix_array) const { + using value_type = typename InputRng::value_type; + using size_type = typename SAIt::value_type; + + static_assert(std::is_unsigned<value_type>::value, + "SA-IS only supports input string with unsigned values"); + static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); + + size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); + + Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n, + key_bound, suffix_array); + } + + // Given string S of length n. We assume S is terminated by a unique sentinel + // $, which is considered as the smallest character. This sentinel does not + // exist in memory and is only treated implicitly, hence |n| does not count + // the sentinel in this implementation. We denote suf(S,i) the suffix formed + // by S[i..n). + + // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1) + // or suf(S,i) > suf(S,i+1), respectively. + enum SLType : bool { SType, LType }; + + // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is + // S-type or L-type, respectively. + + // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and + // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS + // character. + + // A substring S[i..j) is an LMS-substring if + // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other + // LMS characters, or + // (2) S[i..j) is the sentinel $. + + template <class SizeType, class KeyType> + struct Implementation { + static_assert(std::is_unsigned<SizeType>::value, + "SizeType must be unsigned"); + static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); + using size_type = SizeType; + using key_type = KeyType; + + using iterator = typename std::vector<size_type>::iterator; + using const_iterator = typename std::vector<size_type>::const_iterator; + + // Partition every suffix based on SL-type. Returns the number of LMS + // suffixes. + template <class StrIt> + static size_type BuildSLPartition( + StrIt str, + size_type length, + key_type key_bound, + std::vector<SLType>::reverse_iterator sl_partition_it) { + // We will count LMS suffixes (S to L-type or last S-type). + size_type lms_count = 0; + + // |previous_type| is initialized to L-type to avoid counting an extra + // LMS suffix at the end + SLType previous_type = LType; + + // Initialized to dummy, impossible key. + key_type previous_key = key_bound; + + // We're travelling backward to determine the partition, + // as if we prepend one character at a time to the string, ex: + // b$ is L-type because b > $. + // ab$ is S-type because a < b, implying ab$ < b$. + // bab$ is L-type because b > a, implying bab$ > ab$. + // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$. + for (auto str_it = std::reverse_iterator<StrIt>(str + length); + str_it != std::reverse_iterator<StrIt>(str); + ++str_it, ++sl_partition_it) { + key_type current_key = *str_it; + + if (current_key > previous_key || previous_key == key_bound) { + // S[i] > S[i + 1] or S[i] is last character. + if (previous_type == SType) + // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore, + // suf(S,i+1) was a LMS suffix. + ++lms_count; + + previous_type = LType; // For next round. + } else if (current_key < previous_key) { + // S[i] < S[i + 1] + previous_type = SType; // For next round. + } + // Else, S[i] == S[i + 1]: + // The next character that differs determines the SL-type, + // so we reuse the last seen type. + + *sl_partition_it = previous_type; + previous_key = current_key; // For next round. + } + + return lms_count; + } + + // Find indices of LMS suffixes and write result to |lms_indices|. + static void FindLmsSuffixes(const std::vector<SLType>& sl_partition, + iterator lms_indices) { + // |previous_type| is initialized to S-type to avoid counting an extra + // LMS suffix at the beginning + SLType previous_type = SType; + for (size_type i = 0; i < sl_partition.size(); ++i) { + if (sl_partition[i] == SType && previous_type == LType) + *lms_indices++ = i; + previous_type = sl_partition[i]; + } + } + + template <class StrIt> + static std::vector<size_type> MakeBucketCount(StrIt str, + size_type length, + key_type key_bound) { + // Occurrence of every unique character is counted in |buckets| + std::vector<size_type> buckets(static_cast<size_type>(key_bound)); + + for (auto it = str; it != str + length; ++it) + ++buckets[*it]; + return buckets; + } + + // Apply induced sort from |lms_indices| to |suffix_array| associated with + // the string |str|. + template <class StrIt, class SAIt> + static void InducedSort(StrIt str, + size_type length, + const std::vector<SLType>& sl_partition, + const std::vector<size_type>& lms_indices, + const std::vector<size_type>& buckets, + SAIt suffix_array) { + // All indices are first marked as unset with the illegal value |length|. + std::fill(suffix_array, suffix_array + length, length); + + // Used to mark bucket boundaries (head or end) as indices in str. + DCHECK(!buckets.empty()); + std::vector<size_type> bucket_bounds(buckets.size()); + + // Step 1: Assign indices for LMS suffixes, populating the end of + // respective buckets but keeping relative order. + + // Find the end of each bucket and write it to |bucket_bounds|. + std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); + + // Process each |lms_indices| backward, and assign them to the end of + // their respective buckets, so relative order is preserved. + for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) { + key_type key = str[*it]; + suffix_array[--bucket_bounds[key]] = *it; + } + + // Step 2 + // Scan forward |suffix_array|; for each modified suf(S,i) for which + // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current + // head of the corresponding bucket and forward the bucket head to the + // right. + + // Find the head of each bucket and write it to |bucket_bounds|. Since + // only LMS suffixes where inserted in |suffix_array| during Step 1, + // |bucket_bounds| does not contains the head of each bucket and needs to + // be updated. + bucket_bounds[0] = 0; + std::partial_sum(buckets.begin(), buckets.end() - 1, + bucket_bounds.begin() + 1); + + // From Step 1, the sentinel $, which we treat implicitly, would have + // been placed at the beginning of |suffix_array|, since $ is always + // considered as the smallest character. We then have to deal with the + // previous (last) suffix. + if (sl_partition[length - 1] == LType) { + key_type key = str[length - 1]; + suffix_array[bucket_bounds[key]++] = length - 1; + } + for (auto it = suffix_array; it != suffix_array + length; ++it) { + size_type suffix_index = *it; + + // While the original algorithm marks unset suffixes with -1, + // we found that marking them with |length| is also possible and more + // convenient because we are working with unsigned integers. + if (suffix_index != length && suffix_index > 0 && + sl_partition[--suffix_index] == LType) { + key_type key = str[suffix_index]; + suffix_array[bucket_bounds[key]++] = suffix_index; + } + } + + // Step 3 + // Scan backward |suffix_array|; for each modified suf(S, i) for which + // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current + // end of the corresponding bucket and forward the bucket head to the + // left. + + // Find the end of each bucket and write it to |bucket_bounds|. Since + // only L-type suffixes where inserted in |suffix_array| during Step 2, + // |bucket_bounds| does not contain the end of each bucket and needs to + // be updated. + std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); + + for (auto it = std::reverse_iterator<SAIt>(suffix_array + length); + it != std::reverse_iterator<SAIt>(suffix_array); ++it) { + size_type suffix_index = *it; + if (suffix_index != length && suffix_index > 0 && + sl_partition[--suffix_index] == SType) { + key_type key = str[suffix_index]; + suffix_array[--bucket_bounds[key]] = suffix_index; + } + } + // Deals with the last suffix, because of the sentinel. + if (sl_partition[length - 1] == SType) { + key_type key = str[length - 1]; + suffix_array[--bucket_bounds[key]] = length - 1; + } + } + + // Given a string S starting at |str| with length |length|, an array + // starting at |substring_array| containing lexicographically ordered LMS + // terminated substring indices of S and an SL-Type partition |sl_partition| + // of S, assigns a unique label to every unique LMS substring. The sorted + // labels for all LMS substrings are written to |lms_str|, while the indices + // of LMS suffixes are written to |lms_indices|. In addition, returns the + // total number of unique labels. + template <class StrIt, class SAIt> + static size_type LabelLmsSubstrings(StrIt str, + size_type length, + const std::vector<SLType>& sl_partition, + SAIt suffix_array, + iterator lms_indices, + iterator lms_str) { + // Labelling starts at 0. + size_type label = 0; + + // |previous_lms| is initialized to 0 to indicate it is unset. + // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in + // lexicographical order. + size_type previous_lms = 0; + for (auto it = suffix_array; it != suffix_array + length; ++it) { + if (*it > 0 && sl_partition[*it] == SType && + sl_partition[*it - 1] == LType) { + // suf(S, *it) is a LMS suffix. + + size_type current_lms = *it; + if (previous_lms != 0) { + // There was a previous LMS suffix. Check if the current LMS + // substring is equal to the previous one. + SLType current_lms_type = SType; + SLType previous_lms_type = SType; + for (size_type k = 0;; ++k) { + // |current_lms_end| and |previous_lms_end| denote whether we have + // reached the end of the current and previous LMS substring, + // respectively + bool current_lms_end = false; + bool previous_lms_end = false; + + // Check for both previous and current substring ends. + // Note that it is more convenient to check if + // suf(S,current_lms + k) is an LMS suffix than to retrieve it + // from lms_indices. + if (current_lms + k >= length || + (current_lms_type == LType && + sl_partition[current_lms + k] == SType)) { + current_lms_end = true; + } + if (previous_lms + k >= length || + (previous_lms_type == LType && + sl_partition[previous_lms + k] == SType)) { + previous_lms_end = true; + } + + if (current_lms_end && previous_lms_end) { + break; // Previous and current substrings are identical. + } else if (current_lms_end != previous_lms_end || + str[current_lms + k] != str[previous_lms + k]) { + // Previous and current substrings differ, a new label is used. + ++label; + break; + } + + current_lms_type = sl_partition[current_lms + k]; + previous_lms_type = sl_partition[previous_lms + k]; + } + } + *lms_indices++ = *it; + *lms_str++ = label; + previous_lms = current_lms; + } + } + + return label + 1; + } + + // Implementation of the SA-IS algorithm. |str| must be a random access + // iterator pointing at the beginning of S with length |length|. The result + // is writtend in |suffix_array|, a random access iterator. + template <class StrIt, class SAIt> + static void SuffixSort(StrIt str, + size_type length, + key_type key_bound, + SAIt suffix_array) { + if (length == 1) + *suffix_array = 0; + if (length < 2) + return; + + std::vector<SLType> sl_partition(length); + size_type lms_count = + BuildSLPartition(str, length, key_bound, sl_partition.rbegin()); + std::vector<size_type> lms_indices(lms_count); + FindLmsSuffixes(sl_partition, lms_indices.begin()); + std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound); + + if (lms_indices.size() > 1) { + // Given |lms_indices| in the same order they appear in |str|, induce + // LMS substrings relative order and write result to |suffix_array|. + InducedSort(str, length, sl_partition, lms_indices, buckets, + suffix_array); + std::vector<size_type> lms_str(lms_indices.size()); + + // Given LMS substrings in relative order found in |suffix_array|, + // map LMS substrings to unique labels to form a new string, |lms_str|. + size_type label_count = + LabelLmsSubstrings(str, length, sl_partition, suffix_array, + lms_indices.begin(), lms_str.begin()); + + if (label_count < lms_str.size()) { + // Reorder |lms_str| to have LMS suffixes in the same order they + // appear in |str|. + for (size_type i = 0; i < lms_indices.size(); ++i) + suffix_array[lms_indices[i]] = lms_str[i]; + + SLType previous_type = SType; + for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { + if (sl_partition[i] == SType && previous_type == LType) { + lms_str[j] = suffix_array[i]; + lms_indices[j++] = i; + } + previous_type = sl_partition[i]; + } + + // Recursively apply SuffixSort on |lms_str|, which is formed from + // labeled LMS suffixes in the same order they appear in |str|. + // Note that |KeyType| will be size_type because |lms_str| contains + // indices. |lms_str| is at most half the length of |str|. + Implementation<size_type, size_type>::SuffixSort( + lms_str.begin(), static_cast<size_type>(lms_str.size()), + label_count, suffix_array); + + // Map LMS labels back to indices in |str| and write result to + // |lms_indices|. We're using |suffix_array| as a temporary buffer. + for (size_type i = 0; i < lms_indices.size(); ++i) + suffix_array[i] = lms_indices[suffix_array[i]]; + std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin()); + + // At this point, |lms_indices| contains sorted LMS suffixes of |str|. + } + } + // Given |lms_indices| where LMS suffixes are sorted, induce the full + // order of suffixes in |str|. + InducedSort(str, length, sl_partition, lms_indices, buckets, + suffix_array); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Implementation); + }; +}; + +// Generates a sorted suffix array for the input string |str| using the functor +// |Algorithm| which provides an interface equivalent to NaiveSuffixSort. +/// Characters found in |str| are assumed to be in range [0, |key_bound|). +// Returns the suffix array as a vector. +// |StrRng| is an input random access range. +// |KeyType| is an unsigned integer type. +template <class Algorithm, class StrRng, class KeyType> +std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str, + KeyType key_bound) { + Algorithm sort; + std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin()); + sort(str, key_bound, suffix_array.begin()); + return suffix_array; +} + +// Type requirements: +// |SARng| is an input random access range. +// |StrIt1| is a random access iterator. +// |StrIt2| is a forward iterator. +template <class SARng, class StrIt1, class StrIt2> +// Lexicographical lower bound using binary search for +// [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string +// starting at |str1_first|. This does not necessarily return the index of +// the longest matching substring. +auto SuffixLowerBound(const SARng& suffix_array, + StrIt1 str1_first, + StrIt2 str2_first, + StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { + using size_type = typename SARng::value_type; + + size_t n = std::end(suffix_array) - std::begin(suffix_array); + auto it = std::lower_bound( + std::begin(suffix_array), std::end(suffix_array), str2_first, + [str1_first, str2_last, n](size_type a, StrIt2 b) { + return std::lexicographical_compare(str1_first + a, str1_first + n, b, + str2_last); + }); + return it; +} + +} // namespace zucchini + +#endif // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ |