// Copyright 2017 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ #define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ #include #include #include #include #include "base/check.h" namespace zucchini { // A functor class that implements the naive suffix sorting algorithm that uses // std::sort with lexicographical compare. This is only meant as reference of // the interface. class NaiveSuffixSort { public: // Type requirements: // |InputRng| is an input random access range. // |KeyType| is an unsigned integer type. // |SAIt| is a random access iterator with mutable references. template // |str| is the input string on which suffix sort is applied. // Characters found in |str| must be in the range [0, |key_bound|) // |suffix_array| is the beginning of the destination range, which is at least // as large as |str|. void operator()(const InputRng& str, KeyType key_bound, SAIt suffix_array) const { using size_type = typename SAIt::value_type; size_type n = static_cast(std::end(str) - std::begin(str)); // |suffix_array| is first filled with ordered indices of |str|. // Those indices are then sorted with lexicographical comparisons in |str|. std::iota(suffix_array, suffix_array + n, 0); std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { return std::lexicographical_compare(std::begin(str) + i, std::end(str), std::begin(str) + j, std::end(str)); }); } }; // A functor class that implements suffix array induced sorting (SA-IS) // algorithm with linear time and memory complexity, // see http://ieeexplore.ieee.org/abstract/document/5582081/ class InducedSuffixSort { public: // Type requirements: // |InputRng| is an input random access range. // |KeyType| is an unsigned integer type. // |SAIt| is a random access iterator with mutable values. template // |str| is the input string on which suffix sort is applied. // Characters found in |str| must be in the range [0, |key_bound|) // |suffix_array| is the beginning of the destination range, which is at least // as large as |str|. void operator()(const InputRng& str, KeyType key_bound, SAIt suffix_array) const { using value_type = typename InputRng::value_type; using size_type = typename SAIt::value_type; static_assert(std::is_unsigned::value, "SA-IS only supports input string with unsigned values"); static_assert(std::is_unsigned::value, "KeyType must be unsigned"); size_type n = static_cast(std::end(str) - std::begin(str)); Implementation::SuffixSort(std::begin(str), n, key_bound, suffix_array); } // Given string S of length n. We assume S is terminated by a unique sentinel // $, which is considered as the smallest character. This sentinel does not // exist in memory and is only treated implicitly, hence |n| does not count // the sentinel in this implementation. We denote suf(S,i) the suffix formed // by S[i..n). // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1) // or suf(S,i) > suf(S,i+1), respectively. enum SLType : bool { SType, LType }; // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is // S-type or L-type, respectively. // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS // character. // A substring S[i..j) is an LMS-substring if // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other // LMS characters, or // (2) S[i..j) is the sentinel $. template struct Implementation { static_assert(std::is_unsigned::value, "SizeType must be unsigned"); static_assert(std::is_unsigned::value, "KeyType must be unsigned"); using size_type = SizeType; using key_type = KeyType; using iterator = typename std::vector::iterator; using const_iterator = typename std::vector::const_iterator; // Partition every suffix based on SL-type. Returns the number of LMS // suffixes. template static size_type BuildSLPartition( StrIt str, size_type length, key_type key_bound, std::vector::reverse_iterator sl_partition_it) { // We will count LMS suffixes (S to L-type or last S-type). size_type lms_count = 0; // |previous_type| is initialized to L-type to avoid counting an extra // LMS suffix at the end SLType previous_type = LType; // Initialized to dummy, impossible key. key_type previous_key = key_bound; // We're travelling backward to determine the partition, // as if we prepend one character at a time to the string, ex: // b$ is L-type because b > $. // ab$ is S-type because a < b, implying ab$ < b$. // bab$ is L-type because b > a, implying bab$ > ab$. // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$. for (auto str_it = std::reverse_iterator(str + length); str_it != std::reverse_iterator(str); ++str_it, ++sl_partition_it) { key_type current_key = *str_it; if (current_key > previous_key || previous_key == key_bound) { // S[i] > S[i + 1] or S[i] is last character. if (previous_type == SType) // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore, // suf(S,i+1) was a LMS suffix. ++lms_count; previous_type = LType; // For next round. } else if (current_key < previous_key) { // S[i] < S[i + 1] previous_type = SType; // For next round. } // Else, S[i] == S[i + 1]: // The next character that differs determines the SL-type, // so we reuse the last seen type. *sl_partition_it = previous_type; previous_key = current_key; // For next round. } return lms_count; } // Find indices of LMS suffixes and write result to |lms_indices|. static void FindLmsSuffixes(const std::vector& sl_partition, iterator lms_indices) { // |previous_type| is initialized to S-type to avoid counting an extra // LMS suffix at the beginning SLType previous_type = SType; for (size_type i = 0; i < sl_partition.size(); ++i) { if (sl_partition[i] == SType && previous_type == LType) *lms_indices++ = i; previous_type = sl_partition[i]; } } template static std::vector MakeBucketCount(StrIt str, size_type length, key_type key_bound) { // Occurrence of every unique character is counted in |buckets| std::vector buckets(static_cast(key_bound)); for (auto it = str; it != str + length; ++it) ++buckets[*it]; return buckets; } // Apply induced sort from |lms_indices| to |suffix_array| associated with // the string |str|. template static void InducedSort(StrIt str, size_type length, const std::vector& sl_partition, const std::vector& lms_indices, const std::vector& buckets, SAIt suffix_array) { // All indices are first marked as unset with the illegal value |length|. std::fill(suffix_array, suffix_array + length, length); // Used to mark bucket boundaries (head or end) as indices in str. DCHECK(!buckets.empty()); std::vector bucket_bounds(buckets.size()); // Step 1: Assign indices for LMS suffixes, populating the end of // respective buckets but keeping relative order. // Find the end of each bucket and write it to |bucket_bounds|. std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); // Process each |lms_indices| backward, and assign them to the end of // their respective buckets, so relative order is preserved. for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) { key_type key = str[*it]; suffix_array[--bucket_bounds[key]] = *it; } // Step 2 // Scan forward |suffix_array|; for each modified suf(S,i) for which // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current // head of the corresponding bucket and forward the bucket head to the // right. // Find the head of each bucket and write it to |bucket_bounds|. Since // only LMS suffixes where inserted in |suffix_array| during Step 1, // |bucket_bounds| does not contains the head of each bucket and needs to // be updated. bucket_bounds[0] = 0; std::partial_sum(buckets.begin(), buckets.end() - 1, bucket_bounds.begin() + 1); // From Step 1, the sentinel $, which we treat implicitly, would have // been placed at the beginning of |suffix_array|, since $ is always // considered as the smallest character. We then have to deal with the // previous (last) suffix. if (sl_partition[length - 1] == LType) { key_type key = str[length - 1]; suffix_array[bucket_bounds[key]++] = length - 1; } for (auto it = suffix_array; it != suffix_array + length; ++it) { size_type suffix_index = *it; // While the original algorithm marks unset suffixes with -1, // we found that marking them with |length| is also possible and more // convenient because we are working with unsigned integers. if (suffix_index != length && suffix_index > 0 && sl_partition[--suffix_index] == LType) { key_type key = str[suffix_index]; suffix_array[bucket_bounds[key]++] = suffix_index; } } // Step 3 // Scan backward |suffix_array|; for each modified suf(S, i) for which // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current // end of the corresponding bucket and forward the bucket head to the // left. // Find the end of each bucket and write it to |bucket_bounds|. Since // only L-type suffixes where inserted in |suffix_array| during Step 2, // |bucket_bounds| does not contain the end of each bucket and needs to // be updated. std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); for (auto it = std::reverse_iterator(suffix_array + length); it != std::reverse_iterator(suffix_array); ++it) { size_type suffix_index = *it; if (suffix_index != length && suffix_index > 0 && sl_partition[--suffix_index] == SType) { key_type key = str[suffix_index]; suffix_array[--bucket_bounds[key]] = suffix_index; } } // Deals with the last suffix, because of the sentinel. if (sl_partition[length - 1] == SType) { key_type key = str[length - 1]; suffix_array[--bucket_bounds[key]] = length - 1; } } // Given a string S starting at |str| with length |length|, an array // starting at |substring_array| containing lexicographically ordered LMS // terminated substring indices of S and an SL-Type partition |sl_partition| // of S, assigns a unique label to every unique LMS substring. The sorted // labels for all LMS substrings are written to |lms_str|, while the indices // of LMS suffixes are written to |lms_indices|. In addition, returns the // total number of unique labels. template static size_type LabelLmsSubstrings(StrIt str, size_type length, const std::vector& sl_partition, SAIt suffix_array, iterator lms_indices, iterator lms_str) { // Labelling starts at 0. size_type label = 0; // |previous_lms| is initialized to 0 to indicate it is unset. // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in // lexicographical order. size_type previous_lms = 0; for (auto it = suffix_array; it != suffix_array + length; ++it) { if (*it > 0 && sl_partition[*it] == SType && sl_partition[*it - 1] == LType) { // suf(S, *it) is a LMS suffix. size_type current_lms = *it; if (previous_lms != 0) { // There was a previous LMS suffix. Check if the current LMS // substring is equal to the previous one. SLType current_lms_type = SType; SLType previous_lms_type = SType; for (size_type k = 0;; ++k) { // |current_lms_end| and |previous_lms_end| denote whether we have // reached the end of the current and previous LMS substring, // respectively bool current_lms_end = false; bool previous_lms_end = false; // Check for both previous and current substring ends. // Note that it is more convenient to check if // suf(S,current_lms + k) is an LMS suffix than to retrieve it // from lms_indices. if (current_lms + k >= length || (current_lms_type == LType && sl_partition[current_lms + k] == SType)) { current_lms_end = true; } if (previous_lms + k >= length || (previous_lms_type == LType && sl_partition[previous_lms + k] == SType)) { previous_lms_end = true; } if (current_lms_end && previous_lms_end) { break; // Previous and current substrings are identical. } else if (current_lms_end != previous_lms_end || str[current_lms + k] != str[previous_lms + k]) { // Previous and current substrings differ, a new label is used. ++label; break; } current_lms_type = sl_partition[current_lms + k]; previous_lms_type = sl_partition[previous_lms + k]; } } *lms_indices++ = *it; *lms_str++ = label; previous_lms = current_lms; } } return label + 1; } // Implementation of the SA-IS algorithm. |str| must be a random access // iterator pointing at the beginning of S with length |length|. The result // is writtend in |suffix_array|, a random access iterator. template static void SuffixSort(StrIt str, size_type length, key_type key_bound, SAIt suffix_array) { if (length == 1) *suffix_array = 0; if (length < 2) return; std::vector sl_partition(length); size_type lms_count = BuildSLPartition(str, length, key_bound, sl_partition.rbegin()); std::vector lms_indices(lms_count); FindLmsSuffixes(sl_partition, lms_indices.begin()); std::vector buckets = MakeBucketCount(str, length, key_bound); if (lms_indices.size() > 1) { // Given |lms_indices| in the same order they appear in |str|, induce // LMS substrings relative order and write result to |suffix_array|. InducedSort(str, length, sl_partition, lms_indices, buckets, suffix_array); std::vector lms_str(lms_indices.size()); // Given LMS substrings in relative order found in |suffix_array|, // map LMS substrings to unique labels to form a new string, |lms_str|. size_type label_count = LabelLmsSubstrings(str, length, sl_partition, suffix_array, lms_indices.begin(), lms_str.begin()); if (label_count < lms_str.size()) { // Reorder |lms_str| to have LMS suffixes in the same order they // appear in |str|. for (size_type i = 0; i < lms_indices.size(); ++i) suffix_array[lms_indices[i]] = lms_str[i]; SLType previous_type = SType; for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { if (sl_partition[i] == SType && previous_type == LType) { lms_str[j] = suffix_array[i]; lms_indices[j++] = i; } previous_type = sl_partition[i]; } // Recursively apply SuffixSort on |lms_str|, which is formed from // labeled LMS suffixes in the same order they appear in |str|. // Note that |KeyType| will be size_type because |lms_str| contains // indices. |lms_str| is at most half the length of |str|. Implementation::SuffixSort( lms_str.begin(), static_cast(lms_str.size()), label_count, suffix_array); // Map LMS labels back to indices in |str| and write result to // |lms_indices|. We're using |suffix_array| as a temporary buffer. for (size_type i = 0; i < lms_indices.size(); ++i) suffix_array[i] = lms_indices[suffix_array[i]]; std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin()); // At this point, |lms_indices| contains sorted LMS suffixes of |str|. } } // Given |lms_indices| where LMS suffixes are sorted, induce the full // order of suffixes in |str|. InducedSort(str, length, sl_partition, lms_indices, buckets, suffix_array); } Implementation() = delete; Implementation(const Implementation&) = delete; const Implementation& operator=(const Implementation&) = delete; }; }; // Generates a sorted suffix array for the input string |str| using the functor // |Algorithm| which provides an interface equivalent to NaiveSuffixSort. /// Characters found in |str| are assumed to be in range [0, |key_bound|). // Returns the suffix array as a vector. // |StrRng| is an input random access range. // |KeyType| is an unsigned integer type. template std::vector MakeSuffixArray(const StrRng& str, KeyType key_bound) { Algorithm sort; std::vector suffix_array(str.end() - str.begin()); sort(str, key_bound, suffix_array.begin()); return suffix_array; } // Type requirements: // |SARng| is an input random access range. // |StrIt1| is a random access iterator. // |StrIt2| is a forward iterator. template // Lexicographical lower bound using binary search for // [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string // starting at |str1_first|. This does not necessarily return the index of // the longest matching substring. auto SuffixLowerBound(const SARng& suffix_array, StrIt1 str1_first, StrIt2 str2_first, StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { using size_type = typename SARng::value_type; size_t n = std::end(suffix_array) - std::begin(suffix_array); auto it = std::lower_bound( std::begin(suffix_array), std::end(suffix_array), str2_first, [str1_first, str2_last, n](size_type a, StrIt2 b) { return std::lexicographical_compare(str1_first + a, str1_first + n, b, str2_last); }); return it; } } // namespace zucchini #endif // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_