[Zucchini] Move Zucchini from /chrome/installer/ to /components/.

(Use "git log --follow" to see older revisions of files). /components/ is the most logical place to put Zucchini, which only depends on /base and /testing/gtest. This move also enables Zucchini to be used by the Component Updater. Details: - Move all files; run the following to change deps and guards: sed 's/chrome\/installer/components/' *.cc *.h -i sed 's/CHROME_INSTALLER/COMPONENTS/' *.cc *.h -i - Sorting works out pretty well! - Change all 'chrome/installer/zucchini' to 'components/zucchini' throughout other parts of the repo; sort if necessary. - Fix 6 'git cl lint' errors. - Change 1 Bind() usage to BindRepeated(). - Update OWNER. Bug: 729154 Change-Id: I50c5a7d411ea85f707b5994ab319dfb2a1acccf7 Reviewed-on: https://chromium-review.googlesource.com/954923 Reviewed-by: Greg Thompson <grt@chromium.org> Reviewed-by: Jochen Eisinger <jochen@chromium.org> Reviewed-by: Samuel Huang <huangs@chromium.org> Commit-Queue: Samuel Huang <huangs@chromium.org> Cr-Commit-Position: refs/heads/master@{#542857} NOKEYCHECK=True GitOrigin-RevId: 577ef6c435e8d43be6e3e60ccbcbd1881780f4ec
author: Samuel Huang <huangs@chromium.org> 2018-03-13 18:19:34 +0000
committer: Edward Lesmes <ehmaldonado@google.com> 2021-07-23 21:50:59 +0000
commit: 06f1ae9aaca969ee95ef840f22b6b461c304542d (patch)
tree: f1e5c6624e70628e81fbf38d6cd14b974abe5d93 /suffix_array.h
download: zucchini-06f1ae9aaca969ee95ef840f22b6b461c304542d.tar.gz
1 files changed, 475 insertions, 0 deletions
diff --git a/suffix_array.h b/suffix_array.h
new file mode 100644
index 0000000..ea49a43
--- /dev/null
+++ b/suffix_array.h
@@ -0,0 +1,475 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
+#define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+#include "base/logging.h"
+#include "base/macros.h"
+
+namespace zucchini {
+
+// A functor class that implements the naive suffix sorting algorithm that uses
+// std::sort with lexicographical compare. This is only meant as reference of
+// the interface.
+class NaiveSuffixSort {
+ public:
+  // Type requirements:
+  // |InputRng| is an input random access range.
+  // |KeyType| is an unsigned integer type.
+  // |SAIt| is a random access iterator with mutable references.
+  template <class InputRng, class KeyType, class SAIt>
+  // |str| is the input string on which suffix sort is applied.
+  // Characters found in |str| must be in the range [0, |key_bound|)
+  // |suffix_array| is the beginning of the destination range, which is at least
+  // as large as |str|.
+  void operator()(const InputRng& str,
+                  KeyType key_bound,
+                  SAIt suffix_array) const {
+    using size_type = typename SAIt::value_type;
+
+    size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
+
+    // |suffix_array| is first filled with ordered indices of |str|.
+    // Those indices are then sorted with lexicographical comparisons in |str|.
+    std::iota(suffix_array, suffix_array + n, 0);
+    std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
+      return std::lexicographical_compare(std::begin(str) + i, std::end(str),
+                                          std::begin(str) + j, std::end(str));
+    });
+  }
+};
+
+// A functor class that implements suffix array induced sorting (SA-IS)
+// algorithm with linear time and memory complexity,
+// see http://ieeexplore.ieee.org/abstract/document/5582081/
+class InducedSuffixSort {
+ public:
+  // Type requirements:
+  // |InputRng| is an input random access range.
+  // |KeyType| is an unsigned integer type.
+  // |SAIt| is a random access iterator with mutable values.
+  template <class InputRng, class KeyType, class SAIt>
+  // |str| is the input string on which suffix sort is applied.
+  // Characters found in |str| must be in the range [0, |key_bound|)
+  // |suffix_array| is the beginning of the destination range, which is at least
+  // as large as |str|.
+  void operator()(const InputRng& str,
+                  KeyType key_bound,
+                  SAIt suffix_array) const {
+    using value_type = typename InputRng::value_type;
+    using size_type = typename SAIt::value_type;
+
+    static_assert(std::is_unsigned<value_type>::value,
+                  "SA-IS only supports input string with unsigned values");
+    static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
+
+    size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
+
+    Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
+                                                   key_bound, suffix_array);
+  }
+
+  // Given string S of length n. We assume S is terminated by a unique sentinel
+  // $, which is considered as the smallest character. This sentinel does not
+  // exist in memory and is only treated implicitly, hence |n| does not count
+  // the sentinel in this implementation. We denote suf(S,i) the suffix formed
+  // by S[i..n).
+
+  // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
+  // or suf(S,i) > suf(S,i+1), respectively.
+  enum SLType : bool { SType, LType };
+
+  // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
+  // S-type or L-type, respectively.
+
+  // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
+  // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
+  // character.
+
+  // A substring S[i..j) is an LMS-substring if
+  // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
+  //     LMS characters, or
+  // (2) S[i..j) is the sentinel $.
+
+  template <class SizeType, class KeyType>
+  struct Implementation {
+    static_assert(std::is_unsigned<SizeType>::value,
+                  "SizeType must be unsigned");
+    static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
+    using size_type = SizeType;
+    using key_type = KeyType;
+
+    using iterator = typename std::vector<size_type>::iterator;
+    using const_iterator = typename std::vector<size_type>::const_iterator;
+
+    // Partition every suffix based on SL-type. Returns the number of LMS
+    // suffixes.
+    template <class StrIt>
+    static size_type BuildSLPartition(
+        StrIt str,
+        size_type length,
+        key_type key_bound,
+        std::vector<SLType>::reverse_iterator sl_partition_it) {
+      // We will count LMS suffixes (S to L-type or last S-type).
+      size_type lms_count = 0;
+
+      // |previous_type| is initialized to L-type to avoid counting an extra
+      // LMS suffix at the end
+      SLType previous_type = LType;
+
+      // Initialized to dummy, impossible key.
+      key_type previous_key = key_bound;
+
+      // We're travelling backward to determine the partition,
+      // as if we prepend one character at a time to the string, ex:
+      // b$ is L-type because b > $.
+      // ab$ is S-type because a < b, implying ab$ < b$.
+      // bab$ is L-type because b > a, implying bab$ > ab$.
+      // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
+      for (auto str_it = std::reverse_iterator<StrIt>(str + length);
+           str_it != std::reverse_iterator<StrIt>(str);
+           ++str_it, ++sl_partition_it) {
+        key_type current_key = *str_it;
+
+        if (current_key > previous_key || previous_key == key_bound) {
+          // S[i] > S[i + 1] or S[i] is last character.
+          if (previous_type == SType)
+            // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
+            // suf(S,i+1) was a LMS suffix.
+            ++lms_count;
+
+          previous_type = LType;  // For next round.
+        } else if (current_key < previous_key) {
+          // S[i] < S[i + 1]
+          previous_type = SType;  // For next round.
+        }
+        // Else, S[i] == S[i + 1]:
+        // The next character that differs determines the SL-type,
+        // so we reuse the last seen type.
+
+        *sl_partition_it = previous_type;
+        previous_key = current_key;  // For next round.
+      }
+
+      return lms_count;
+    }
+
+    // Find indices of LMS suffixes and write result to |lms_indices|.
+    static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
+                                iterator lms_indices) {
+      // |previous_type| is initialized to S-type to avoid counting an extra
+      // LMS suffix at the beginning
+      SLType previous_type = SType;
+      for (size_type i = 0; i < sl_partition.size(); ++i) {
+        if (sl_partition[i] == SType && previous_type == LType)
+          *lms_indices++ = i;
+        previous_type = sl_partition[i];
+      }
+    }
+
+    template <class StrIt>
+    static std::vector<size_type> MakeBucketCount(StrIt str,
+                                                  size_type length,
+                                                  key_type key_bound) {
+      // Occurrence of every unique character is counted in |buckets|
+      std::vector<size_type> buckets(static_cast<size_type>(key_bound));
+
+      for (auto it = str; it != str + length; ++it)
+        ++buckets[*it];
+      return buckets;
+    }
+
+    // Apply induced sort from |lms_indices| to |suffix_array| associated with
+    // the string |str|.
+    template <class StrIt, class SAIt>
+    static void InducedSort(StrIt str,
+                            size_type length,
+                            const std::vector<SLType>& sl_partition,
+                            const std::vector<size_type>& lms_indices,
+                            const std::vector<size_type>& buckets,
+                            SAIt suffix_array) {
+      // All indices are first marked as unset with the illegal value |length|.
+      std::fill(suffix_array, suffix_array + length, length);
+
+      // Used to mark bucket boundaries (head or end) as indices in str.
+      DCHECK(!buckets.empty());
+      std::vector<size_type> bucket_bounds(buckets.size());
+
+      // Step 1: Assign indices for LMS suffixes, populating the end of
+      // respective buckets but keeping relative order.
+
+      // Find the end of each bucket and write it to |bucket_bounds|.
+      std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
+
+      // Process each |lms_indices| backward, and assign them to the end of
+      // their respective buckets, so relative order is preserved.
+      for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
+        key_type key = str[*it];
+        suffix_array[--bucket_bounds[key]] = *it;
+      }
+
+      // Step 2
+      // Scan forward |suffix_array|; for each modified suf(S,i) for which
+      // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
+      // head of the corresponding bucket and forward the bucket head to the
+      // right.
+
+      // Find the head of each bucket and write it to |bucket_bounds|. Since
+      // only LMS suffixes where inserted in |suffix_array| during Step 1,
+      // |bucket_bounds| does not contains the head of each bucket and needs to
+      // be updated.
+      bucket_bounds[0] = 0;
+      std::partial_sum(buckets.begin(), buckets.end() - 1,
+                       bucket_bounds.begin() + 1);
+
+      // From Step 1, the sentinel $, which we treat implicitly, would have
+      // been placed at the beginning of |suffix_array|, since $ is always
+      // considered as the smallest character. We then have to deal with the
+      // previous (last) suffix.
+      if (sl_partition[length - 1] == LType) {
+        key_type key = str[length - 1];
+        suffix_array[bucket_bounds[key]++] = length - 1;
+      }
+      for (auto it = suffix_array; it != suffix_array + length; ++it) {
+        size_type suffix_index = *it;
+
+        // While the original algorithm marks unset suffixes with -1,
+        // we found that marking them with |length| is also possible and more
+        // convenient because we are working with unsigned integers.
+        if (suffix_index != length && suffix_index > 0 &&
+            sl_partition[--suffix_index] == LType) {
+          key_type key = str[suffix_index];
+          suffix_array[bucket_bounds[key]++] = suffix_index;
+        }
+      }
+
+      // Step 3
+      // Scan backward |suffix_array|; for each modified suf(S, i) for which
+      // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
+      // end of the corresponding bucket and forward the bucket head to the
+      // left.
+
+      // Find the end of each bucket and write it to |bucket_bounds|. Since
+      // only L-type suffixes where inserted in |suffix_array| during Step 2,
+      // |bucket_bounds| does not contain the end of each bucket and needs to
+      // be updated.
+      std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
+
+      for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
+           it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
+        size_type suffix_index = *it;
+        if (suffix_index != length && suffix_index > 0 &&
+            sl_partition[--suffix_index] == SType) {
+          key_type key = str[suffix_index];
+          suffix_array[--bucket_bounds[key]] = suffix_index;
+        }
+      }
+      // Deals with the last suffix, because of the sentinel.
+      if (sl_partition[length - 1] == SType) {
+        key_type key = str[length - 1];
+        suffix_array[--bucket_bounds[key]] = length - 1;
+      }
+    }
+
+    // Given a string S starting at |str| with length |length|, an array
+    // starting at |substring_array| containing lexicographically ordered LMS
+    // terminated substring indices of S and an SL-Type partition |sl_partition|
+    // of S, assigns a unique label to every unique LMS substring. The sorted
+    // labels for all LMS substrings are written to |lms_str|, while the indices
+    // of LMS suffixes are written to |lms_indices|. In addition, returns the
+    // total number of unique labels.
+    template <class StrIt, class SAIt>
+    static size_type LabelLmsSubstrings(StrIt str,
+                                        size_type length,
+                                        const std::vector<SLType>& sl_partition,
+                                        SAIt suffix_array,
+                                        iterator lms_indices,
+                                        iterator lms_str) {
+      // Labelling starts at 0.
+      size_type label = 0;
+
+      // |previous_lms| is initialized to 0 to indicate it is unset.
+      // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
+      // lexicographical order.
+      size_type previous_lms = 0;
+      for (auto it = suffix_array; it != suffix_array + length; ++it) {
+        if (*it > 0 && sl_partition[*it] == SType &&
+            sl_partition[*it - 1] == LType) {
+          // suf(S, *it) is a LMS suffix.
+
+          size_type current_lms = *it;
+          if (previous_lms != 0) {
+            // There was a previous LMS suffix. Check if the current LMS
+            // substring is equal to the previous one.
+            SLType current_lms_type = SType;
+            SLType previous_lms_type = SType;
+            for (size_type k = 0;; ++k) {
+              // |current_lms_end| and |previous_lms_end| denote whether we have
+              // reached the end of the current and previous LMS substring,
+              // respectively
+              bool current_lms_end = false;
+              bool previous_lms_end = false;
+
+              // Check for both previous and current substring ends.
+              // Note that it is more convenient to check if
+              // suf(S,current_lms + k) is an LMS suffix than to retrieve it
+              // from lms_indices.
+              if (current_lms + k >= length ||
+                  (current_lms_type == LType &&
+                   sl_partition[current_lms + k] == SType)) {
+                current_lms_end = true;
+              }
+              if (previous_lms + k >= length ||
+                  (previous_lms_type == LType &&
+                   sl_partition[previous_lms + k] == SType)) {
+                previous_lms_end = true;
+              }
+
+              if (current_lms_end && previous_lms_end) {
+                break;  // Previous and current substrings are identical.
+              } else if (current_lms_end != previous_lms_end ||
+                         str[current_lms + k] != str[previous_lms + k]) {
+                // Previous and current substrings differ, a new label is used.
+                ++label;
+                break;
+              }
+
+              current_lms_type = sl_partition[current_lms + k];
+              previous_lms_type = sl_partition[previous_lms + k];
+            }
+          }
+          *lms_indices++ = *it;
+          *lms_str++ = label;
+          previous_lms = current_lms;
+        }
+      }
+
+      return label + 1;
+    }
+
+    // Implementation of the SA-IS algorithm. |str| must be a random access
+    // iterator pointing at the beginning of S with length |length|. The result
+    // is writtend in |suffix_array|, a random access iterator.
+    template <class StrIt, class SAIt>
+    static void SuffixSort(StrIt str,
+                           size_type length,
+                           key_type key_bound,
+                           SAIt suffix_array) {
+      if (length == 1)
+        *suffix_array = 0;
+      if (length < 2)
+        return;
+
+      std::vector<SLType> sl_partition(length);
+      size_type lms_count =
+          BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
+      std::vector<size_type> lms_indices(lms_count);
+      FindLmsSuffixes(sl_partition, lms_indices.begin());
+      std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
+
+      if (lms_indices.size() > 1) {
+        // Given |lms_indices| in the same order they appear in |str|, induce
+        // LMS substrings relative order and write result to |suffix_array|.
+        InducedSort(str, length, sl_partition, lms_indices, buckets,
+                    suffix_array);
+        std::vector<size_type> lms_str(lms_indices.size());
+
+        // Given LMS substrings in relative order found in |suffix_array|,
+        // map LMS substrings to unique labels to form a new string, |lms_str|.
+        size_type label_count =
+            LabelLmsSubstrings(str, length, sl_partition, suffix_array,
+                               lms_indices.begin(), lms_str.begin());
+
+        if (label_count < lms_str.size()) {
+          // Reorder |lms_str| to have LMS suffixes in the same order they
+          // appear in |str|.
+          for (size_type i = 0; i < lms_indices.size(); ++i)
+            suffix_array[lms_indices[i]] = lms_str[i];
+
+          SLType previous_type = SType;
+          for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
+            if (sl_partition[i] == SType && previous_type == LType) {
+              lms_str[j] = suffix_array[i];
+              lms_indices[j++] = i;
+            }
+            previous_type = sl_partition[i];
+          }
+
+          // Recursively apply SuffixSort on |lms_str|, which is formed from
+          // labeled LMS suffixes in the same order they appear in |str|.
+          // Note that |KeyType| will be size_type because |lms_str| contains
+          // indices. |lms_str| is at most half the length of |str|.
+          Implementation<size_type, size_type>::SuffixSort(
+              lms_str.begin(), static_cast<size_type>(lms_str.size()),
+              label_count, suffix_array);
+
+          // Map LMS labels back to indices in |str| and write result to
+          // |lms_indices|. We're using |suffix_array| as a temporary buffer.
+          for (size_type i = 0; i < lms_indices.size(); ++i)
+            suffix_array[i] = lms_indices[suffix_array[i]];
+          std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());
+
+          // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
+        }
+      }
+      // Given |lms_indices| where LMS suffixes are sorted, induce the full
+      // order of suffixes in |str|.
+      InducedSort(str, length, sl_partition, lms_indices, buckets,
+                  suffix_array);
+    }
+
+   private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Implementation);
+  };
+};
+
+// Generates a sorted suffix array for the input string |str| using the functor
+// |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
+/// Characters found in |str| are assumed to be in range [0, |key_bound|).
+// Returns the suffix array as a vector.
+// |StrRng| is an input random access range.
+// |KeyType| is an unsigned integer type.
+template <class Algorithm, class StrRng, class KeyType>
+std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
+                                                        KeyType key_bound) {
+  Algorithm sort;
+  std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
+  sort(str, key_bound, suffix_array.begin());
+  return suffix_array;
+}
+
+// Type requirements:
+// |SARng| is an input random access range.
+// |StrIt1| is a random access iterator.
+// |StrIt2| is a forward iterator.
+template <class SARng, class StrIt1, class StrIt2>
+// Lexicographical lower bound using binary search for
+// [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
+// starting at |str1_first|. This does not necessarily return the index of
+// the longest matching substring.
+auto SuffixLowerBound(const SARng& suffix_array,
+                      StrIt1 str1_first,
+                      StrIt2 str2_first,
+                      StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
+  using size_type = typename SARng::value_type;
+
+  size_t n = std::end(suffix_array) - std::begin(suffix_array);
+  auto it = std::lower_bound(
+      std::begin(suffix_array), std::end(suffix_array), str2_first,
+      [str1_first, str2_last, n](size_type a, StrIt2 b) {
+        return std::lexicographical_compare(str1_first + a, str1_first + n, b,
+                                            str2_last);
+      });
+  return it;
+}
+
+}  // namespace zucchini
+
+#endif  // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
author	Samuel Huang <huangs@chromium.org>	2018-03-13 18:19:34 +0000
committer	Edward Lesmes <ehmaldonado@google.com>	2021-07-23 21:50:59 +0000
commit	06f1ae9aaca969ee95ef840f22b6b461c304542d (patch)
tree	f1e5c6624e70628e81fbf38d6cd14b974abe5d93 /suffix_array.h
download	zucchini-06f1ae9aaca969ee95ef840f22b6b461c304542d.tar.gz