aboutsummaryrefslogtreecommitdiff
path: root/suffix_array.h
blob: 75b3a38e4eba957b5f4ed047ead19c6d665b321b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
#define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_

#include <algorithm>
#include <iterator>
#include <numeric>
#include <vector>

#include "base/check.h"

namespace zucchini {

// A functor class that implements the naive suffix sorting algorithm that uses
// std::sort with lexicographical compare. This is only meant as reference of
// the interface.
class NaiveSuffixSort {
 public:
  // Type requirements:
  // |InputRng| is an input random access range.
  // |KeyType| is an unsigned integer type.
  // |SAIt| is a random access iterator with mutable references.
  template <class InputRng, class KeyType, class SAIt>
  // |str| is the input string on which suffix sort is applied.
  // Characters found in |str| must be in the range [0, |key_bound|)
  // |suffix_array| is the beginning of the destination range, which is at least
  // as large as |str|.
  void operator()(const InputRng& str,
                  KeyType key_bound,
                  SAIt suffix_array) const {
    using size_type = typename SAIt::value_type;

    size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

    // |suffix_array| is first filled with ordered indices of |str|.
    // Those indices are then sorted with lexicographical comparisons in |str|.
    std::iota(suffix_array, suffix_array + n, 0);
    std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
      return std::lexicographical_compare(std::begin(str) + i, std::end(str),
                                          std::begin(str) + j, std::end(str));
    });
  }
};

// A functor class that implements suffix array induced sorting (SA-IS)
// algorithm with linear time and memory complexity,
// see http://ieeexplore.ieee.org/abstract/document/5582081/
class InducedSuffixSort {
 public:
  // Type requirements:
  // |InputRng| is an input random access range.
  // |KeyType| is an unsigned integer type.
  // |SAIt| is a random access iterator with mutable values.
  template <class InputRng, class KeyType, class SAIt>
  // |str| is the input string on which suffix sort is applied.
  // Characters found in |str| must be in the range [0, |key_bound|)
  // |suffix_array| is the beginning of the destination range, which is at least
  // as large as |str|.
  void operator()(const InputRng& str,
                  KeyType key_bound,
                  SAIt suffix_array) const {
    using value_type = typename InputRng::value_type;
    using size_type = typename SAIt::value_type;

    static_assert(std::is_unsigned<value_type>::value,
                  "SA-IS only supports input string with unsigned values");
    static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

    size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

    Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
                                                   key_bound, suffix_array);
  }

  // Given string S of length n. We assume S is terminated by a unique sentinel
  // $, which is considered as the smallest character. This sentinel does not
  // exist in memory and is only treated implicitly, hence |n| does not count
  // the sentinel in this implementation. We denote suf(S,i) the suffix formed
  // by S[i..n).

  // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
  // or suf(S,i) > suf(S,i+1), respectively.
  enum SLType : bool { SType, LType };

  // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
  // S-type or L-type, respectively.

  // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
  // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
  // character.

  // A substring S[i..j) is an LMS-substring if
  // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
  //     LMS characters, or
  // (2) S[i..j) is the sentinel $.

  template <class SizeType, class KeyType>
  struct Implementation {
    static_assert(std::is_unsigned<SizeType>::value,
                  "SizeType must be unsigned");
    static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
    using size_type = SizeType;
    using key_type = KeyType;

    using iterator = typename std::vector<size_type>::iterator;
    using const_iterator = typename std::vector<size_type>::const_iterator;

    // Partition every suffix based on SL-type. Returns the number of LMS
    // suffixes.
    template <class StrIt>
    static size_type BuildSLPartition(
        StrIt str,
        size_type length,
        key_type key_bound,
        std::vector<SLType>::reverse_iterator sl_partition_it) {
      // We will count LMS suffixes (S to L-type or last S-type).
      size_type lms_count = 0;

      // |previous_type| is initialized to L-type to avoid counting an extra
      // LMS suffix at the end
      SLType previous_type = LType;

      // Initialized to dummy, impossible key.
      key_type previous_key = key_bound;

      // We're travelling backward to determine the partition,
      // as if we prepend one character at a time to the string, ex:
      // b$ is L-type because b > $.
      // ab$ is S-type because a < b, implying ab$ < b$.
      // bab$ is L-type because b > a, implying bab$ > ab$.
      // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
      for (auto str_it = std::reverse_iterator<StrIt>(str + length);
           str_it != std::reverse_iterator<StrIt>(str);
           ++str_it, ++sl_partition_it) {
        key_type current_key = *str_it;

        if (current_key > previous_key || previous_key == key_bound) {
          // S[i] > S[i + 1] or S[i] is last character.
          if (previous_type == SType)
            // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
            // suf(S,i+1) was a LMS suffix.
            ++lms_count;

          previous_type = LType;  // For next round.
        } else if (current_key < previous_key) {
          // S[i] < S[i + 1]
          previous_type = SType;  // For next round.
        }
        // Else, S[i] == S[i + 1]:
        // The next character that differs determines the SL-type,
        // so we reuse the last seen type.

        *sl_partition_it = previous_type;
        previous_key = current_key;  // For next round.
      }

      return lms_count;
    }

    // Find indices of LMS suffixes and write result to |lms_indices|.
    static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
                                iterator lms_indices) {
      // |previous_type| is initialized to S-type to avoid counting an extra
      // LMS suffix at the beginning
      SLType previous_type = SType;
      for (size_type i = 0; i < sl_partition.size(); ++i) {
        if (sl_partition[i] == SType && previous_type == LType)
          *lms_indices++ = i;
        previous_type = sl_partition[i];
      }
    }

    template <class StrIt>
    static std::vector<size_type> MakeBucketCount(StrIt str,
                                                  size_type length,
                                                  key_type key_bound) {
      // Occurrence of every unique character is counted in |buckets|
      std::vector<size_type> buckets(static_cast<size_type>(key_bound));

      for (auto it = str; it != str + length; ++it)
        ++buckets[*it];
      return buckets;
    }

    // Apply induced sort from |lms_indices| to |suffix_array| associated with
    // the string |str|.
    template <class StrIt, class SAIt>
    static void InducedSort(StrIt str,
                            size_type length,
                            const std::vector<SLType>& sl_partition,
                            const std::vector<size_type>& lms_indices,
                            const std::vector<size_type>& buckets,
                            SAIt suffix_array) {
      // All indices are first marked as unset with the illegal value |length|.
      std::fill(suffix_array, suffix_array + length, length);

      // Used to mark bucket boundaries (head or end) as indices in str.
      DCHECK(!buckets.empty());
      std::vector<size_type> bucket_bounds(buckets.size());

      // Step 1: Assign indices for LMS suffixes, populating the end of
      // respective buckets but keeping relative order.

      // Find the end of each bucket and write it to |bucket_bounds|.
      std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

      // Process each |lms_indices| backward, and assign them to the end of
      // their respective buckets, so relative order is preserved.
      for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
        key_type key = str[*it];
        suffix_array[--bucket_bounds[key]] = *it;
      }

      // Step 2
      // Scan forward |suffix_array|; for each modified suf(S,i) for which
      // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
      // head of the corresponding bucket and forward the bucket head to the
      // right.

      // Find the head of each bucket and write it to |bucket_bounds|. Since
      // only LMS suffixes where inserted in |suffix_array| during Step 1,
      // |bucket_bounds| does not contains the head of each bucket and needs to
      // be updated.
      bucket_bounds[0] = 0;
      std::partial_sum(buckets.begin(), buckets.end() - 1,
                       bucket_bounds.begin() + 1);

      // From Step 1, the sentinel $, which we treat implicitly, would have
      // been placed at the beginning of |suffix_array|, since $ is always
      // considered as the smallest character. We then have to deal with the
      // previous (last) suffix.
      if (sl_partition[length - 1] == LType) {
        key_type key = str[length - 1];
        suffix_array[bucket_bounds[key]++] = length - 1;
      }
      for (auto it = suffix_array; it != suffix_array + length; ++it) {
        size_type suffix_index = *it;

        // While the original algorithm marks unset suffixes with -1,
        // we found that marking them with |length| is also possible and more
        // convenient because we are working with unsigned integers.
        if (suffix_index != length && suffix_index > 0 &&
            sl_partition[--suffix_index] == LType) {
          key_type key = str[suffix_index];
          suffix_array[bucket_bounds[key]++] = suffix_index;
        }
      }

      // Step 3
      // Scan backward |suffix_array|; for each modified suf(S, i) for which
      // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
      // end of the corresponding bucket and forward the bucket head to the
      // left.

      // Find the end of each bucket and write it to |bucket_bounds|. Since
      // only L-type suffixes where inserted in |suffix_array| during Step 2,
      // |bucket_bounds| does not contain the end of each bucket and needs to
      // be updated.
      std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

      for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
           it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
        size_type suffix_index = *it;
        if (suffix_index != length && suffix_index > 0 &&
            sl_partition[--suffix_index] == SType) {
          key_type key = str[suffix_index];
          suffix_array[--bucket_bounds[key]] = suffix_index;
        }
      }
      // Deals with the last suffix, because of the sentinel.
      if (sl_partition[length - 1] == SType) {
        key_type key = str[length - 1];
        suffix_array[--bucket_bounds[key]] = length - 1;
      }
    }

    // Given a string S starting at |str| with length |length|, an array
    // starting at |substring_array| containing lexicographically ordered LMS
    // terminated substring indices of S and an SL-Type partition |sl_partition|
    // of S, assigns a unique label to every unique LMS substring. The sorted
    // labels for all LMS substrings are written to |lms_str|, while the indices
    // of LMS suffixes are written to |lms_indices|. In addition, returns the
    // total number of unique labels.
    template <class StrIt, class SAIt>
    static size_type LabelLmsSubstrings(StrIt str,
                                        size_type length,
                                        const std::vector<SLType>& sl_partition,
                                        SAIt suffix_array,
                                        iterator lms_indices,
                                        iterator lms_str) {
      // Labelling starts at 0.
      size_type label = 0;

      // |previous_lms| is initialized to 0 to indicate it is unset.
      // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
      // lexicographical order.
      size_type previous_lms = 0;
      for (auto it = suffix_array; it != suffix_array + length; ++it) {
        if (*it > 0 && sl_partition[*it] == SType &&
            sl_partition[*it - 1] == LType) {
          // suf(S, *it) is a LMS suffix.

          size_type current_lms = *it;
          if (previous_lms != 0) {
            // There was a previous LMS suffix. Check if the current LMS
            // substring is equal to the previous one.
            SLType current_lms_type = SType;
            SLType previous_lms_type = SType;
            for (size_type k = 0;; ++k) {
              // |current_lms_end| and |previous_lms_end| denote whether we have
              // reached the end of the current and previous LMS substring,
              // respectively
              bool current_lms_end = false;
              bool previous_lms_end = false;

              // Check for both previous and current substring ends.
              // Note that it is more convenient to check if
              // suf(S,current_lms + k) is an LMS suffix than to retrieve it
              // from lms_indices.
              if (current_lms + k >= length ||
                  (current_lms_type == LType &&
                   sl_partition[current_lms + k] == SType)) {
                current_lms_end = true;
              }
              if (previous_lms + k >= length ||
                  (previous_lms_type == LType &&
                   sl_partition[previous_lms + k] == SType)) {
                previous_lms_end = true;
              }

              if (current_lms_end && previous_lms_end) {
                break;  // Previous and current substrings are identical.
              } else if (current_lms_end != previous_lms_end ||
                         str[current_lms + k] != str[previous_lms + k]) {
                // Previous and current substrings differ, a new label is used.
                ++label;
                break;
              }

              current_lms_type = sl_partition[current_lms + k];
              previous_lms_type = sl_partition[previous_lms + k];
            }
          }
          *lms_indices++ = *it;
          *lms_str++ = label;
          previous_lms = current_lms;
        }
      }

      return label + 1;
    }

    // Implementation of the SA-IS algorithm. |str| must be a random access
    // iterator pointing at the beginning of S with length |length|. The result
    // is writtend in |suffix_array|, a random access iterator.
    template <class StrIt, class SAIt>
    static void SuffixSort(StrIt str,
                           size_type length,
                           key_type key_bound,
                           SAIt suffix_array) {
      if (length == 1)
        *suffix_array = 0;
      if (length < 2)
        return;

      std::vector<SLType> sl_partition(length);
      size_type lms_count =
          BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
      std::vector<size_type> lms_indices(lms_count);
      FindLmsSuffixes(sl_partition, lms_indices.begin());
      std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);

      if (lms_indices.size() > 1) {
        // Given |lms_indices| in the same order they appear in |str|, induce
        // LMS substrings relative order and write result to |suffix_array|.
        InducedSort(str, length, sl_partition, lms_indices, buckets,
                    suffix_array);
        std::vector<size_type> lms_str(lms_indices.size());

        // Given LMS substrings in relative order found in |suffix_array|,
        // map LMS substrings to unique labels to form a new string, |lms_str|.
        size_type label_count =
            LabelLmsSubstrings(str, length, sl_partition, suffix_array,
                               lms_indices.begin(), lms_str.begin());

        if (label_count < lms_str.size()) {
          // Reorder |lms_str| to have LMS suffixes in the same order they
          // appear in |str|.
          for (size_type i = 0; i < lms_indices.size(); ++i)
            suffix_array[lms_indices[i]] = lms_str[i];

          SLType previous_type = SType;
          for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
            if (sl_partition[i] == SType && previous_type == LType) {
              lms_str[j] = suffix_array[i];
              lms_indices[j++] = i;
            }
            previous_type = sl_partition[i];
          }

          // Recursively apply SuffixSort on |lms_str|, which is formed from
          // labeled LMS suffixes in the same order they appear in |str|.
          // Note that |KeyType| will be size_type because |lms_str| contains
          // indices. |lms_str| is at most half the length of |str|.
          Implementation<size_type, size_type>::SuffixSort(
              lms_str.begin(), static_cast<size_type>(lms_str.size()),
              label_count, suffix_array);

          // Map LMS labels back to indices in |str| and write result to
          // |lms_indices|. We're using |suffix_array| as a temporary buffer.
          for (size_type i = 0; i < lms_indices.size(); ++i)
            suffix_array[i] = lms_indices[suffix_array[i]];
          std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());

          // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
        }
      }
      // Given |lms_indices| where LMS suffixes are sorted, induce the full
      // order of suffixes in |str|.
      InducedSort(str, length, sl_partition, lms_indices, buckets,
                  suffix_array);
    }

    Implementation() = delete;
    Implementation(const Implementation&) = delete;
    const Implementation& operator=(const Implementation&) = delete;
  };
};

// Generates a sorted suffix array for the input string |str| using the functor
// |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
/// Characters found in |str| are assumed to be in range [0, |key_bound|).
// Returns the suffix array as a vector.
// |StrRng| is an input random access range.
// |KeyType| is an unsigned integer type.
template <class Algorithm, class StrRng, class KeyType>
std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
                                                        KeyType key_bound) {
  Algorithm sort;
  std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
  sort(str, key_bound, suffix_array.begin());
  return suffix_array;
}

// Type requirements:
// |SARng| is an input random access range.
// |StrIt1| is a random access iterator.
// |StrIt2| is a forward iterator.
template <class SARng, class StrIt1, class StrIt2>
// Lexicographical lower bound using binary search for
// [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
// starting at |str1_first|. This does not necessarily return the index of
// the longest matching substring.
auto SuffixLowerBound(const SARng& suffix_array,
                      StrIt1 str1_first,
                      StrIt2 str2_first,
                      StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
  using size_type = typename SARng::value_type;

  size_t n = std::end(suffix_array) - std::begin(suffix_array);
  auto it = std::lower_bound(
      std::begin(suffix_array), std::end(suffix_array), str2_first,
      [str1_first, str2_last, n](size_type a, StrIt2 b) {
        return std::lexicographical_compare(str1_first + a, str1_first + n, b,
                                            str2_last);
      });
  return it;
}

}  // namespace zucchini

#endif  // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_