summaryrefslogtreecommitdiff
path: root/suffix_array_index.cc
blob: 710249c17b19745630aa5125fde4e0cd708b1d7a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "bsdiff/suffix_array_index.h"

#include <algorithm>
#include <limits>
#include <vector>

#include <divsufsort.h>
#include <divsufsort64.h>

#include "bsdiff/logging.h"

namespace {

// libdivsufsort C++ overloaded functions used to allow calling the right
// implementation based on the pointer size.
int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
  return divsufsort(text, sa, n);
}
int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
  return divsufsort64(text, sa, n);
}

saidx_t CallSaSearch(const uint8_t* text,
                     size_t text_size,
                     const uint8_t* pattern,
                     size_t pattern_size,
                     const saidx_t* sa,
                     size_t sa_size,
                     saidx_t* left) {
  return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
}
saidx64_t CallSaSearch(const uint8_t* text,
                       size_t text_size,
                       const uint8_t* pattern,
                       size_t pattern_size,
                       const saidx64_t* sa,
                       size_t sa_size,
                       saidx64_t* left) {
  return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
}

}  // namespace

namespace bsdiff {

// The SAIDX template type must be either saidx_t or saidx64_t, which will
// depend on the maximum size of the input text needed.
template <typename SAIDX>
class SuffixArrayIndex : public SuffixArrayIndexInterface {
 public:
  SuffixArrayIndex() = default;

  // Initialize and construct the suffix array of the |text| string of length
  // |n|. The memory pointed by |text| must be kept alive. Returns whether the
  // construction succeeded.
  bool Init(const uint8_t* text, size_t n);

  // SuffixArrayIndexInterface overrides.
  void SearchPrefix(const uint8_t* target,
                    size_t length,
                    size_t* out_length,
                    uint64_t* out_pos) const override;

 private:
  const uint8_t* text_{nullptr};  // Owned by the caller.
  size_t n_{0};

  std::vector<SAIDX> sa_;
};

template <typename SAIDX>
bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
  if (!sa_.empty()) {
    // Already initialized.
    LOG(ERROR) << "SuffixArray already initialized";
    return false;
  }
  if (static_cast<uint64_t>(n) >
      static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
    LOG(ERROR) << "Input too big (" << n << ") for this implementation";
    return false;
  }
  text_ = text;
  n_ = n;
  sa_.resize(n + 1);

  if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
    LOG(ERROR) << "divsufsrot() failed";
    return false;
  }

  return true;
}

template <typename SAIDX>
void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
                                           size_t length,
                                           size_t* out_length,
                                           uint64_t* out_pos) const {
  SAIDX suf_left;
  SAIDX count =
      CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
  if (count > 0) {
    // This is the simple case where we found the whole |target| string was
    // found.
    *out_pos = sa_[suf_left];
    *out_length = length;
    return;
  }
  // In this case, |suf_left| points to the first suffix array position such
  // that the suffix at that position is lexicographically larger than |target|.
  // We only need to check whether the previous entry or the current entry is a
  // longer match.
  size_t prev_suffix_len = 0;
  if (suf_left > 0) {
    const size_t prev_max_len =
        std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
    const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
    prev_suffix_len =
        std::mismatch(target, target + prev_max_len, prev_suffix).first -
        target;
  }

  size_t next_suffix_len = 0;
  if (static_cast<size_t>(suf_left) < n_) {
    const uint8_t* next_suffix = text_ + sa_[suf_left];
    const size_t next_max_len =
        std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
    next_suffix_len =
        std::mismatch(target, target + next_max_len, next_suffix).first -
        target;
  }

  *out_length = std::max(next_suffix_len, prev_suffix_len);
  if (!*out_length) {
    *out_pos = 0;
  } else if (next_suffix_len >= prev_suffix_len) {
    *out_pos = sa_[suf_left];
  } else {
    *out_pos = sa_[suf_left - 1];
  }
}

std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
    const uint8_t* text,
    size_t n) {
  // The maximum supported size when using the suffix array based on the 32-bit
  // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
  // than the maximum representable number so references like "n + 1" are don't
  // overflow.
  const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
  std::unique_ptr<SuffixArrayIndexInterface> ret;

  if (n > kMaxSaidxSize) {
    SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
    ret.reset(sa_ptr);
    if (!sa_ptr->Init(text, n))
      return nullptr;
  } else {
    SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
    ret.reset(sa_ptr);
    if (!sa_ptr->Init(text, n))
      return nullptr;
  }
  return ret;
}


}  // namespace bsdiff