aboutsummaryrefslogtreecommitdiff
path: root/icing/index/main/main-index.h
blob: abb0418a3c4c48a04309553fa2a3bf1d1552cb26 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
#define ICING_INDEX_MAIN_MAIN_INDEX_H_

#include <memory>

#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/filesystem.h"
#include "icing/index/lite/term-id-hit-pair.h"
#include "icing/index/main/flash-index-storage.h"
#include "icing/index/main/posting-list-accessor.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/proto/debug.pb.h"
#include "icing/proto/storage.pb.h"
#include "icing/store/namespace-checker.h"
#include "icing/store/namespace-id.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

class MainIndex {
 public:
  // RETURNS:
  //  - valid instance of MainIndex, on success.
  //  - INTERNAL error if unable to create the lexicon or flash storage.
  static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create(
      const std::string& index_directory, const Filesystem* filesystem,
      const IcingFilesystem* icing_filesystem);

  // Get a PostingListAccessor that holds the posting list chain for 'term'.
  //
  // RETURNS:
  //  - On success, a valid PostingListAccessor
  //  - NOT_FOUND if term is not present in the main index.
  libtextclassifier3::StatusOr<std::unique_ptr<PostingListAccessor>>
  GetAccessorForExactTerm(const std::string& term);

  // Get a PostingListAccessor for 'prefix'.
  //
  // RETURNS:
  //  - On success, a result containing a valid PostingListAccessor.
  //  - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
  //    prefix are present in the main index.
  struct GetPrefixAccessorResult {
    // A PostingListAccessor that holds the posting list chain for the term
    // that best represents 'prefix' in the main index.
    std::unique_ptr<PostingListAccessor> accessor;
    // True if the returned posting list chain is for 'prefix' or false if the
    // returned posting list chain is for a term for which 'prefix' is a prefix.
    bool exact;
  };
  libtextclassifier3::StatusOr<GetPrefixAccessorResult>
  GetAccessorForPrefixTerm(const std::string& prefix);

  // Finds terms with the given prefix in the given namespaces. If
  // 'namespace_ids' is empty, returns results from all the namespaces. The
  // input prefix must be normalized, otherwise inaccurate results may be
  // returned. If term_match_type is EXACT, only exact hit will be counted and
  // it is PREFIX, both prefix and exact hits will be counted. Results are not
  // sorted specifically and are in lexigraphical order. Number of results are
  // no more than 'num_to_return'.
  //
  // Returns:
  //   A list of TermMetadata on success
  //   INTERNAL_ERROR if failed to access term data.
  libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
      const std::string& prefix, TermMatchType::Code term_match_type,
      const NamespaceChecker* namespace_checker);

  struct LexiconMergeOutputs {
    // Maps from main_lexicon tvi for new branching point to the main_lexicon
    // tvi for posting list whose hits must be backfilled.
    std::unordered_map<uint32_t, uint32_t> backfill_map;

    // Maps from lexicon tvis to main_lexicon tvis.
    std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;

    // Maps from main lexicon tvi to the block index. Tvis with no entry do not
    // have an allocated posting list.
    std::unordered_map<uint32_t, int> main_tvi_to_block_index;

    // Maps from the lexicon tvi to the beginning position in
    // prefix_tvis_buf and the length.
    std::unordered_map<uint32_t, std::pair<int, int>>
        other_tvi_to_prefix_main_tvis;

    // Stores tvis that are mapped to by other_tvi_to_prefix_tvis.
    std::vector<uint32_t> prefix_tvis_buf;
  };

  // Merge the lexicon into the main lexicon and populate the data
  // structures necessary to translate lite tvis to main tvis, track backfilling
  // and expanding lite terms to prefix terms.
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
      const IcingDynamicTrie& other_lexicon) {
    // Backfill branch points need to be added first so that the backfill_map
    // can be correctly populated.
    ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
                           AddBackfillBranchPoints(other_lexicon));
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddTerms(other_lexicon, std::move(outputs)));
    // Non-backfill branch points need to be added last so that the mapping of
    // newly added terms to prefix terms can be correctly populated (prefix
    // terms might be branch points between two new terms or between a
    // pre-existing term and a new term).
    ICING_ASSIGN_OR_RETURN(outputs,
                           AddBranchPoints(other_lexicon, std::move(outputs)));
    return outputs;
  }

  // Add hits to the main index and backfill from existing posting lists to new
  // backfill branch points.
  //
  // The backfill_map maps from main_lexicon tvi for a newly added branching
  // point to the main_lexicon tvi for the posting list whose hits must be
  // backfilled. backfill_map should be populated as part of LexiconMergeOutputs
  // in MergeLexicon and be blindly passed to this function.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
  //  exceeds the max TermId, is not valid or is not less than pre-existing hits
  //  in the main index.
  //  - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
  //  - RESOURCE_EXHAUSTED error if unable to grow the index
  libtextclassifier3::Status AddHits(
      const TermIdCodec& term_id_codec,
      std::unordered_map<uint32_t, uint32_t>&& backfill_map,
      std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id);

  libtextclassifier3::Status PersistToDisk() {
    if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) {
      return libtextclassifier3::Status::OK;
    }
    return absl_ports::InternalError("Unable to sync lite index components.");
  }

  DocumentId last_added_document_id() const {
    return flash_index_storage_->get_last_indexed_docid();
  }

  libtextclassifier3::Status Reset() {
    ICING_RETURN_IF_ERROR(flash_index_storage_->Reset());
    main_lexicon_->Clear();
    return libtextclassifier3::Status::OK;
  }

  void Warm() { main_lexicon_->Warm(); }

  // Returns:
  //  - elements size of lexicon and index, on success
  //  - INTERNAL on IO error
  libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;

  // Takes the provided storage_info, populates the fields related to the main
  // index and returns that storage_info.
  //
  // If an IO error occurs while trying to calculate the value for a field, then
  // that field will be set to -1.
  IndexStorageInfoProto GetStorageInfo(
      IndexStorageInfoProto storage_info) const;

  // Returns debug information for the main index in out.
  // verbosity <= 0, simplest debug information - just the lexicon
  // verbosity > 0, more detailed debug information including raw postings
  //                lists.
  IndexDebugInfoProto::MainIndexDebugInfoProto GetDebugInfo(
      int verbosity) const;

 private:
  libtextclassifier3::Status Init(const std::string& index_directory,
                                  const Filesystem* filesystem,
                                  const IcingFilesystem* icing_filesystem);

  // Helpers for merging the lexicon
  // Add all 'backfill' branch points. Backfill branch points are prefix
  // branch points that are a prefix of terms that existed in the lexicon
  // to the merge.
  //
  // For example, if the main lexicon only contains "foot" and is then merged
  // with a lite lexicon containing only "fool", then a backfill branch point
  // for "foo" will be added to contain prefix hits from both the pre-existing
  // posting list for "foot" and the new posting list for "fool".
  //
  // Populates LexiconMergeOutputs.backfill_map
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
      const IcingDynamicTrie& other_lexicon);

  // Add all terms from the lexicon.
  //
  // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Add all branch points for terms added from the lexicon.
  // For example, if the main lexicon is empty and is then merged with a
  // lexicon containing only "foot" and "fool", then a branch point for "foo"
  // will be added to contain prefix hits from both "foot" and "fool".
  //
  // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
  // LexiconMergeOutputs.prefix_tvis_buf;
  //
  // RETURNS:
  //   - OK on success
  //   - INTERNAL on IO error while writing to the main lexicon.
  libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
      const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);

  // Copies all properties from old_tvi in the other lexicon to the new_tvi in
  // the main lexicon.
  // Returns true on success, false if an IO error is encountered.
  bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
                      const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
                      uint32_t new_main_tvi);

  // Add all hits between [hit_elements, hit_elements + len) to main_index,
  // updating the entry in the main lexicon at trie_value_index to point to the
  // resulting posting list. Hits are sorted in descending document id order, so
  // they should be to posting lists in reverse (starting at hit_elements
  // + len - 1) and working backwards. Therefore, hit_elements must be in sorted
  // order.
  //
  // trie_value_index may point to a valid posting list id if there is a
  // pre-existing posting list to append to.
  //
  // If backfill_posting_list_id is valid, then the hits from the posting list
  // identified by backfill_posting_list_id should be added to the new posting
  // list before the hits in hit_elements.
  //
  // RETURNS:
  //  - OK on success
  //  - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid
  //  but points out of bounds in the IndexBlock referred to by
  //  id.block_index(), if one of the hits from [hit_elements,hit_elements+len)
  //  is not valid, or if one of the hits from [hit_elements,hit_elements+len)
  //  is not less than the previously added hits.
  //  - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid
  //  but points to an invalid block index or if unable to mmap the IndexBlock.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //  posting list.
  libtextclassifier3::Status AddHitsForTerm(
      uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
      const TermIdHitPair* hit_elements, size_t len);

  // Adds all prefix hits or hits from prefix sections present on the posting
  // list identified by backfill_posting_list_id to hit_accum.
  //
  // RETURNS:
  //  - OK, on success
  //  - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the
  //  IndexBlock referred to by id.block_index()
  //  - INTERNAL_ERROR if unable to mmap the block identified by
  //  backfill_posting_list_id or if the posting list identified by
  //  backfill_posting_list_id has been corrupted.
  //  - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
  //  posting list.
  libtextclassifier3::Status AddPrefixBackfillHits(
      PostingListIdentifier backfill_posting_list_id,
      PostingListAccessor* hit_accum);

  std::unique_ptr<FlashIndexStorage> flash_index_storage_;
  std::unique_ptr<IcingDynamicTrie> main_lexicon_;
};

}  // namespace lib
}  // namespace icing

#endif  // ICING_INDEX_MAIN_MAIN_INDEX_H_