diff options
Diffstat (limited to 'icing/index/main/posting-list-hit-serializer.cc')
-rw-r--r-- | icing/index/main/posting-list-hit-serializer.cc | 254 |
1 files changed, 171 insertions, 83 deletions
diff --git a/icing/index/main/posting-list-hit-serializer.cc b/icing/index/main/posting-list-hit-serializer.cc index e14a0c0..88c0754 100644 --- a/icing/index/main/posting-list-hit-serializer.cc +++ b/icing/index/main/posting-list-hit-serializer.cc @@ -19,8 +19,11 @@ #include <limits> #include <vector> +#include "icing/text_classifier/lib3/utils/base/status.h" +#include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/posting_list/posting-list-used.h" +#include "icing/index/hit/hit.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-bit-util.h" #include "icing/util/logging.h" @@ -35,6 +38,10 @@ uint32_t GetTermFrequencyByteSize(const Hit& hit) { return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0; } +uint32_t GetFlagsByteSize(const Hit& hit) { + return hit.has_flags() ? sizeof(Hit::Flags) : 0; +} + } // namespace uint32_t PostingListHitSerializer::GetBytesUsed( @@ -55,14 +62,45 @@ uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit( return posting_list_used->size_in_bytes(); } - // In NOT_FULL status BytesUsed contains no special hits. The minimum sized - // posting list that would be guaranteed to fit these hits would be - // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in - // special_hit(1) and the n compressed hits in the compressed region. - // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore, - // fitting these hits into a posting list would require BytesUsed plus one - // extra hit. - return GetBytesUsed(posting_list_used) + sizeof(Hit); + // Edge case: if number of hits <= 2, and posting_list_used is NOT_FULL, we + // should return kMinPostingListSize as we know that two hits is able to fit + // in the 2 special hits position of a min-sized posting list. + // - Checking this scenario requires deserializing posting_list_used, which is + // an expensive operation. However this is only possible when + // GetBytesUsed(posting_list_used) <= + // sizeof(uncompressed hit) + max(delta encoded value size) + + // (sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)), so we don't need to do + // this when the size exceeds this number. + if (GetBytesUsed(posting_list_used) <= + 2 * sizeof(Hit) + 5 - sizeof(Hit::Value)) { + // Check if we're able to get more than 2 hits from posting_list_used + std::vector<Hit> hits; + libtextclassifier3::Status status = + GetHitsInternal(posting_list_used, /*limit=*/3, /*pop=*/false, &hits); + if (status.ok() && hits.size() <= 2) { + return GetMinPostingListSize(); + } + } + + // - In NOT_FULL status, BytesUsed contains no special hits. For a posting + // list in the NOT_FULL state with n hits, we would have n-1 compressed hits + // and 1 uncompressed hit. + // - The minimum sized posting list that would be guaranteed to fit these hits + // would be FULL, but calculating the size required for the FULL posting + // list would require deserializing the last two added hits, so instead we + // will calculate the size of an ALMOST_FULL posting list to fit. + // - An ALMOST_FULL posting list would have kInvalidHit in special_hit(0), the + // full uncompressed Hit in special_hit(1), and the n-1 compressed hits in + // the compressed region. + // - Currently BytesUsed contains one uncompressed Hit and n-1 compressed + // hits. However, it's possible that the uncompressed Hit is not a full hit, + // but rather only the Hit::Value (this is the case if + // !hit.has_term_frequency()). + // - Therefore, fitting these hits into a posting list would require + // BytesUsed + one extra full hit + byte difference between a full hit and + // Hit::Value. i.e: + // ByteUsed + sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)). + return GetBytesUsed(posting_list_used) + 2 * sizeof(Hit) - sizeof(Hit::Value); } void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const { @@ -160,27 +198,37 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull( // in the padded area and put new hit at the special position 1. // Calling ValueOrDie is safe here because 1 < kNumSpecialData. Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie(); - if (cur.value() <= hit.value()) { + if (cur < hit || cur == hit) { return absl_ports::InvalidArgumentError( "Hit being prepended must be strictly less than the most recent Hit"); } - uint64_t delta = cur.value() - hit.value(); uint8_t delta_buf[VarInt::kMaxEncodedLen64]; - size_t delta_len = VarInt::Encode(delta, delta_buf); + size_t delta_len = + EncodeNextHitValue(/*next_hit_value=*/hit.value(), + /*curr_hit_value=*/cur.value(), delta_buf); + uint32_t cur_flags_bytes = GetFlagsByteSize(cur); uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur); uint32_t pad_end = GetPadEnd(posting_list_used, /*offset=*/kSpecialHitsSize); - if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) { - // Pad area has enough space for delta and term_frequency of existing hit - // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes. + if (pad_end >= kSpecialHitsSize + delta_len + cur_flags_bytes + + cur_term_frequency_bytes) { + // Pad area has enough space for delta, flags and term_frequency of existing + // hit (cur). Write delta at pad_end - delta_len - cur_flags_bytes - + // cur_term_frequency_bytes. uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end - - delta_len - cur_term_frequency_bytes; + delta_len - cur_flags_bytes - + cur_term_frequency_bytes; memcpy(delta_offset, delta_buf, delta_len); - // Now copy term_frequency. + + // Now copy flags. + Hit::Flags flags = cur.flags(); + uint8_t* flags_offset = delta_offset + delta_len; + memcpy(flags_offset, &flags, cur_flags_bytes); + // Copy term_frequency. Hit::TermFrequency term_frequency = cur.term_frequency(); - uint8_t* term_frequency_offset = delta_offset + delta_len; + uint8_t* term_frequency_offset = flags_offset + cur_flags_bytes; memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes); // Now first hit is the new hit, at special position 1. Safe to ignore the @@ -231,23 +279,36 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( return absl_ports::FailedPreconditionError( "Posting list is in an invalid state."); } + + // Retrieve the last added (cur) hit's value and flags and compare to the hit + // we're adding. Hit::Value cur_value; - memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset, - sizeof(Hit::Value)); - if (cur_value <= hit.value()) { + uint8_t* cur_value_offset = posting_list_used->posting_list_buffer() + offset; + memcpy(&cur_value, cur_value_offset, sizeof(Hit::Value)); + Hit::Flags cur_flags = Hit::kNoEnabledFlags; + if (GetFlagsByteSize(Hit(cur_value)) > 0) { + uint8_t* cur_flags_offset = cur_value_offset + sizeof(Hit::Value); + memcpy(&cur_flags, cur_flags_offset, sizeof(Hit::Flags)); + } + // Term-frequency is not used for hit comparison so it's ok to pass in the + // default term-frequency here. + Hit cur(cur_value, cur_flags, Hit::kDefaultTermFrequency); + if (cur < hit || cur == hit) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Hit %d being prepended must be strictly less than the most recent " - "Hit %d", - hit.value(), cur_value)); + "Hit (value=%d, flags=%d) being prepended must be strictly less than " + "the most recent Hit (value=%d, flags=%d)", + hit.value(), hit.flags(), cur_value, cur_flags)); } - uint64_t delta = cur_value - hit.value(); uint8_t delta_buf[VarInt::kMaxEncodedLen64]; - size_t delta_len = VarInt::Encode(delta, delta_buf); + size_t delta_len = + EncodeNextHitValue(/*next_hit_value=*/hit.value(), + /*curr_hit_value=*/cur.value(), delta_buf); + uint32_t hit_flags_bytes = GetFlagsByteSize(hit); uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit); // offset now points to one past the end of the first hit. offset += sizeof(Hit::Value); - if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + + if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + hit_flags_bytes + hit_term_frequency_bytes <= offset) { // Enough space for delta in compressed area. @@ -257,9 +318,9 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf, delta_len); - // Prepend new hit with (possibly) its term_frequency. We know that there is - // room for 'hit' because of the if statement above, so calling ValueOrDie - // is safe. + // Prepend new hit with (possibly) its flags and term_frequency. We know + // that there is room for 'hit' because of the if statement above, so + // calling ValueOrDie is safe. offset = PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie(); // offset is guaranteed to be valid here. So it's safe to ignore the return @@ -295,13 +356,13 @@ libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( // (i.e. varint delta encoding expanded required storage). We // move first hit to special position 1 and put new hit in // special position 0. - Hit cur(cur_value); - // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes. + + // Offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes. // Therefore, offset must be less than kSpecialHitSize + 5. Since posting - // list size must be divisible by sizeof(Hit) (5), it is guaranteed that + // list size must be divisible by sizeof(Hit) (6), it is guaranteed that // offset < size_in_bytes, so it is safe to ignore the return value here. - ICING_RETURN_IF_ERROR( - ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset)); + ICING_RETURN_IF_ERROR(ConsumeFlagsAndTermFrequencyIfPresent( + posting_list_used, &cur, &offset)); // Safe to ignore the return value of PadToEnd because offset must be less // than posting_list_used->size_in_bytes(). Otherwise, this function // already would have returned FAILED_PRECONDITION. @@ -463,19 +524,19 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal( offset += sizeof(Hit::Value); } else { // Now we have delta encoded subsequent hits. Decode and push. - uint64_t delta; - offset += VarInt::Decode( - posting_list_used->posting_list_buffer() + offset, &delta); - val += delta; + DecodedHitInfo decoded_hit_info = DecodeNextHitValue( + posting_list_used->posting_list_buffer() + offset, val); + offset += decoded_hit_info.encoded_size; + val = decoded_hit_info.hit_value; } Hit hit(val); libtextclassifier3::Status status = - ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset); + ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of the - // posting list claims to have a term frequency, but there's no more room - // in the posting list for that term frequency to exist. Return an empty - // vector and zero to indicate no hits retrieved. + // posting list claims to have a term frequency or flag, but there's no + // more room in the posting list for that term frequency or flag to exist. + // Return an empty vector and zero to indicate no hits retrieved. if (out != nullptr) { out->clear(); } @@ -497,10 +558,10 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal( // In the compressed area. Pop and reconstruct. offset/val is // the last traversed hit, which we must discard. So move one // more forward. - uint64_t delta; - offset += VarInt::Decode( - posting_list_used->posting_list_buffer() + offset, &delta); - val += delta; + DecodedHitInfo decoded_hit_info = DecodeNextHitValue( + posting_list_used->posting_list_buffer() + offset, val); + offset += decoded_hit_info.encoded_size; + val = decoded_hit_info.hit_value; // Now val is the first hit of the new posting list. if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) { @@ -509,17 +570,18 @@ libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal( memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val, sizeof(Hit::Value)); } else { - // val won't fit in compressed area. Also see if there is a + // val won't fit in compressed area. Also see if there is a flag or // term_frequency. Hit hit(val); libtextclassifier3::Status status = - ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset); + ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, + &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of - // the posting list claims to have a term frequency, but there's no - // more room in the posting list for that term frequency to exist. - // Return an empty vector and zero to indicate no hits retrieved. Do - // not pop anything. + // the posting list claims to have a term frequency or flag, but + // there's no more room in the posting list for that term frequency or + // flag to exist. Return an empty vector and zero to indicate no hits + // retrieved. Do not pop anything. if (out != nullptr) { out->clear(); } @@ -569,7 +631,7 @@ libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit( return absl_ports::InvalidArgumentError( "Special hits only exist at indices 0 and 1"); } - Hit val; + Hit val(Hit::kInvalidValue); memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val), sizeof(val)); return val; @@ -653,11 +715,11 @@ bool PostingListHitSerializer::SetStartByteOffset( // not_full state. Safe to ignore the return value because 0 and 1 are both // < kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset)); - SetSpecialHit(posting_list_used, /*index=*/1, Hit()); + SetSpecialHit(posting_list_used, /*index=*/1, Hit(Hit::kInvalidValue)); } else if (offset == sizeof(Hit)) { // almost_full state. Safe to ignore the return value because 1 is both < // kNumSpecialData. - SetSpecialHit(posting_list_used, /*index=*/0, Hit()); + SetSpecialHit(posting_list_used, /*index=*/0, Hit(Hit::kInvalidValue)); } // Nothing to do for the FULL state - the offset isn't actually stored // anywhere and both special hits hold valid hits. @@ -667,46 +729,72 @@ bool PostingListHitSerializer::SetStartByteOffset( libtextclassifier3::StatusOr<uint32_t> PostingListHitSerializer::PrependHitUncompressed( PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const { - if (hit.has_term_frequency()) { - if (offset < kSpecialHitsSize + sizeof(Hit)) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Not enough room to prepend Hit at offset %d.", offset)); - } - offset -= sizeof(Hit); - memcpy(posting_list_used->posting_list_buffer() + offset, &hit, - sizeof(Hit)); - } else { - if (offset < kSpecialHitsSize + sizeof(Hit::Value)) { - return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "Not enough room to prepend Hit::Value at offset %d.", offset)); - } - offset -= sizeof(Hit::Value); - Hit::Value val = hit.value(); - memcpy(posting_list_used->posting_list_buffer() + offset, &val, - sizeof(Hit::Value)); + uint32_t hit_bytes_to_prepend = sizeof(Hit::Value) + GetFlagsByteSize(hit) + + GetTermFrequencyByteSize(hit); + + if (offset < kSpecialHitsSize + hit_bytes_to_prepend) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Not enough room to prepend Hit at offset %d.", offset)); } + + if (hit.has_term_frequency()) { + offset -= sizeof(Hit::TermFrequency); + Hit::TermFrequency term_frequency = hit.term_frequency(); + memcpy(posting_list_used->posting_list_buffer() + offset, &term_frequency, + sizeof(Hit::TermFrequency)); + } + if (hit.has_flags()) { + offset -= sizeof(Hit::Flags); + Hit::Flags flags = hit.flags(); + memcpy(posting_list_used->posting_list_buffer() + offset, &flags, + sizeof(Hit::Flags)); + } + offset -= sizeof(Hit::Value); + Hit::Value val = hit.value(); + memcpy(posting_list_used->posting_list_buffer() + offset, &val, + sizeof(Hit::Value)); return offset; } libtextclassifier3::Status -PostingListHitSerializer::ConsumeTermFrequencyIfPresent( +PostingListHitSerializer::ConsumeFlagsAndTermFrequencyIfPresent( const PostingListUsed* posting_list_used, Hit* hit, uint32_t* offset) const { - if (!hit->has_term_frequency()) { - // No term frequency to consume. Everything is fine. + if (!hit->has_flags()) { + // No flags (and by extension, no term-frequency) to consume. Everything is + // fine. return libtextclassifier3::Status::OK; } - if (*offset + sizeof(Hit::TermFrequency) > - posting_list_used->size_in_bytes()) { + + // Consume flags + Hit::Flags flags; + if (*offset + sizeof(Hit::Flags) > posting_list_used->size_in_bytes()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( - "offset %d must not point past the end of the posting list of size %d.", + "offset %d must not point past the end of the posting list of size " + "%d.", *offset, posting_list_used->size_in_bytes())); } - Hit::TermFrequency term_frequency; - memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset, - sizeof(Hit::TermFrequency)); - *hit = Hit(hit->value(), term_frequency); - *offset += sizeof(Hit::TermFrequency); + memcpy(&flags, posting_list_used->posting_list_buffer() + *offset, + sizeof(Hit::Flags)); + *hit = Hit(hit->value(), flags, Hit::kDefaultTermFrequency); + *offset += sizeof(Hit::Flags); + + if (hit->has_term_frequency()) { + // Consume term frequency + if (*offset + sizeof(Hit::TermFrequency) > + posting_list_used->size_in_bytes()) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "offset %d must not point past the end of the posting list of size " + "%d.", + *offset, posting_list_used->size_in_bytes())); + } + Hit::TermFrequency term_frequency; + memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset, + sizeof(Hit::TermFrequency)); + *hit = Hit(hit->value(), flags, term_frequency); + *offset += sizeof(Hit::TermFrequency); + } + return libtextclassifier3::Status::OK; } |