// Copyright (C) 2022 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "icing/index/main/posting-list-hit-serializer.h" #include #include #include #include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/file/posting_list/posting-list-used.h" #include "icing/index/hit/hit.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/legacy/index/icing-bit-util.h" #include "icing/util/logging.h" #include "icing/util/status-macros.h" namespace icing { namespace lib { namespace { uint32_t GetTermFrequencyByteSize(const Hit& hit) { return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0; } uint32_t GetFlagsByteSize(const Hit& hit) { return hit.has_flags() ? sizeof(Hit::Flags) : 0; } } // namespace uint32_t PostingListHitSerializer::GetBytesUsed( const PostingListUsed* posting_list_used) const { // The special hits will be included if they represent actual hits. If they // represent the hit offset or the invalid hit sentinel, they are not // included. return posting_list_used->size_in_bytes() - GetStartByteOffset(posting_list_used); } uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit( const PostingListUsed* posting_list_used) const { if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) { // If in either the FULL state or ALMOST_FULL state, this posting list *is* // the minimum size posting list that can fit these hits. So just return the // size of the posting list. return posting_list_used->size_in_bytes(); } // Edge case: if number of hits <= 2, and posting_list_used is NOT_FULL, we // should return kMinPostingListSize as we know that two hits is able to fit // in the 2 special hits position of a min-sized posting list. // - Checking this scenario requires deserializing posting_list_used, which is // an expensive operation. However this is only possible when // GetBytesUsed(posting_list_used) <= // sizeof(uncompressed hit) + max(delta encoded value size) + // (sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)), so we don't need to do // this when the size exceeds this number. if (GetBytesUsed(posting_list_used) <= 2 * sizeof(Hit) + 5 - sizeof(Hit::Value)) { // Check if we're able to get more than 2 hits from posting_list_used std::vector hits; libtextclassifier3::Status status = GetHitsInternal(posting_list_used, /*limit=*/3, /*pop=*/false, &hits); if (status.ok() && hits.size() <= 2) { return GetMinPostingListSize(); } } // - In NOT_FULL status, BytesUsed contains no special hits. For a posting // list in the NOT_FULL state with n hits, we would have n-1 compressed hits // and 1 uncompressed hit. // - The minimum sized posting list that would be guaranteed to fit these hits // would be FULL, but calculating the size required for the FULL posting // list would require deserializing the last two added hits, so instead we // will calculate the size of an ALMOST_FULL posting list to fit. // - An ALMOST_FULL posting list would have kInvalidHit in special_hit(0), the // full uncompressed Hit in special_hit(1), and the n-1 compressed hits in // the compressed region. // - Currently BytesUsed contains one uncompressed Hit and n-1 compressed // hits. However, it's possible that the uncompressed Hit is not a full hit, // but rather only the Hit::Value (this is the case if // !hit.has_term_frequency()). // - Therefore, fitting these hits into a posting list would require // BytesUsed + one extra full hit + byte difference between a full hit and // Hit::Value. i.e: // ByteUsed + sizeof(Hit) + (sizeof(Hit) - sizeof(Hit::Value)). return GetBytesUsed(posting_list_used) + 2 * sizeof(Hit) - sizeof(Hit::Value); } void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const { // Safe to ignore return value because posting_list_used->size_in_bytes() is // a valid argument. SetStartByteOffset(posting_list_used, /*offset=*/posting_list_used->size_in_bytes()); } libtextclassifier3::Status PostingListHitSerializer::MoveFrom( PostingListUsed* dst, PostingListUsed* src) const { ICING_RETURN_ERROR_IF_NULL(dst); ICING_RETURN_ERROR_IF_NULL(src); if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "src MinPostingListSizeToFit %d must be larger than size %d.", GetMinPostingListSizeToFit(src), dst->size_in_bytes())); } if (!IsPostingListValid(dst)) { return absl_ports::FailedPreconditionError( "Dst posting list is in an invalid state and can't be used!"); } if (!IsPostingListValid(src)) { return absl_ports::InvalidArgumentError( "Cannot MoveFrom an invalid src posting list!"); } // Pop just enough hits that all of src's compressed hits fit in // dst posting_list's compressed area. Then we can memcpy that area. std::vector hits; while (IsFull(src) || IsAlmostFull(src) || (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) { if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) { return absl_ports::AbortedError( "Unable to retrieve hits from src posting list."); } } // memcpy the area and set up start byte offset. Clear(dst); memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src), src->posting_list_buffer() + GetStartByteOffset(src), GetBytesUsed(src)); // Because we popped all hits from src outside of the compressed area and we // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() - // kSpecialHitSize. This is guaranteed to be a valid byte offset for the // NOT_FULL state, so ignoring the value is safe. SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src)); // Put back remaining hits. for (size_t i = 0; i < hits.size(); i++) { const Hit& hit = hits[hits.size() - i - 1]; // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED // should be impossible because we've already assured that there is enough // room above. ICING_RETURN_IF_ERROR(PrependHit(dst, hit)); } Clear(src); return libtextclassifier3::Status::OK; } uint32_t PostingListHitSerializer::GetPadEnd( const PostingListUsed* posting_list_used, uint32_t offset) const { Hit::Value pad; uint32_t pad_end = offset; while (pad_end < posting_list_used->size_in_bytes()) { size_t pad_len = VarInt::Decode( posting_list_used->posting_list_buffer() + pad_end, &pad); if (pad != 0) { // No longer a pad. break; } pad_end += pad_len; } return pad_end; } bool PostingListHitSerializer::PadToEnd(PostingListUsed* posting_list_used, uint32_t start, uint32_t end) const { if (end > posting_list_used->size_in_bytes()) { ICING_LOG(ERROR) << "Cannot pad a region that ends after size!"; return false; } // In VarInt a value of 0 encodes to 0. memset(posting_list_used->posting_list_buffer() + start, 0, end - start); return true; } libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull( PostingListUsed* posting_list_used, const Hit& hit) const { // Get delta between first hit and the new hit. Try to fit delta // in the padded area and put new hit at the special position 1. // Calling ValueOrDie is safe here because 1 < kNumSpecialData. Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie(); if (cur < hit || cur == hit) { return absl_ports::InvalidArgumentError( "Hit being prepended must be strictly less than the most recent Hit"); } uint8_t delta_buf[VarInt::kMaxEncodedLen64]; size_t delta_len = EncodeNextHitValue(/*next_hit_value=*/hit.value(), /*curr_hit_value=*/cur.value(), delta_buf); uint32_t cur_flags_bytes = GetFlagsByteSize(cur); uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur); uint32_t pad_end = GetPadEnd(posting_list_used, /*offset=*/kSpecialHitsSize); if (pad_end >= kSpecialHitsSize + delta_len + cur_flags_bytes + cur_term_frequency_bytes) { // Pad area has enough space for delta, flags and term_frequency of existing // hit (cur). Write delta at pad_end - delta_len - cur_flags_bytes - // cur_term_frequency_bytes. uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end - delta_len - cur_flags_bytes - cur_term_frequency_bytes; memcpy(delta_offset, delta_buf, delta_len); // Now copy flags. Hit::Flags flags = cur.flags(); uint8_t* flags_offset = delta_offset + delta_len; memcpy(flags_offset, &flags, cur_flags_bytes); // Copy term_frequency. Hit::TermFrequency term_frequency = cur.term_frequency(); uint8_t* term_frequency_offset = flags_offset + cur_flags_bytes; memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes); // Now first hit is the new hit, at special position 1. Safe to ignore the // return value because 1 < kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/1, hit); // Safe to ignore the return value because sizeof(Hit) is a valid argument. SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit)); } else { // No space for delta. We put the new hit at special position 0 // and go to the full state. Safe to ignore the return value because 1 < // kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/0, hit); } return libtextclassifier3::Status::OK; } void PostingListHitSerializer::PrependHitToEmpty( PostingListUsed* posting_list_used, const Hit& hit) const { // First hit to be added. Just add verbatim, no compression. if (posting_list_used->size_in_bytes() == kSpecialHitsSize) { // Safe to ignore the return value because 1 < kNumSpecialData SetSpecialHit(posting_list_used, /*index=*/1, hit); // Safe to ignore the return value because sizeof(Hit) is a valid argument. SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit)); } else { // Since this is the first hit, size != kSpecialHitsSize and // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into // the compressed region, so ValueOrDie is safe. uint32_t offset = PrependHitUncompressed(posting_list_used, hit, /*offset=*/posting_list_used->size_in_bytes()) .ValueOrDie(); // Safe to ignore the return value because PrependHitUncompressed is // guaranteed to return a valid offset. SetStartByteOffset(posting_list_used, offset); } } libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull( PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const { // First hit in compressed area. It is uncompressed. See if delta // between the first hit and new hit will still fit in the // compressed area. if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) { // The first hit in the compressed region *should* be uncompressed, but // somehow there isn't enough room between offset and the end of the // compressed area to fit an uncompressed hit. This should NEVER happen. return absl_ports::FailedPreconditionError( "Posting list is in an invalid state."); } // Retrieve the last added (cur) hit's value and flags and compare to the hit // we're adding. Hit::Value cur_value; uint8_t* cur_value_offset = posting_list_used->posting_list_buffer() + offset; memcpy(&cur_value, cur_value_offset, sizeof(Hit::Value)); Hit::Flags cur_flags = Hit::kNoEnabledFlags; if (GetFlagsByteSize(Hit(cur_value)) > 0) { uint8_t* cur_flags_offset = cur_value_offset + sizeof(Hit::Value); memcpy(&cur_flags, cur_flags_offset, sizeof(Hit::Flags)); } // Term-frequency is not used for hit comparison so it's ok to pass in the // default term-frequency here. Hit cur(cur_value, cur_flags, Hit::kDefaultTermFrequency); if (cur < hit || cur == hit) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Hit (value=%d, flags=%d) being prepended must be strictly less than " "the most recent Hit (value=%d, flags=%d)", hit.value(), hit.flags(), cur_value, cur_flags)); } uint8_t delta_buf[VarInt::kMaxEncodedLen64]; size_t delta_len = EncodeNextHitValue(/*next_hit_value=*/hit.value(), /*curr_hit_value=*/cur.value(), delta_buf); uint32_t hit_flags_bytes = GetFlagsByteSize(hit); uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit); // offset now points to one past the end of the first hit. offset += sizeof(Hit::Value); if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len + hit_flags_bytes + hit_term_frequency_bytes <= offset) { // Enough space for delta in compressed area. // Prepend delta. offset -= delta_len; memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf, delta_len); // Prepend new hit with (possibly) its flags and term_frequency. We know // that there is room for 'hit' because of the if statement above, so // calling ValueOrDie is safe. offset = PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie(); // offset is guaranteed to be valid here. So it's safe to ignore the return // value. The if above will guarantee that offset >= kSpecialHitSize and < // posting_list_used->size_in_bytes() because the if ensures that there is // enough room between offset and kSpecialHitSize to fit the delta of the // previous hit, any term_frequency and the uncompressed hit. SetStartByteOffset(posting_list_used, offset); } else if (kSpecialHitsSize + delta_len <= offset) { // Only have space for delta. The new hit must be put in special // position 1. // Prepend delta. offset -= delta_len; memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf, delta_len); // Prepend pad. Safe to ignore the return value of PadToEnd because offset // must be less than posting_list_used->size_in_bytes(). Otherwise, this // function already would have returned FAILED_PRECONDITION. PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize, /*end=*/offset); // Put new hit in special position 1. Safe to ignore return value because 1 // < kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/1, hit); // State almost_full. Safe to ignore the return value because sizeof(Hit) is // a valid argument. SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit)); } else { // Very rare case where delta is larger than sizeof(Hit::Value) // (i.e. varint delta encoding expanded required storage). We // move first hit to special position 1 and put new hit in // special position 0. // Offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes. // Therefore, offset must be less than kSpecialHitSize + 5. Since posting // list size must be divisible by sizeof(Hit) (6), it is guaranteed that // offset < size_in_bytes, so it is safe to ignore the return value here. ICING_RETURN_IF_ERROR(ConsumeFlagsAndTermFrequencyIfPresent( posting_list_used, &cur, &offset)); // Safe to ignore the return value of PadToEnd because offset must be less // than posting_list_used->size_in_bytes(). Otherwise, this function // already would have returned FAILED_PRECONDITION. PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize, /*end=*/offset); // Safe to ignore the return value here because 0 and 1 < kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/1, cur); SetSpecialHit(posting_list_used, /*index=*/0, hit); } return libtextclassifier3::Status::OK; } libtextclassifier3::Status PostingListHitSerializer::PrependHit( PostingListUsed* posting_list_used, const Hit& hit) const { static_assert(sizeof(Hit::Value) <= sizeof(uint64_t), "Hit::Value cannot be larger than 8 bytes because the delta " "must be able to fit in 8 bytes."); if (!hit.is_valid()) { return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!"); } if (!IsPostingListValid(posting_list_used)) { return absl_ports::FailedPreconditionError( "This PostingListUsed is in an invalid state and can't add any hits!"); } if (IsFull(posting_list_used)) { // State full: no space left. return absl_ports::ResourceExhaustedError("No more room for hits"); } else if (IsAlmostFull(posting_list_used)) { return PrependHitToAlmostFull(posting_list_used, hit); } else if (IsEmpty(posting_list_used)) { PrependHitToEmpty(posting_list_used, hit); return libtextclassifier3::Status::OK; } else { uint32_t offset = GetStartByteOffset(posting_list_used); return PrependHitToNotFull(posting_list_used, hit, offset); } } libtextclassifier3::StatusOr> PostingListHitSerializer::GetHits( const PostingListUsed* posting_list_used) const { std::vector hits_out; ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out)); return hits_out; } libtextclassifier3::Status PostingListHitSerializer::GetHits( const PostingListUsed* posting_list_used, std::vector* hits_out) const { return GetHitsInternal(posting_list_used, /*limit=*/std::numeric_limits::max(), /*pop=*/false, hits_out); } libtextclassifier3::Status PostingListHitSerializer::PopFrontHits( PostingListUsed* posting_list_used, uint32_t num_hits) const { if (num_hits == 1 && IsFull(posting_list_used)) { // The PL is in full status which means that we save 2 uncompressed hits in // the 2 special postions. But full status may be reached by 2 different // statuses. // (1) In "almost full" status // +-----------------+----------------+-------+-----------------+ // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits| // +-----------------+----------------+-------+-----------------+ // When we prepend another hit, we can only put it at the special // position 0. And we get a full PL // +-----------------+----------------+-------+-----------------+ // |new 1st hit |original 1st hit|(pad) |(compressed) hits| // +-----------------+----------------+-------+-----------------+ // (2) In "not full" status // +-----------------+----------------+------+-------+------------------+ // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits | // +-----------------+----------------+------+-------+------------------+ // When we prepend another hit, we can reach any of the 3 following // scenarios: // (2.1) not full // if the space of pad and original 1st hit can accommodate the new 1st hit // and the encoded delta value. // +-----------------+----------------+------+-----------+-----------------+ // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits| // +-----------------+----------------+------+-----------+-----------------+ // (2.2) almost full // If the space of pad and original 1st hit cannot accommodate the new 1st // hit and the encoded delta value but can accommodate the encoded delta // value only. We can put the new 1st hit at special position 1. // +-----------------+----------------+-------+-----------------+ // |Hit::kInvalidVal |new 1st hit |(pad) |(compressed) hits| // +-----------------+----------------+-------+-----------------+ // (2.3) full // In very rare case, it cannot even accommodate only the encoded delta // value. we can move the original 1st hit into special position 1 and the // new 1st hit into special position 0. This may happen because we use // VarInt encoding method which may make the encoded value longer (about // 4/3 times of original) // +-----------------+----------------+-------+-----------------+ // |new 1st hit |original 1st hit|(pad) |(compressed) hits| // +-----------------+----------------+-------+-----------------+ // Suppose now the PL is full. But we don't know whether it arrived to // this status from "not full" like (2.3) or from "almost full" like (1). // We'll return to "almost full" status like (1) if we simply pop the new // 1st hit but we want to make the prepending operation "reversible". So // there should be some way to return to "not full" if possible. A simple // way to do it is to pop 2 hits out of the PL to status "almost full" or // "not full". And add the original 1st hit back. We can return to the // correct original statuses of (2.1) or (1). This makes our prepending // operation reversible. std::vector out; // Popping 2 hits should never fail because we've just ensured that the // posting list is in the FULL state. ICING_RETURN_IF_ERROR( GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out)); // PrependHit should never fail because out[1] is a valid hit less than // previous hits in the posting list and because there's no way that the // posting list could run out of room because it previously stored this hit // AND another hit. ICING_RETURN_IF_ERROR(PrependHit(posting_list_used, out[1])); } else if (num_hits > 0) { return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true, nullptr); } return libtextclassifier3::Status::OK; } libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal( const PostingListUsed* posting_list_used, uint32_t limit, bool pop, std::vector* out) const { // Put current uncompressed val here. Hit::Value val = Hit::kInvalidValue; uint32_t offset = GetStartByteOffset(posting_list_used); uint32_t count = 0; // First traverse the first two special positions. while (count < limit && offset < kSpecialHitsSize) { // Calling ValueOrDie is safe here because offset / sizeof(Hit) < // kNumSpecialData because of the check above. Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit)) .ValueOrDie(); val = hit.value(); if (out != nullptr) { out->push_back(hit); } offset += sizeof(Hit); count++; } // If special position 1 was set then we need to skip padding. if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) { offset = GetPadEnd(posting_list_used, offset); } while (count < limit && offset < posting_list_used->size_in_bytes()) { if (val == Hit::kInvalidValue) { // First hit is in compressed area. Put that in val. memcpy(&val, posting_list_used->posting_list_buffer() + offset, sizeof(Hit::Value)); offset += sizeof(Hit::Value); } else { // Now we have delta encoded subsequent hits. Decode and push. DecodedHitInfo decoded_hit_info = DecodeNextHitValue( posting_list_used->posting_list_buffer() + offset, val); offset += decoded_hit_info.encoded_size; val = decoded_hit_info.hit_value; } Hit hit(val); libtextclassifier3::Status status = ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of the // posting list claims to have a term frequency or flag, but there's no // more room in the posting list for that term frequency or flag to exist. // Return an empty vector and zero to indicate no hits retrieved. if (out != nullptr) { out->clear(); } return absl_ports::InternalError("Posting list has been corrupted!"); } if (out != nullptr) { out->push_back(hit); } count++; } if (pop) { PostingListUsed* mutable_posting_list_used = const_cast(posting_list_used); // Modify the posting list so that we pop all hits actually // traversed. if (offset >= kSpecialHitsSize && offset < posting_list_used->size_in_bytes()) { // In the compressed area. Pop and reconstruct. offset/val is // the last traversed hit, which we must discard. So move one // more forward. DecodedHitInfo decoded_hit_info = DecodeNextHitValue( posting_list_used->posting_list_buffer() + offset, val); offset += decoded_hit_info.encoded_size; val = decoded_hit_info.hit_value; // Now val is the first hit of the new posting list. if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) { // val fits in compressed area. Simply copy. offset -= sizeof(Hit::Value); memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val, sizeof(Hit::Value)); } else { // val won't fit in compressed area. Also see if there is a flag or // term_frequency. Hit hit(val); libtextclassifier3::Status status = ConsumeFlagsAndTermFrequencyIfPresent(posting_list_used, &hit, &offset); if (!status.ok()) { // This posting list has been corrupted somehow. The first hit of // the posting list claims to have a term frequency or flag, but // there's no more room in the posting list for that term frequency or // flag to exist. Return an empty vector and zero to indicate no hits // retrieved. Do not pop anything. if (out != nullptr) { out->clear(); } return absl_ports::InternalError("Posting list has been corrupted!"); } // Okay to ignore the return value here because 1 < kNumSpecialData. SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit); // Prepend pad. Safe to ignore the return value of PadToEnd because // offset must be less than posting_list_used->size_in_bytes() thanks to // the if above. PadToEnd(mutable_posting_list_used, /*start=*/kSpecialHitsSize, /*end=*/offset); offset = sizeof(Hit); } } // offset is guaranteed to be valid so ignoring the return value of // set_start_byte_offset is safe. It falls into one of four scenarios: // Scenario 1: the above if was false because offset is not < // posting_list_used->size_in_bytes() // In this case, offset must be == posting_list_used->size_in_bytes() // because we reached offset by unwinding hits on the posting list. // Scenario 2: offset is < kSpecialHitSize // In this case, offset is guaranteed to be either 0 or sizeof(Hit) // because offset is incremented by sizeof(Hit) within the first while // loop. // Scenario 3: offset is within the compressed region and the new first hit // in the posting list (the value that 'val' holds) will fit as an // uncompressed hit in the compressed region. The resulting offset from // decompressing val must be >= kSpecialHitSize because otherwise we'd be // in Scenario 4 // Scenario 4: offset is within the compressed region, but the new first hit // in the posting list is too large to fit as an uncompressed hit in the // in the compressed region. Therefore, it must be stored in a special hit // and offset will be sizeof(Hit). SetStartByteOffset(mutable_posting_list_used, offset); } return libtextclassifier3::Status::OK; } libtextclassifier3::StatusOr PostingListHitSerializer::GetSpecialHit( const PostingListUsed* posting_list_used, uint32_t index) const { static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall"); if (index >= kNumSpecialData || index < 0) { return absl_ports::InvalidArgumentError( "Special hits only exist at indices 0 and 1"); } Hit val(Hit::kInvalidValue); memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val), sizeof(val)); return val; } bool PostingListHitSerializer::SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index, const Hit& val) const { if (index >= kNumSpecialData || index < 0) { ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1"; return false; } memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val, sizeof(val)); return true; } bool PostingListHitSerializer::IsPostingListValid( const PostingListUsed* posting_list_used) const { if (IsAlmostFull(posting_list_used)) { // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we // know that 1 < kNumSpecialData. if (!GetSpecialHit(posting_list_used, /*index=*/1) .ValueOrDie() .is_valid()) { ICING_LOG(ERROR) << "Both special hits cannot be invalid at the same time."; return false; } } else if (!IsFull(posting_list_used)) { // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is // safe because we know that 0 < kNumSpecialData. if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() > posting_list_used->size_in_bytes() || GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() < kSpecialHitsSize) { ICING_LOG(ERROR) << "Hit: " << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() << " size: " << posting_list_used->size_in_bytes() << " sp size: " << kSpecialHitsSize; return false; } } return true; } uint32_t PostingListHitSerializer::GetStartByteOffset( const PostingListUsed* posting_list_used) const { if (IsFull(posting_list_used)) { return 0; } else if (IsAlmostFull(posting_list_used)) { return sizeof(Hit); } else { // NOT_FULL, calling ValueOrDie is safe because we know that 0 < // kNumSpecialData. return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value(); } } bool PostingListHitSerializer::SetStartByteOffset( PostingListUsed* posting_list_used, uint32_t offset) const { if (offset > posting_list_used->size_in_bytes()) { ICING_LOG(ERROR) << "offset cannot be a value greater than size " << posting_list_used->size_in_bytes() << ". offset is " << offset << "."; return false; } if (offset < kSpecialHitsSize && offset > sizeof(Hit)) { ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit) << ", " << kSpecialHitsSize << "). offset is " << offset << "."; return false; } if (offset < sizeof(Hit) && offset != 0) { ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit) << "). offset is " << offset << "."; return false; } if (offset >= kSpecialHitsSize) { // not_full state. Safe to ignore the return value because 0 and 1 are both // < kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset)); SetSpecialHit(posting_list_used, /*index=*/1, Hit(Hit::kInvalidValue)); } else if (offset == sizeof(Hit)) { // almost_full state. Safe to ignore the return value because 1 is both < // kNumSpecialData. SetSpecialHit(posting_list_used, /*index=*/0, Hit(Hit::kInvalidValue)); } // Nothing to do for the FULL state - the offset isn't actually stored // anywhere and both special hits hold valid hits. return true; } libtextclassifier3::StatusOr PostingListHitSerializer::PrependHitUncompressed( PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const { uint32_t hit_bytes_to_prepend = sizeof(Hit::Value) + GetFlagsByteSize(hit) + GetTermFrequencyByteSize(hit); if (offset < kSpecialHitsSize + hit_bytes_to_prepend) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "Not enough room to prepend Hit at offset %d.", offset)); } if (hit.has_term_frequency()) { offset -= sizeof(Hit::TermFrequency); Hit::TermFrequency term_frequency = hit.term_frequency(); memcpy(posting_list_used->posting_list_buffer() + offset, &term_frequency, sizeof(Hit::TermFrequency)); } if (hit.has_flags()) { offset -= sizeof(Hit::Flags); Hit::Flags flags = hit.flags(); memcpy(posting_list_used->posting_list_buffer() + offset, &flags, sizeof(Hit::Flags)); } offset -= sizeof(Hit::Value); Hit::Value val = hit.value(); memcpy(posting_list_used->posting_list_buffer() + offset, &val, sizeof(Hit::Value)); return offset; } libtextclassifier3::Status PostingListHitSerializer::ConsumeFlagsAndTermFrequencyIfPresent( const PostingListUsed* posting_list_used, Hit* hit, uint32_t* offset) const { if (!hit->has_flags()) { // No flags (and by extension, no term-frequency) to consume. Everything is // fine. return libtextclassifier3::Status::OK; } // Consume flags Hit::Flags flags; if (*offset + sizeof(Hit::Flags) > posting_list_used->size_in_bytes()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "offset %d must not point past the end of the posting list of size " "%d.", *offset, posting_list_used->size_in_bytes())); } memcpy(&flags, posting_list_used->posting_list_buffer() + *offset, sizeof(Hit::Flags)); *hit = Hit(hit->value(), flags, Hit::kDefaultTermFrequency); *offset += sizeof(Hit::Flags); if (hit->has_term_frequency()) { // Consume term frequency if (*offset + sizeof(Hit::TermFrequency) > posting_list_used->size_in_bytes()) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "offset %d must not point past the end of the posting list of size " "%d.", *offset, posting_list_used->size_in_bytes())); } Hit::TermFrequency term_frequency; memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset, sizeof(Hit::TermFrequency)); *hit = Hit(hit->value(), flags, term_frequency); *offset += sizeof(Hit::TermFrequency); } return libtextclassifier3::Status::OK; } } // namespace lib } // namespace icing