diff options
-rw-r--r-- | icing/file/file-backed-proto-log.h | 2 | ||||
-rw-r--r-- | icing/file/file-backed-vector.h | 42 | ||||
-rw-r--r-- | icing/file/file-backed-vector_test.cc | 197 | ||||
-rw-r--r-- | icing/icing-search-engine_test.cc | 3 | ||||
-rw-r--r-- | icing/index/main/posting-list-free.h | 2 | ||||
-rw-r--r-- | icing/performance-configuration.cc | 5 | ||||
-rw-r--r-- | icing/store/document-store.cc | 14 | ||||
-rw-r--r-- | icing/store/document-store.h | 22 | ||||
-rw-r--r-- | synced_AOSP_CL_number.txt | 2 |
9 files changed, 236 insertions, 53 deletions
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h index 4302dce..b2b37e8 100644 --- a/icing/file/file-backed-proto-log.h +++ b/icing/file/file-backed-proto-log.h @@ -406,7 +406,6 @@ class FileBackedProtoLog { // INTERNAL_ERROR if the metadata is invalid or any IO errors happen static libtextclassifier3::StatusOr<int> ReadProtoMetadata( MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size); - std::unique_ptr<Header> header_; // Magic number added in front of every proto. Used when reading out protos // as a first check for corruption in each entry in the file. Even if there is @@ -435,6 +434,7 @@ class FileBackedProtoLog { ScopedFd fd_; const Filesystem* const filesystem_; const std::string file_path_; + std::unique_ptr<Header> header_; }; template <typename ProtoT> diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h index 2443cb2..61f6923 100644 --- a/icing/file/file-backed-vector.h +++ b/icing/file/file-backed-vector.h @@ -423,13 +423,6 @@ FileBackedVector<T>::InitializeExistingFile( absl_ports::StrCat("Invalid header kMagic for ", file_path)); } - // Mmap the content of the vector, excluding the header so its easier to - // access elements from the mmapped region - auto mmapped_file = - std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy); - ICING_RETURN_IF_ERROR( - mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header))); - // Check header if (header->header_checksum != header->CalculateHeaderChecksum()) { return absl_ports::FailedPreconditionError( @@ -442,6 +435,20 @@ FileBackedVector<T>::InitializeExistingFile( header->element_size)); } + int64_t min_file_size = header->num_elements * sizeof(T) + sizeof(Header); + if (min_file_size > file_size) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Inconsistent file size, expected %zd, actual %d", min_file_size, + file_size)); + } + + // Mmap the content of the vector, excluding the header so its easier to + // access elements from the mmapped region + auto mmapped_file = + std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy); + ICING_RETURN_IF_ERROR( + mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header))); + // Check vector contents Crc32 vector_checksum; std::string_view vector_contents( @@ -591,9 +598,24 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary( least_file_size_needed = math_util::RoundUpTo( least_file_size_needed, int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)}); - if (!filesystem_->Grow(file_path_.c_str(), least_file_size_needed)) { - return absl_ports::InternalError( - absl_ports::StrCat("Couldn't grow file ", file_path_)); + + // We use PWrite here rather than Grow because Grow doesn't actually allocate + // an underlying disk block. This can lead to problems with mmap because mmap + // has no effective way to signal that it was impossible to allocate the disk + // block and ends up crashing instead. PWrite will force the allocation of + // these blocks, which will ensure that any failure to grow will surface here. + int64_t page_size = getpagesize(); + auto buf = std::make_unique<uint8_t[]>(page_size); + int64_t size_to_write = page_size - (current_file_size % page_size); + ScopedFd sfd(filesystem_->OpenForWrite(file_path_.c_str())); + while (current_file_size < least_file_size_needed) { + if (!filesystem_->PWrite(sfd.get(), current_file_size, buf.get(), + size_to_write)) { + return absl_ports::InternalError( + absl_ports::StrCat("Couldn't grow file ", file_path_)); + } + current_file_size += size_to_write; + size_to_write = page_size - (current_file_size % page_size); } ICING_RETURN_IF_ERROR(mmapped_file_->Remap( diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc index bc2fef6..b05ce2d 100644 --- a/icing/file/file-backed-vector_test.cc +++ b/icing/file/file-backed-vector_test.cc @@ -32,6 +32,7 @@ #include "icing/util/logging.h" using ::testing::Eq; +using ::testing::IsTrue; using ::testing::Pointee; namespace icing { @@ -278,7 +279,6 @@ TEST_F(FileBackedVectorTest, Grow) { filesystem_, file_path_, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0))); - EXPECT_THAT(vector->Set(kMaxNumElts + 11, 'a'), StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE)); EXPECT_THAT(vector->Set(-1, 'a'), @@ -318,25 +318,32 @@ TEST_F(FileBackedVectorTest, GrowsInChunks) { filesystem_, file_path_, MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); - // Our initial file size should just be the size of the header - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(sizeof(FileBackedVector<char>::Header))); + // Our initial file size should just be the size of the header. Disk usage + // will indicate that one block has been allocated, which contains the header. + int header_size = sizeof(FileBackedVector<char>::Header); + int page_size = getpagesize(); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(header_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(page_size)); - // Once we add something though, we'll grow to kGrowElements big + // Once we add something though, we'll grow to be kGrowElements big. From this + // point on, file size and disk usage should be the same because Growing will + // explicitly allocate the number of blocks needed to accomodate the file. Insert(vector.get(), 0, "a"); - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(kGrowElements * sizeof(int))); + int file_size = kGrowElements * sizeof(int); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Should still be the same size, don't need to grow underlying file Insert(vector.get(), 1, "b"); - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(kGrowElements * sizeof(int))); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Now we grow by a kGrowElements chunk, so the underlying file is 2 // kGrowElements big + file_size *= 2; Insert(vector.get(), 2, std::string(kGrowElements, 'c')); - EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), - Eq(kGrowElements * 2 * sizeof(int))); + EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size)); + EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size)); // Destroy/persist the contents. vector.reset(); @@ -463,6 +470,174 @@ TEST_F(FileBackedVectorTest, TruncateAndReReadFile) { } } +TEST_F(FileBackedVectorTest, InitFileTooSmallForHeaderFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Shrink the file to be smaller than the header. + filesystem_.Truncate(fd_, sizeof(FileBackedVector<char>::Header) - 1); + + { + // 3. Attempt to create the file and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + } +} + +TEST_F(FileBackedVectorTest, InitWrongDataSizeFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + { + // 2. Attempt to create the file with a different element size and confirm + // that it fails. + EXPECT_THAT(FileBackedVector<int>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + } +} + +TEST_F(FileBackedVectorTest, InitCorruptHeaderFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Modify the header, but don't update the checksum. This would be similar + // to corruption of the header. + FileBackedVector<char>::Header header; + ASSERT_THAT(filesystem_.PRead(fd_, &header, sizeof(header), /*offset=*/0), + IsTrue()); + header.num_elements = 1; + ASSERT_THAT(filesystem_.PWrite(fd_, /*offset=*/0, &header, sizeof(header)), + IsTrue()); + + { + // 3. Attempt to create the file with a header that doesn't match its + // checksum and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + } +} + +TEST_F(FileBackedVectorTest, InitHeaderElementSizeTooBigFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Modify the header so that the number of elements exceeds the actual size + // of the underlying file. + FileBackedVector<char>::Header header; + ASSERT_THAT(filesystem_.PRead(fd_, &header, sizeof(header), /*offset=*/0), + IsTrue()); + int64_t file_size = filesystem_.GetFileSize(fd_); + int64_t allocated_elements_size = file_size - sizeof(header); + header.num_elements = (allocated_elements_size / sizeof(char)) + 1; + header.header_checksum = header.CalculateHeaderChecksum(); + ASSERT_THAT(filesystem_.PWrite(fd_, /*offset=*/0, &header, sizeof(header)), + IsTrue()); + + { + // 3. Attempt to create the file with num_elements that is larger than the + // underlying file and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::INTERNAL)); + } +} + +TEST_F(FileBackedVectorTest, InitCorruptElementsFails) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + // 2. Overwrite the values of the first two elements. + std::string corrupted_content = "BY"; + ASSERT_THAT( + filesystem_.PWrite(fd_, /*offset=*/sizeof(FileBackedVector<char>::Header), + corrupted_content.c_str(), corrupted_content.length()), + IsTrue()); + + { + // 3. Attempt to create the file with elements that don't match their + // checksum and confirm that it fails. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION)); + } +} + +TEST_F(FileBackedVectorTest, InitNormalSucceeds) { + { + // 1. Create a vector with a few elements. + ICING_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<FileBackedVector<char>> vector, + FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC)); + Insert(vector.get(), 0, "A"); + Insert(vector.get(), 1, "Z"); + ASSERT_THAT(vector->PersistToDisk(), IsOk()); + } + + { + // 2. Attempt to create the file with a completely valid header and elements + // region. This should succeed. + EXPECT_THAT(FileBackedVector<char>::Create( + filesystem_, file_path_, + MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC), + IsOk()); + } +} + } // namespace } // namespace lib diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc index 1ee2d63..ef4625a 100644 --- a/icing/icing-search-engine_test.cc +++ b/icing/icing-search-engine_test.cc @@ -101,7 +101,10 @@ constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_PLAIN = constexpr StringIndexingConfig_TokenizerType_Code TOKENIZER_NONE = StringIndexingConfig_TokenizerType_Code_NONE; +#ifndef ICING_JNI_TEST constexpr TermMatchType_Code MATCH_EXACT = TermMatchType_Code_EXACT_ONLY; +#endif // !ICING_JNI_TEST + constexpr TermMatchType_Code MATCH_PREFIX = TermMatchType_Code_PREFIX; constexpr TermMatchType_Code MATCH_NONE = TermMatchType_Code_UNKNOWN; diff --git a/icing/index/main/posting-list-free.h b/icing/index/main/posting-list-free.h index 4b27401..4f06057 100644 --- a/icing/index/main/posting-list-free.h +++ b/icing/index/main/posting-list-free.h @@ -115,7 +115,7 @@ class PostingListFree { // bytes which will store the next posting list index, the rest are unused and // can be anything. uint8_t *posting_list_buffer_; - uint32_t size_in_bytes_; + [[maybe_unused]] uint32_t size_in_bytes_; static_assert(sizeof(PostingListIndex) <= posting_list_utils::min_posting_list_size(), diff --git a/icing/performance-configuration.cc b/icing/performance-configuration.cc index 45b03d3..4020dd0 100644 --- a/icing/performance-configuration.cc +++ b/icing/performance-configuration.cc @@ -55,11 +55,12 @@ constexpr int kMaxQueryLength = 23000; constexpr int kDefaultNumToScore = 30000; // New Android devices nowadays all allow more than 16 MB memory per app. Using -// that as a guideline, we set 16 MB as the safe memory threshold. +// that as a guideline and being more conservative, we set 4 MB as the safe +// memory threshold. // TODO(b/150029642): Android apps / framework have better understanding of how // much memory is allowed, so it would be better to let clients pass in this // value. -constexpr int kSafeMemoryUsage = 16 * 1024 * 1024; // 16MB +constexpr int kSafeMemoryUsage = 4 * 1024 * 1024; // 4MB // The maximum number of hits that can fit below the kSafeMemoryUsage threshold. constexpr int kMaxNumTotalHits = kSafeMemoryUsage / sizeof(ScoredDocumentHit); diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc index 6479b97..226a96b 100644 --- a/icing/store/document-store.cc +++ b/icing/store/document-store.cc @@ -64,7 +64,6 @@ namespace { // Used in DocumentId mapper to mark a document as deleted constexpr int64_t kDocDeletedFlag = -1; -constexpr char kDocumentLogFilename[] = "document_log"; constexpr char kDocumentIdMapperFilename[] = "document_id_mapper"; constexpr char kDocumentStoreHeaderFilename[] = "document_store_header"; constexpr char kScoreCacheFilename[] = "score_cache"; @@ -1578,6 +1577,7 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( int size = document_id_mapper_->num_elements(); int num_deleted = 0; int num_expired = 0; + UsageStore::UsageScores default_usage; for (DocumentId document_id = 0; document_id < size; document_id++) { auto document_or = Get(document_id, /*clear_internal_fields=*/false); if (absl_ports::IsNotFound(document_or.status())) { @@ -1626,10 +1626,14 @@ libtextclassifier3::Status DocumentStore::OptimizeInto( // Copy over usage scores. ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores, usage_store_->GetUsageScores(document_id)); - - DocumentId new_document_id = new_document_id_or.ValueOrDie(); - ICING_RETURN_IF_ERROR( - new_doc_store->SetUsageScores(new_document_id, usage_scores)); + if (!(usage_scores == default_usage)) { + // If the usage scores for this document are the default (no usage), then + // don't bother setting it. No need to possibly allocate storage if + // there's nothing interesting to store. + DocumentId new_document_id = new_document_id_or.ValueOrDie(); + ICING_RETURN_IF_ERROR( + new_doc_store->SetUsageScores(new_document_id, usage_scores)); + } } if (stats != nullptr) { stats->set_num_original_documents(size); diff --git a/icing/store/document-store.h b/icing/store/document-store.h index a60aab1..c85c989 100644 --- a/icing/store/document-store.h +++ b/icing/store/document-store.h @@ -497,28 +497,6 @@ class DocumentStore { bool force_recovery_and_revalidate_documents, InitializeStatsProto* initialize_stats); - // Initializes a new DocumentStore and sets up any underlying files. - // - // Returns: - // Data loss status on success, effectively always DataLoss::NONE - // INTERNAL on I/O error - libtextclassifier3::StatusOr<DataLoss> InitializeNewStore( - InitializeStatsProto* initialize_stats); - - // Initializes a DocumentStore over an existing directory of files. - // - // stats will be set if non-null - // - // Returns: - // Data loss status on success - // INTERNAL on I/O error - libtextclassifier3::StatusOr<DataLoss> InitializeExistingStore( - bool force_recovery_and_revalidate_documents, - InitializeStatsProto* initialize_stats); - - libtextclassifier3::StatusOr<DataLoss> MigrateFromV0ToV1( - InitializeStatsProto* initialize_stats); - // Creates sub-components and verifies the integrity of each sub-component. // This assumes that the the underlying files already exist, and will return // an error if it doesn't find what it's expecting. diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 71ea9fe..b5282cf 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=382354974) +set(synced_AOSP_CL_number=384818601) |