// Copyright (C) 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // A simple file-backed proto with an in-memory cache. // WARNING: Only use this for small protos. Files storing larger protos can // benefit from more sophisticated strategies like chunked reads/writes, // using mmap and ideally, not even using protos. // // TODO(b/133793579) Consider exposing a checksum mismatch to callers. #ifndef ICING_FILE_FILE_BACKED_PROTO_H_ #define ICING_FILE_FILE_BACKED_PROTO_H_ #include #include #include #include #include "icing/text_classifier/lib3/utils/base/status.h" #include "icing/text_classifier/lib3/utils/base/statusor.h" #include "icing/absl_ports/canonical_errors.h" #include "icing/absl_ports/mutex.h" #include "icing/absl_ports/str_cat.h" #include "icing/absl_ports/thread_annotations.h" #include "icing/file/filesystem.h" #include "icing/legacy/core/icing-string-util.h" #include "icing/util/crc32.h" #include "icing/util/logging.h" namespace icing { namespace lib { // This class is go/thread-compatible template class FileBackedProto { public: // Header stored at the beginning of the file before the proto. struct Header { static constexpr int32_t kMagic = 0x726f746f; // Holds the magic as a quick sanity check against file corruption. int32_t magic; // Checksum of the serialized proto, for a more thorough check against file // corruption. uint32_t proto_checksum; }; // Used the specified file to read older version of the proto and store // newer versions of the proto. // // file_path : Must be a path within in a directory that already exists. FileBackedProto(const Filesystem& filesystem, std::string_view file_path); // Returns a reference to the proto read from the file. It // internally caches the read proto so that future calls are fast. // // NOTE: The caller does NOT get ownership of the object returned and // the returned object is only valid till a new version of the proto is // written to the file. // // Returns NOT_FOUND if the file was empty or never written to. // Returns INTERNAL_ERROR if an IO error or a corruption was encountered. libtextclassifier3::StatusOr Read() const ICING_LOCKS_EXCLUDED(mutex_); // Writes the new version of the proto provided through to disk. // Successful Write() invalidates any previously read version of the proto. // // Returns INTERNAL_ERROR if any IO error is encountered and will NOT // invalidate any previously read versions of the proto. // // TODO(cassiewang) The implementation today loses old data if Write() fails. // We should write to a tmp file first and rename the file to fix this. // TODO(samzheng) Change to Write(ProtoT&& proto) libtextclassifier3::Status Write(std::unique_ptr proto) ICING_LOCKS_EXCLUDED(mutex_); // Disallow copy and assign. FileBackedProto(const FileBackedProto&) = delete; FileBackedProto& operator=(const FileBackedProto&) = delete; private: // Upper bound of file-size that is supported. static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024; // 1 MiB. // Used to provide reader and writer locks mutable absl_ports::shared_mutex mutex_; const Filesystem* const filesystem_; const std::string file_path_; mutable std::unique_ptr cached_proto_ ICING_GUARDED_BY(mutex_); }; template constexpr int32_t FileBackedProto::kMaxFileSize; template FileBackedProto::FileBackedProto(const Filesystem& filesystem, const std::string_view file_path) : filesystem_(&filesystem), file_path_(file_path) {} template libtextclassifier3::StatusOr FileBackedProto::Read() const { ICING_VLOG(1) << "Reading proto from file: " << file_path_; absl_ports::unique_lock l(&mutex_); // Return cached proto if we've already read from disk. if (cached_proto_ != nullptr) { ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_; return cached_proto_.get(); } int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); if (file_size == Filesystem::kBadFileSize || file_size == 0) { return absl_ports::NotFoundError( absl_ports::StrCat("Missing file: ", file_path_)); } if (file_size > kMaxFileSize) { return absl_ports::InternalError(absl_ports::StrCat( "File larger than expected, couldn't read: ", file_path_)); } ScopedFd fd(filesystem_->OpenForRead(file_path_.c_str())); if (!fd.is_valid()) { return absl_ports::InternalError( absl_ports::StrCat("Unable to open file for read: ", file_path_)); } ICING_VLOG(1) << "Loading proto from file: " << file_path_ << " of size: " << file_size; Header header; if (!filesystem_->PRead(fd.get(), &header, sizeof(Header), /*offset=*/0)) { return absl_ports::InternalError( absl_ports::StrCat("Unable to read header of: ", file_path_)); } if (header.magic != Header::kMagic) { return absl_ports::InternalError( absl_ports::StrCat("Invalid header kMagic for: ", file_path_)); } int proto_size = file_size - sizeof(Header); auto buffer = std::make_unique(proto_size); if (!filesystem_->PRead(fd.get(), buffer.get(), proto_size, /*offset=*/sizeof(Header))) { return absl_ports::InternalError( absl_ports::StrCat("File read failed: ", file_path_)); } std::string_view buffer_str(reinterpret_cast(buffer.get()), proto_size); Crc32 crc; crc.Append(buffer_str); if (header.proto_checksum != crc.Get()) { return absl_ports::InternalError( absl_ports::StrCat("Checksum of file does not match: ", file_path_)); } auto proto = std::make_unique(); if (!proto->ParseFromArray(buffer.get(), proto_size)) { return absl_ports::InternalError( absl_ports::StrCat("Proto parse failed. File corrupted: ", file_path_)); } ICING_VLOG(1) << "Successfully read proto from file: " << file_path_; cached_proto_ = std::move(proto); return cached_proto_.get(); } template libtextclassifier3::Status FileBackedProto::Write( std::unique_ptr new_proto) { ICING_VLOG(1) << "Writing proto to file: " << file_path_; absl_ports::unique_lock l(&mutex_); const std::string new_proto_str = new_proto->SerializeAsString(); if (new_proto_str.size() >= kMaxFileSize) { return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( "New proto too large. size: %d; limit: %d.", static_cast(new_proto_str.size()), kMaxFileSize)); } if (cached_proto_ != nullptr && cached_proto_->SerializeAsString() == new_proto_str) { ICING_VLOG(1) << "Skip writing proto to file as contents are identical: " << file_path_; return libtextclassifier3::Status::OK; } ScopedFd fd(filesystem_->OpenForWrite(file_path_.c_str())); if (!fd.is_valid()) { return absl_ports::InternalError( absl_ports::StrCat("Unable to open file for write: ", file_path_)); } if (!filesystem_->Truncate(fd.get(), 0)) { return absl_ports::InternalError( absl_ports::StrCat("Failed to truncate file: ", file_path_)); } Header header; header.magic = Header::kMagic; Crc32 crc; crc.Append(new_proto_str); header.proto_checksum = crc.Get(); if (!filesystem_->Write(fd.get(), &header, sizeof(Header))) { return absl_ports::InternalError( absl_ports::StrCat("Failed to write header to file: ", file_path_)); } if (!filesystem_->Write(fd.get(), new_proto_str.data(), new_proto_str.size())) { return absl_ports::InternalError( absl_ports::StrCat("Failed to write proto to file: ", file_path_)); } if (!filesystem_->DataSync(fd.get())) { return absl_ports::InternalError(IcingStringUtil::StringPrintf( "Failed to sync file; filename: %s; content_size: %d ", file_path_.c_str(), static_cast(new_proto_str.size()))); } ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_; cached_proto_ = std::move(new_proto); return libtextclassifier3::Status::OK; } } // namespace lib } // namespace icing #endif // ICING_FILE_FILE_BACKED_PROTO_H_