diff options
author | Cassie Wang <cassiewang@google.com> | 2019-12-20 15:11:45 -0800 |
---|---|---|
committer | Cassie Wang <cassiewang@google.com> | 2019-12-20 16:18:05 -0800 |
commit | 128c9db88925c8425f2ad81e1d8985461d7ba21a (patch) | |
tree | f97ee47cc99d2c162eb30a5e051c606823dfd1ec /icing/file/file-backed-proto.h | |
parent | 1897505cb34f3d53e848da13fafe7691c17417ea (diff) | |
download | icing-128c9db88925c8425f2ad81e1d8985461d7ba21a.tar.gz |
Port over Icing c++ code from upstream
Change-Id: Ia3981fed7e0e70589efc027d4123f306cdfbe990
Diffstat (limited to 'icing/file/file-backed-proto.h')
-rw-r--r-- | icing/file/file-backed-proto.h | 251 |
1 files changed, 251 insertions, 0 deletions
diff --git a/icing/file/file-backed-proto.h b/icing/file/file-backed-proto.h new file mode 100644 index 0000000..1dc19ca --- /dev/null +++ b/icing/file/file-backed-proto.h @@ -0,0 +1,251 @@ +// Copyright (C) 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// A simple file-backed proto with an in-memory cache. +// WARNING: Only use this for small protos. Files storing larger protos can +// benefit from more sophisticated strategies like chunked reads/writes, +// using mmap and ideally, not even using protos. +// +// TODO(b/133793579) Consider exposing a checksum mismatch to callers. + +#ifndef ICING_FILE_FILE_BACKED_PROTO_H_ +#define ICING_FILE_FILE_BACKED_PROTO_H_ + +#include <cstdint> +#include <memory> +#include <string> +#include <string_view> + +#include "utils/base/status.h" +#include "utils/base/statusor.h" +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/mutex.h" +#include "icing/absl_ports/str_cat.h" +#include "icing/absl_ports/thread_annotations.h" +#include "icing/file/filesystem.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/util/crc32.h" +#include "icing/util/logging.h" + +namespace icing { +namespace lib { + +// This class is go/thread-compatible +template <typename ProtoT> +class FileBackedProto { + public: + // Header stored at the beginning of the file before the proto. + struct Header { + static constexpr int32_t kMagic = 0x726f746f; + + // Holds the magic as a quick sanity check against file corruption. + int32_t magic; + + // Checksum of the serialized proto, for a more thorough check against file + // corruption. + uint32_t proto_checksum; + }; + + // Used the specified file to read older version of the proto and store + // newer versions of the proto. + // + // file_path : Must be a path within in a directory that already exists. + FileBackedProto(const Filesystem& filesystem, std::string_view file_path); + + // Returns a reference to the proto read from the file. It + // internally caches the read proto so that future calls are fast. + // + // NOTE: The caller does NOT get ownership of the object returned and + // the returned object is only valid till a new version of the proto is + // written to the file. + // + // Returns NOT_FOUND if the file was empty or never written to. + // Returns INTERNAL_ERROR if an IO error or a corruption was encountered. + libtextclassifier3::StatusOr<const ProtoT*> Read() const + LOCKS_EXCLUDED(mutex_); + + // Writes the new version of the proto provided through to disk. + // Successful Write() invalidates any previously read version of the proto. + // + // Returns INTERNAL_ERROR if any IO error is encountered and will NOT + // invalidate any previously read versions of the proto. + // + // TODO(cassiewang) The implementation today loses old data if Write() fails. + // We should write to a tmp file first and rename the file to fix this. + // TODO(samzheng) Change to Write(ProtoT&& proto) + libtextclassifier3::Status Write(std::unique_ptr<ProtoT> proto) + LOCKS_EXCLUDED(mutex_); + + // Disallow copy and assign. + FileBackedProto(const FileBackedProto&) = delete; + FileBackedProto& operator=(const FileBackedProto&) = delete; + + private: + // Upper bound of file-size that is supported. + static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024; // 1 MiB. + + // Used to provide reader and writer locks + mutable absl_ports::shared_mutex mutex_; + + const Filesystem* const filesystem_; + const std::string file_path_; + + mutable std::unique_ptr<ProtoT> cached_proto_ GUARDED_BY(mutex_); +}; + +template <typename ProtoT> +constexpr int32_t FileBackedProto<ProtoT>::kMaxFileSize; + +template <typename ProtoT> +FileBackedProto<ProtoT>::FileBackedProto(const Filesystem& filesystem, + const std::string_view file_path) + : filesystem_(&filesystem), file_path_(file_path) {} + +template <typename ProtoT> +libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read() + const { + ICING_VLOG(1) << "Reading proto from file: " << file_path_; + + absl_ports::unique_lock l(&mutex_); + + // Return cached proto if we've already read from disk. + if (cached_proto_ != nullptr) { + ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_; + return cached_proto_.get(); + } + + int64_t file_size = filesystem_->GetFileSize(file_path_.c_str()); + if (file_size == Filesystem::kBadFileSize || file_size == 0) { + return absl_ports::NotFoundError( + absl_ports::StrCat("Missing file: ", file_path_)); + } + + if (file_size > kMaxFileSize) { + return absl_ports::InternalError(absl_ports::StrCat( + "File larger than expected, couldn't read: ", file_path_)); + } + + ScopedFd fd(filesystem_->OpenForRead(file_path_.c_str())); + if (!fd.is_valid()) { + return absl_ports::InternalError( + absl_ports::StrCat("Unable to open file for read: ", file_path_)); + } + + ICING_VLOG(1) << "Loading proto from file: " << file_path_ + << " of size: " << file_size; + + Header header; + if (!filesystem_->PRead(fd.get(), &header, sizeof(Header), + /*offset=*/0)) { + return absl_ports::InternalError( + absl_ports::StrCat("Unable to read header of: ", file_path_)); + } + + if (header.magic != Header::kMagic) { + return absl_ports::InternalError( + absl_ports::StrCat("Invalid header kMagic for: ", file_path_)); + } + + int proto_size = file_size - sizeof(Header); + auto buffer = std::make_unique<uint8_t[]>(proto_size); + if (!filesystem_->PRead(fd.get(), buffer.get(), proto_size, + /*offset=*/sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("File read failed: ", file_path_)); + } + + std::string_view buffer_str(reinterpret_cast<const char*>(buffer.get()), + proto_size); + Crc32 crc; + crc.Append(buffer_str); + if (header.proto_checksum != crc.Get()) { + return absl_ports::InternalError( + absl_ports::StrCat("Checksum of file does not match: ", file_path_)); + } + + auto proto = std::make_unique<ProtoT>(); + if (!proto->ParseFromArray(buffer.get(), proto_size)) { + return absl_ports::InternalError( + absl_ports::StrCat("Proto parse failed. File corrupted: ", file_path_)); + } + + ICING_VLOG(1) << "Successfully read proto from file: " << file_path_; + cached_proto_ = std::move(proto); + return cached_proto_.get(); +} + +template <typename ProtoT> +libtextclassifier3::Status FileBackedProto<ProtoT>::Write( + std::unique_ptr<ProtoT> new_proto) { + ICING_VLOG(1) << "Writing proto to file: " << file_path_; + + absl_ports::unique_lock l(&mutex_); + + const std::string new_proto_str = new_proto->SerializeAsString(); + if (new_proto_str.size() >= kMaxFileSize) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "New proto too large. size: %d; limit: %d.", + static_cast<int>(new_proto_str.size()), kMaxFileSize)); + } + + if (cached_proto_ != nullptr && + cached_proto_->SerializeAsString() == new_proto_str) { + ICING_VLOG(1) << "Skip writing proto to file as contents are identical: " + << file_path_; + return libtextclassifier3::Status::OK; + } + + ScopedFd fd(filesystem_->OpenForWrite(file_path_.c_str())); + if (!fd.is_valid()) { + return absl_ports::InternalError( + absl_ports::StrCat("Unable to open file for write: ", file_path_)); + } + + if (!filesystem_->Truncate(fd.get(), 0)) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to truncate file: ", file_path_)); + } + + Header header; + header.magic = Header::kMagic; + + Crc32 crc; + crc.Append(new_proto_str); + header.proto_checksum = crc.Get(); + if (!filesystem_->Write(fd.get(), &header, sizeof(Header))) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write header to file: ", file_path_)); + } + + if (!filesystem_->Write(fd.get(), new_proto_str.data(), + new_proto_str.size())) { + return absl_ports::InternalError( + absl_ports::StrCat("Failed to write proto to file: ", file_path_)); + } + + if (!filesystem_->DataSync(fd.get())) { + return absl_ports::InternalError(IcingStringUtil::StringPrintf( + "Failed to sync file; filename: %s; content_size: %d ", + file_path_.c_str(), static_cast<int>(new_proto_str.size()))); + } + + ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_; + cached_proto_ = std::move(new_proto); + return libtextclassifier3::Status::OK; +} + +} // namespace lib +} // namespace icing + +#endif // ICING_FILE_FILE_BACKED_PROTO_H_ |