summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Deymo <deymo@google.com>2017-10-02 21:01:15 +0200
committerAlex Deymo <deymo@google.com>2017-10-11 13:08:47 +0200
commite1140a25f4f26535d6c42d604e790ff4e432bb96 (patch)
tree50b377948ccf269606e6d6aacc72175e7603cced
parentf356f5013fdf4c46cb6eeef00883426fad6c3e12 (diff)
downloadbsdiff-e1140a25f4f26535d6c42d604e790ff4e432bb96.tar.gz
New SplitPatchWriter to split a bsdiff patch into multiple ones.
This new PatchWriterInterface derived class allows to split the output patch of a bsdiff run into multiple files of a fixed new file size. This allows to split long operations with minimal loss of patch size due to splitting the compression blocks. For large enough new file sections, this patch size reduction should be minimal. Bug: 34220646 Test: Added unittests. Change-Id: Ib90fddd5c55a38c87ce396f2792aa558ecd07b22
-rw-r--r--Android.bp2
-rw-r--r--Makefile4
-rw-r--r--bsdiff.gyp2
-rw-r--r--include/bsdiff/patch_writer_interface.h5
-rw-r--r--split_patch_writer.cc171
-rw-r--r--split_patch_writer.h82
-rw-r--r--split_patch_writer_unittest.cc156
7 files changed, 421 insertions, 1 deletions
diff --git a/Android.bp b/Android.bp
index 711efde..9907383 100644
--- a/Android.bp
+++ b/Android.bp
@@ -59,6 +59,7 @@ cc_library_static {
"bz2_compressor.cc",
"diff_encoder.cc",
"patch_writer.cc",
+ "split_patch_writer.cc",
],
static_libs: [
"libdivsufsort64",
@@ -104,6 +105,7 @@ cc_test {
"extents_file_unittest.cc",
"extents_unittest.cc",
"patch_writer_unittest.cc",
+ "split_patch_writer_unittest.cc",
"test_utils.cc",
"testrunner.cc",
],
diff --git a/Makefile b/Makefile
index b9c3b66..0a881a7 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,8 @@ bsdiff_src_files := \
bsdiff.cc \
bz2_compressor.cc \
diff_encoder.cc \
- patch_writer.cc
+ patch_writer.cc \
+ split_patch_writer.cc
# "bspatch" program.
bspatch_src_files := \
@@ -56,6 +57,7 @@ bsdiff_common_unittests := \
extents_file_unittest.cc \
extents_unittest.cc \
patch_writer_unittest.cc \
+ split_patch_writer_unittest.cc \
test_utils.cc \
testrunner.cc
diff --git a/bsdiff.gyp b/bsdiff.gyp
index dcb7bde..87f0c2b 100644
--- a/bsdiff.gyp
+++ b/bsdiff.gyp
@@ -56,6 +56,7 @@
'bz2_compressor.cc',
'diff_encoder.cc',
'patch_writer.cc',
+ 'split_patch_writer.cc',
],
},
# bsdiff executable
@@ -131,6 +132,7 @@
'extents_file_unittest.cc',
'extents_unittest.cc',
'patch_writer_unittest.cc',
+ 'split_patch_writer_unittest.cc',
'test_utils.cc',
],
},
diff --git a/include/bsdiff/patch_writer_interface.h b/include/bsdiff/patch_writer_interface.h
index 6ae4656..04f1675 100644
--- a/include/bsdiff/patch_writer_interface.h
+++ b/include/bsdiff/patch_writer_interface.h
@@ -27,6 +27,11 @@ struct ControlEntry {
// The value to add to the source pointer after patching from the diff stream.
int64_t offset_increment;
+
+ bool operator==(const ControlEntry& o) const {
+ return diff_size == o.diff_size && extra_size == o.extra_size &&
+ offset_increment == o.offset_increment;
+ }
};
class PatchWriterInterface {
diff --git a/split_patch_writer.cc b/split_patch_writer.cc
new file mode 100644
index 0000000..4123440
--- /dev/null
+++ b/split_patch_writer.cc
@@ -0,0 +1,171 @@
+// Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "bsdiff/split_patch_writer.h"
+
+#include <algorithm>
+
+#include "bsdiff/logging.h"
+
+using std::endl;
+
+namespace bsdiff {
+
+bool SplitPatchWriter::Init() {
+ // Fail gracefully if re-initialized.
+ if (current_patch_ || patches_.empty())
+ return false;
+
+ return patches_[0]->Init();
+}
+
+bool SplitPatchWriter::WriteDiffStream(const uint8_t* data, size_t size) {
+ return WriteToStream(&PatchWriterInterface::WriteDiffStream, &diff_sizes_,
+ data, size);
+}
+
+bool SplitPatchWriter::WriteExtraStream(const uint8_t* data, size_t size) {
+ return WriteToStream(&PatchWriterInterface::WriteExtraStream, &extra_sizes_,
+ data, size);
+}
+
+bool SplitPatchWriter::AddControlEntry(const ControlEntry& entry) {
+ ControlEntry remaining(entry);
+ while (written_output_ + remaining.diff_size + remaining.extra_size >=
+ (current_patch_ + 1) * new_chunk_size_) {
+ // We need to write some of the current ControlEntry to the current patch
+ // and move on to the next patch if there are more bytes to write.
+ uint64_t remaining_bytes =
+ (current_patch_ + 1) * new_chunk_size_ - written_output_;
+ // The offset_increment is always 0 in this case since we don't plan to read
+ // for the old file in the current_patch anymore.
+ ControlEntry current_patch_entry(0, 0, 0);
+
+ current_patch_entry.diff_size =
+ std::min(remaining.diff_size, remaining_bytes);
+ remaining_bytes -= current_patch_entry.diff_size;
+ remaining.diff_size -= current_patch_entry.diff_size;
+
+ // This will be positive only if we used all the diff_size bytes.
+ current_patch_entry.extra_size =
+ std::min(remaining.extra_size, remaining_bytes);
+ remaining_bytes -= current_patch_entry.extra_size;
+ remaining.extra_size -= current_patch_entry.extra_size;
+
+ AddControlEntryToCurrentPatch(current_patch_entry);
+
+ if (remaining.diff_size + remaining.extra_size > 0) {
+ current_patch_++;
+ if (current_patch_ >= patches_.size()) {
+ LOG(ERROR) << "Writing past the last patch" << endl;
+ return false;
+ }
+ if (!patches_[current_patch_]->Init()) {
+ LOG(ERROR) << "Failed to initialize patch " << current_patch_ << endl;
+ return false;
+ }
+ if (!remaining.diff_size) {
+ // When no diff need to be sent to the output, we can just push the
+ // existing old_pos_ as part of the current triplet, since the extra
+ // stream doesn't use the old_pos_;
+ remaining.offset_increment += old_pos_;
+ old_pos_ = 0;
+ }
+ // Need to add a dummy control entry at the beginning of the patch to
+ // offset the old_pos in the new patch, which would start at 0.
+ if (old_pos_ != 0) {
+ if (!patches_[current_patch_]->AddControlEntry(
+ ControlEntry(0, 0, old_pos_)))
+ return false;
+ }
+ } else {
+ // There was no need to write more bytes past the current patch, so just
+ // update the old_pos_ we are tracking for the next patch, if any.
+ old_pos_ += remaining.offset_increment;
+ return true;
+ }
+ }
+
+ // Trivial entries will be ignored.
+ return AddControlEntryToCurrentPatch(remaining);
+}
+
+bool SplitPatchWriter::Close() {
+ uint64_t missing_bytes = 0;
+ for (auto size : diff_sizes_)
+ missing_bytes += size;
+ for (auto size : extra_sizes_)
+ missing_bytes += size;
+ if (missing_bytes > 0) {
+ LOG(ERROR) << "Close() called but there are " << missing_bytes
+ << " bytes missing from Write*Stream() calls" << endl;
+ return false;
+ }
+
+ // |current_patch_| holds the last patch that was Init()'ed. If there are more
+ // patches in the list those have not been initialized/closed, which is a
+ // programming error.
+ if (current_patch_ + 1 != patches_.size()) {
+ LOG(ERROR) << "Close() called but no bytes habe been written to the last "
+ "patch"
+ << endl;
+ return false;
+ }
+
+ // Close all the remaining streams.
+ for (; closed_patches_ < patches_.size(); closed_patches_++) {
+ if (!patches_[closed_patches_]->Close())
+ return false;
+ }
+ return true;
+}
+
+bool SplitPatchWriter::AddControlEntryToCurrentPatch(
+ const ControlEntry& entry) {
+ // Ignore trivial control entries that don't modify the state.
+ if (!entry.diff_size && !entry.extra_size && !entry.offset_increment)
+ return true;
+
+ if (current_patch_ >= patches_.size()) {
+ LOG(ERROR) << "Writing past the last patch" << endl;
+ return false;
+ }
+ old_pos_ += entry.diff_size + entry.offset_increment;
+ written_output_ += entry.diff_size + entry.extra_size;
+ // Register the diff/extra sizes as required bytes for the current patch.
+ diff_sizes_[current_patch_] += entry.diff_size;
+ extra_sizes_[current_patch_] += entry.extra_size;
+ return patches_[current_patch_]->AddControlEntry(entry);
+}
+
+bool SplitPatchWriter::WriteToStream(WriteStreamMethod method,
+ std::vector<size_t>* sizes_vector,
+ const uint8_t* data,
+ size_t size) {
+ size_t written = 0;
+ for (size_t i = closed_patches_; i <= current_patch_ && written < size; i++) {
+ if ((*sizes_vector)[i]) {
+ size_t flush_size = std::min(size - written, (*sizes_vector)[i]);
+ if (!(patches_[i]->*method)(data + written, flush_size))
+ return false;
+ written += flush_size;
+ (*sizes_vector)[i] -= flush_size;
+ }
+
+ if (i < current_patch_ && !diff_sizes_[i] && !extra_sizes_[i]) {
+ // All bytes expected for the patch i are already sent.
+ if (!patches_[i]->Close())
+ return false;
+ closed_patches_++;
+ }
+ }
+ if (written < size) {
+ LOG(ERROR) << "Calling Write*Stream() before the corresponding "
+ "AddControlEntry() is not supported.";
+ return false;
+ }
+ return true;
+}
+
+} // namespace bsdiff
diff --git a/split_patch_writer.h b/split_patch_writer.h
new file mode 100644
index 0000000..81c913e
--- /dev/null
+++ b/split_patch_writer.h
@@ -0,0 +1,82 @@
+// Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef _BSDIFF_SPLIT_PATCH_WRITER_H_
+#define _BSDIFF_SPLIT_PATCH_WRITER_H_
+
+#include <stdint.h>
+
+#include <vector>
+
+#include "bsdiff/patch_writer_interface.h"
+
+namespace bsdiff {
+
+// A PatchWriterInterface class that splits the output patch into multiple
+// patches of a fixed new-file size. The size of each patch data will depend on
+// the contents of the new file data, and won't necessarily be uniform.
+class SplitPatchWriter : public PatchWriterInterface {
+ public:
+ // Create a PatchWriter using that will split in several patches where each
+ // one will write |new_chunk_size| bytes of new file data. Each patch will
+ // use the old file as a whole input file.
+ SplitPatchWriter(uint64_t new_chunk_size,
+ const std::vector<PatchWriterInterface*>& patches)
+ : new_chunk_size_(new_chunk_size), patches_(patches) {
+ diff_sizes_.resize(patches.size());
+ extra_sizes_.resize(patches.size());
+ }
+
+ // PatchWriterInterface overrides.
+ // Note: Calling WriteDiffStream/WriteExtraStream before calling the
+ // corresponding AddControlEntry() is not supported and will fail. The reason
+ // for this is because which underlying patch takes the bytes depends on the
+ // control entries.
+ bool Init() override;
+ bool WriteDiffStream(const uint8_t* data, size_t size) override;
+ bool WriteExtraStream(const uint8_t* data, size_t size) override;
+ bool AddControlEntry(const ControlEntry& entry) override;
+ bool Close() override;
+
+ private:
+ // Add a ControlEntry to the |current_patch_| without splitting it. Updates
+ // the internal structures of used data.
+ bool AddControlEntryToCurrentPatch(const ControlEntry& entry);
+
+ using WriteStreamMethod = bool (PatchWriterInterface::*)(const uint8_t* data,
+ size_t size);
+
+ // Write to the diff or extra stream as determined by |method|.
+ bool WriteToStream(WriteStreamMethod method,
+ std::vector<size_t>* sizes_vector,
+ const uint8_t* data,
+ size_t size);
+
+ // The size of each chunk of the new file written to.
+ uint64_t new_chunk_size_;
+ std::vector<PatchWriterInterface*> patches_;
+
+ // The size of the diff and extra streams that should go in each patch and has
+ // been written so far.
+ std::vector<size_t> diff_sizes_;
+ std::vector<size_t> extra_sizes_;
+
+ // The current patch number in the |patches_| array we are writing to.
+ size_t current_patch_{0};
+
+ // The number of patches we already called Close() on. The patches are always
+ // closed in order.
+ size_t closed_patches_{0};
+
+ // Bytes of the new files already written. Needed to store the new length in
+ // the header of the file.
+ uint64_t written_output_{0};
+
+ // The current pointer into the old stream.
+ uint64_t old_pos_{0};
+};
+
+} // namespace bsdiff
+
+#endif // _BSDIFF_SPLIT_PATCH_WRITER_H_
diff --git a/split_patch_writer_unittest.cc b/split_patch_writer_unittest.cc
new file mode 100644
index 0000000..1170db9
--- /dev/null
+++ b/split_patch_writer_unittest.cc
@@ -0,0 +1,156 @@
+// Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "bsdiff/split_patch_writer.h"
+
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "bsdiff/fake_patch_writer.h"
+#include "bsdiff/test_utils.h"
+
+namespace bsdiff {
+
+class SplitPatchWriterTest : public testing::Test {
+ protected:
+ void SetUpForSize(size_t num_chunks, uint64_t new_chunk_size) {
+ fake_patches_.resize(num_chunks);
+ std::vector<PatchWriterInterface*> patches;
+ for (auto& fake_patch : fake_patches_)
+ patches.push_back(&fake_patch);
+
+ patch_writer_.reset(new SplitPatchWriter(new_chunk_size, patches));
+ EXPECT_TRUE(patch_writer_->Init());
+ }
+
+ std::vector<FakePatchWriter> fake_patches_;
+ std::unique_ptr<SplitPatchWriter> patch_writer_;
+};
+
+// A single empty patch is allowed.
+TEST_F(SplitPatchWriterTest, NonSplitEmptyPatchTest) {
+ SetUpForSize(1, 100);
+ EXPECT_TRUE(patch_writer_->Close());
+
+ EXPECT_TRUE(fake_patches_[0].entries().empty());
+}
+
+// Leaving patches at the end that are empty is considered an error.
+TEST_F(SplitPatchWriterTest, NotAllPatchesWrittenErrorTest) {
+ SetUpForSize(2, 100);
+ // We write less than the amount needed for two patches, which should fail.
+ EXPECT_FALSE(patch_writer_->Close());
+}
+
+TEST_F(SplitPatchWriterTest, MissingDiffBytesErrorTest) {
+ SetUpForSize(2, 10);
+
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(15, 5, 0)));
+ std::vector<uint8_t> zeros(20, 0);
+ // We write 12 diff bytes instead of the expected 15. This should fail on
+ // Close().
+ EXPECT_TRUE(patch_writer_->WriteDiffStream(zeros.data(), 12));
+ EXPECT_TRUE(patch_writer_->WriteExtraStream(zeros.data(), 5));
+ EXPECT_FALSE(patch_writer_->Close());
+}
+
+TEST_F(SplitPatchWriterTest, MissingExtraBytesErrorTest) {
+ SetUpForSize(2, 10);
+
+ std::vector<uint8_t> zeros(20, 0);
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(5, 15, -5)));
+ EXPECT_TRUE(patch_writer_->WriteDiffStream(zeros.data(), 5));
+ // We write a little less than the expected 15. This operation should succeed,
+ // since we could write the rest later.
+ EXPECT_TRUE(patch_writer_->WriteExtraStream(zeros.data(), 10));
+ EXPECT_FALSE(patch_writer_->Close());
+}
+
+// Test all sort of corner cases when splitting the ControlEntry across multiple
+// patches
+TEST_F(SplitPatchWriterTest, SplitControlAcrossSeveralPatchesTest) {
+ SetUpForSize(4, 10);
+ // The middle control entry would be split in tree different patches.
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(5, 1, -5)));
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(4, 0, -4)));
+ // old_pos at this point is 0. This is the end of the first patch.
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(6, 0, -1)));
+ // old_pos at this point is 5.
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(1, 18, 2)));
+ // old_pos at this point is 8.
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(1, 0, 1)));
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(4, 0, -5)));
+
+ std::vector<uint8_t> zeros(40, 0);
+ EXPECT_TRUE(
+ patch_writer_->WriteDiffStream(zeros.data(), 5 + 4 + 6 + 1 + 1 + 4));
+ EXPECT_TRUE(patch_writer_->WriteExtraStream(zeros.data(), 1 + 18));
+ EXPECT_TRUE(patch_writer_->Close());
+
+ EXPECT_EQ((std::vector<ControlEntry>{
+ ControlEntry(5, 1, -5),
+ ControlEntry(4, 0, 0), // (4, 0, -4) but the -4 is not needed.
+ }),
+ fake_patches_[0].entries());
+ EXPECT_EQ((std::vector<ControlEntry>{
+ // No need for dummy entry because the old_pos is already at 0.
+ ControlEntry(6, 0, -1),
+ ControlEntry(1, 3, 0), // the first part of (1, 18, 2)
+ }),
+ fake_patches_[1].entries());
+ EXPECT_EQ((std::vector<ControlEntry>{
+ // No need for dummy entry because the first entry is all in
+ // the extra stream and this is the last entry.
+ ControlEntry(0, 10, 0), // the middle part of (1, 18, 2)
+ }),
+ fake_patches_[2].entries());
+ EXPECT_EQ((std::vector<ControlEntry>{
+ // No need for dummy entry because the first entry is all in
+ // the extra stream, so use that.
+ ControlEntry(0, 5, 8), // the last part of (1, 18, 2), plus the
+ // old_pos 5. 8 = 1 + 2 + 5.
+ ControlEntry(1, 0, 1), // (1, 0, 1) copied
+ ControlEntry(4, 0, 0), // (4, 0, -5) ignoring the offset.
+ }),
+ fake_patches_[3].entries());
+}
+
+TEST_F(SplitPatchWriterTest, WriteStreamsAfterControlAcrossPatchesTest) {
+ std::vector<uint8_t> numbers(40);
+ for (size_t i = 0; i < numbers.size(); ++i)
+ numbers[i] = 'A' + i;
+
+ SetUpForSize(4, 10);
+ // The sequence is 15 diff, 10 extra, 15 diff.
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(15, 10, 0)));
+ EXPECT_TRUE(patch_writer_->AddControlEntry(ControlEntry(15, 0, 0)));
+ // Numbers [0, 30) for the diff stream, and [30, 40) for the extra stream.
+ EXPECT_TRUE(patch_writer_->WriteDiffStream(numbers.data(), 30));
+ EXPECT_TRUE(patch_writer_->WriteExtraStream(numbers.data() + 30, 10));
+ EXPECT_TRUE(patch_writer_->Close());
+
+ EXPECT_EQ(std::vector<uint8_t>(numbers.begin(), numbers.begin() + 10),
+ fake_patches_[0].diff_stream());
+ EXPECT_TRUE(fake_patches_[0].extra_stream().empty());
+
+ // 5 diff, then 5 extra.
+ EXPECT_EQ(std::vector<uint8_t>(numbers.begin() + 10, numbers.begin() + 15),
+ fake_patches_[1].diff_stream());
+ EXPECT_EQ(std::vector<uint8_t>(numbers.begin() + 30, numbers.begin() + 35),
+ fake_patches_[1].extra_stream());
+
+ // 5 extra, then 5 diff.
+ EXPECT_EQ(std::vector<uint8_t>(numbers.begin() + 15, numbers.begin() + 20),
+ fake_patches_[2].diff_stream());
+ EXPECT_EQ(std::vector<uint8_t>(numbers.begin() + 35, numbers.begin() + 40),
+ fake_patches_[2].extra_stream());
+
+ EXPECT_EQ(std::vector<uint8_t>(numbers.begin() + 20, numbers.begin() + 30),
+ fake_patches_[3].diff_stream());
+ EXPECT_TRUE(fake_patches_[3].extra_stream().empty());
+}
+
+} // namespace bsdiff