Mon, 16 Jun 2008 15:15:51 -0700 Google Inc. <opensource@google.com>

* open-vcdiff: initial release: The open-vcdiff package provides an encoder and decoder for the VCDIFF format described in RFC 3284 (http://www.ietf.org/rfc/rfc3284.txt). git-svn-id: http://open-vcdiff.googlecode.com/svn/trunk@7 132ac840-3546-0410-a738-d3f8764196be
author: openvcdiff <openvcdiff@132ac840-3546-0410-a738-d3f8764196be> 2008-08-26 19:29:25 +0000
committer: openvcdiff <openvcdiff@132ac840-3546-0410-a738-d3f8764196be> 2008-08-26 19:29:25 +0000
commit: 311c71486f5f6074e5ba62a7f4c5397c8700b868 (patch)
tree: 3851b12e95a0f6a5a30deb52ae13ae7453f606bc /src/vcdiffengine.cc
parent: a2f33801808f7704582f62e098c0aff24a22def5 (diff)
download: open-vcdiff-311c71486f5f6074e5ba62a7f4c5397c8700b868.tar.gz
1 files changed, 252 insertions, 0 deletions
diff --git a/src/vcdiffengine.cc b/src/vcdiffengine.cc
new file mode 100644
index 0000000..1254e46
--- /dev/null
+++ b/src/vcdiffengine.cc
@@ -0,0 +1,252 @@
+// Copyright 2006, 2008 Google Inc.
+// Authors: Chandra Chereddi, Lincoln Smith
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <config.h>
+#include "vcdiffengine.h"
+#include <stdint.h>  // uint32_t
+#include "blockhash.h"
+#include "encodetable.h"
+#include "logging.h"
+#include "rolling_hash.h"
+
+namespace open_vcdiff {
+
+VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
+    // If dictionary_size == 0, then dictionary could be NULL.  Guard against
+    // using a NULL value.
+    : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
+      dictionary_size_(dictionary_size),
+      hashed_dictionary_(NULL) {
+  if (dictionary_size > 0) {
+    memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
+  }
+}
+
+VCDiffEngine::~VCDiffEngine() {
+  delete hashed_dictionary_;
+  if (dictionary_size_ > 0) {
+    delete[] dictionary_;
+  }
+}
+
+bool VCDiffEngine::Init() {
+  if (hashed_dictionary_) {
+    LOG(DFATAL) << "Init() called twice for same VCDiffEngine object"
+                << LOG_ENDL;
+    return false;
+  }
+  hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
+                                                       dictionary_size());
+  if (!hashed_dictionary_) {
+    LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL;
+    return false;
+  }
+  if (!RollingHash<BlockHash::kBlockSize>::Init()) {
+    LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL;
+    return false;
+  }
+  return true;
+}
+
+// This helper function tries to find an appropriate match within
+// hashed_dictionary_ for the block starting at the current target position.
+// If target_hash is not NULL, this function will also look for a match
+// within the previously encoded target data.
+//
+// If a match is found, this function will generate an ADD instruction
+// for all unencoded data that precedes the match,
+// and a COPY instruction for the match itself; then it returns
+// the number of bytes processed by both instructions,
+// which is guaranteed to be > 0.
+// If no appropriate match is found, the function returns 0.
+//
+// The first four parameters are input parameters which are passed
+// directly to BlockHash::FindBestMatch; please see that function
+// for a description of their allowable values.
+template<bool look_for_target_matches>
+inline size_t VCDiffEngine::EncodeCopyForBestMatch(
+    uint32_t hash_value,
+    const char* target_candidate_start,
+    const char* unencoded_target_start,
+    size_t unencoded_target_size,
+    const BlockHash* target_hash,
+    VCDiffCodeTableWriter* coder) const {
+  // When FindBestMatch() comes up with a match for a candidate block,
+  // it will populate best_match with the size, source offset,
+  // and target offset of the match.
+  BlockHash::Match best_match;
+
+  // First look for a match in the dictionary.
+  hashed_dictionary_->FindBestMatch(hash_value,
+                                    target_candidate_start,
+                                    unencoded_target_start,
+                                    unencoded_target_size,
+                                    &best_match);
+  // If target matching is enabled, then see if there is a better match
+  // within the target data that has been encoded so far.
+  if (look_for_target_matches) {
+    target_hash->FindBestMatch(hash_value,
+                               target_candidate_start,
+                               unencoded_target_start,
+                               unencoded_target_size,
+                               &best_match);
+  }
+  if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
+    return 0;
+  }
+  if (best_match.target_offset() > 0) {
+    // Create an ADD instruction to encode all target bytes
+    // from the end of the last COPY match, if any, up to
+    // the beginning of this COPY match.
+    coder->Add(unencoded_target_start, best_match.target_offset());
+  }
+  coder->Copy(best_match.source_offset(), best_match.size());
+  return best_match.target_offset()  // ADD size
+       + best_match.size();          // + COPY size
+}
+
+// Once the encoder loop has finished checking for matches in the target data,
+// this function creates an ADD instruction to encode all target bytes
+// from the end of the last COPY match, if any, through the end of
+// the target data.  In the worst case, if no matches were found at all,
+// this function will create one big ADD instruction
+// for the entire buffer of target data.
+inline void VCDiffEngine::AddUnmatchedRemainder(
+    const char* unencoded_target_start,
+    size_t unencoded_target_size,
+    VCDiffCodeTableWriter* coder) const {
+  if (unencoded_target_size > 0) {
+    coder->Add(unencoded_target_start, unencoded_target_size);
+  }
+}
+
+// This helper function tells the coder to finish the encoding and write
+// the results into the output string "diff".
+inline void VCDiffEngine::FinishEncoding(size_t target_size,
+                                         OutputStringInterface* diff,
+                                         VCDiffCodeTableWriter* coder) const {
+  if (target_size != static_cast<size_t>(coder->target_length())) {
+    LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: "
+                   "original target size (" << target_size
+                << ") does not match number of bytes processed ("
+                << coder->target_length() << ")" << LOG_ENDL;
+  }
+  coder->Output(diff);
+}
+
+template<bool look_for_target_matches>
+void VCDiffEngine::EncodeInternal(const char* target_data,
+                                  size_t target_size,
+                                  OutputStringInterface* diff,
+                                  VCDiffCodeTableWriter* coder) const {
+  if (!hashed_dictionary_) {
+    LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() "
+                   "called before VCDiffEngine::Init()" << LOG_ENDL;
+    return;
+  }
+  if (target_size == 0) {
+    return;  // Do nothing for empty target
+  }
+  if (!coder->Init(dictionary_size())) {
+    LOG(DFATAL) << "Internal error: "
+                   "Initialization of VCDiffCodeTableWriter failed" << LOG_ENDL;
+    return;
+  }
+  // Special case for really small input
+  if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
+    AddUnmatchedRemainder(target_data, target_size, coder);
+    FinishEncoding(target_size, diff, coder);
+    return;
+  }
+  RollingHash<BlockHash::kBlockSize> hasher;
+  BlockHash* target_hash = NULL;
+  if (look_for_target_matches) {
+    // Check matches against previously encoded target data
+    // in this same target window, as well as against the dictionary
+    target_hash = BlockHash::CreateTargetHash(target_data,
+                                              target_size,
+                                              dictionary_size());
+    if (!target_hash) {
+      LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL;
+      return;
+    }
+  }
+  const char* const target_end = target_data + target_size;
+  const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
+  // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
+  // dictionary)
+  const char* next_encode = target_data;
+  // candidate_pos points to the start of the kBlockSize-byte block that may
+  // begin a match with the dictionary or previously encoded target data.
+  const char* candidate_pos = target_data;
+  uint32_t hash_value = hasher.Hash(candidate_pos);
+  while (1) {
+    const size_t bytes_encoded =
+        EncodeCopyForBestMatch<look_for_target_matches>(
+            hash_value,
+            candidate_pos,
+            next_encode,
+            (target_end - next_encode),
+            target_hash,
+            coder);
+    if (bytes_encoded > 0) {
+      next_encode += bytes_encoded;  // Advance past COPYed data
+      candidate_pos = next_encode;
+      if (candidate_pos > start_of_last_block) {
+        break;  // Reached end of target data
+      }
+      // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
+      // can't be used to calculate the hash value at its new position.
+      hash_value = hasher.Hash(candidate_pos);
+      if (look_for_target_matches) {
+        // Update the target hash for the ADDed and COPYed data
+        target_hash->AddAllBlocksThroughIndex(
+            static_cast<int>(next_encode - target_data));
+      }
+    } else {
+      // No match, or match is too small to be worth a COPY instruction.
+      // Move to the next position in the target data.
+      if ((candidate_pos + 1) > start_of_last_block) {
+        break;  // Reached end of target data
+      }
+      if (look_for_target_matches) {
+        target_hash->AddOneIndexHash(
+            static_cast<int>(candidate_pos - target_data),
+            hash_value);
+      }
+      hash_value = hasher.UpdateHash(hash_value,
+                                     candidate_pos[0],
+                                     candidate_pos[BlockHash::kBlockSize]);
+      ++candidate_pos;
+    }
+  }
+  AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
+  FinishEncoding(target_size, diff, coder);
+  delete target_hash;
+}
+
+void VCDiffEngine::Encode(const char* target_data,
+                          size_t target_size,
+                          bool look_for_target_matches,
+                          OutputStringInterface* diff,
+                          VCDiffCodeTableWriter* coder) const {
+  if (look_for_target_matches) {
+    EncodeInternal<true>(target_data, target_size, diff, coder);
+  } else {
+    EncodeInternal<false>(target_data, target_size, diff, coder);
+  }
+}
+
+}  // namespace open_vcdiff
author	openvcdiff <openvcdiff@132ac840-3546-0410-a738-d3f8764196be>	2008-08-26 19:29:25 +0000
committer	openvcdiff <openvcdiff@132ac840-3546-0410-a738-d3f8764196be>	2008-08-26 19:29:25 +0000
commit	311c71486f5f6074e5ba62a7f4c5397c8700b868 (patch)
tree	3851b12e95a0f6a5a30deb52ae13ae7453f606bc /src/vcdiffengine.cc
parent	a2f33801808f7704582f62e098c0aff24a22def5 (diff)
download	open-vcdiff-311c71486f5f6074e5ba62a7f4c5397c8700b868.tar.gz