diff options
author | openvcdiff <openvcdiff@132ac840-3546-0410-a738-d3f8764196be> | 2008-08-26 19:29:25 +0000 |
---|---|---|
committer | openvcdiff <openvcdiff@132ac840-3546-0410-a738-d3f8764196be> | 2008-08-26 19:29:25 +0000 |
commit | 311c71486f5f6074e5ba62a7f4c5397c8700b868 (patch) | |
tree | 3851b12e95a0f6a5a30deb52ae13ae7453f606bc /src/vcdiffengine.cc | |
parent | a2f33801808f7704582f62e098c0aff24a22def5 (diff) | |
download | open-vcdiff-311c71486f5f6074e5ba62a7f4c5397c8700b868.tar.gz |
Mon, 16 Jun 2008 15:15:51 -0700 Google Inc. <opensource@google.com>
* open-vcdiff: initial release:
The open-vcdiff package provides an encoder and decoder for the VCDIFF format
described in RFC 3284 (http://www.ietf.org/rfc/rfc3284.txt).
git-svn-id: http://open-vcdiff.googlecode.com/svn/trunk@7 132ac840-3546-0410-a738-d3f8764196be
Diffstat (limited to 'src/vcdiffengine.cc')
-rw-r--r-- | src/vcdiffengine.cc | 252 |
1 files changed, 252 insertions, 0 deletions
diff --git a/src/vcdiffengine.cc b/src/vcdiffengine.cc new file mode 100644 index 0000000..1254e46 --- /dev/null +++ b/src/vcdiffengine.cc @@ -0,0 +1,252 @@ +// Copyright 2006, 2008 Google Inc. +// Authors: Chandra Chereddi, Lincoln Smith +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <config.h> +#include "vcdiffengine.h" +#include <stdint.h> // uint32_t +#include "blockhash.h" +#include "encodetable.h" +#include "logging.h" +#include "rolling_hash.h" + +namespace open_vcdiff { + +VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size) + // If dictionary_size == 0, then dictionary could be NULL. Guard against + // using a NULL value. + : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""), + dictionary_size_(dictionary_size), + hashed_dictionary_(NULL) { + if (dictionary_size > 0) { + memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size); + } +} + +VCDiffEngine::~VCDiffEngine() { + delete hashed_dictionary_; + if (dictionary_size_ > 0) { + delete[] dictionary_; + } +} + +bool VCDiffEngine::Init() { + if (hashed_dictionary_) { + LOG(DFATAL) << "Init() called twice for same VCDiffEngine object" + << LOG_ENDL; + return false; + } + hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_, + dictionary_size()); + if (!hashed_dictionary_) { + LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL; + return false; + } + if (!RollingHash<BlockHash::kBlockSize>::Init()) { + LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL; + return false; + } + return true; +} + +// This helper function tries to find an appropriate match within +// hashed_dictionary_ for the block starting at the current target position. +// If target_hash is not NULL, this function will also look for a match +// within the previously encoded target data. +// +// If a match is found, this function will generate an ADD instruction +// for all unencoded data that precedes the match, +// and a COPY instruction for the match itself; then it returns +// the number of bytes processed by both instructions, +// which is guaranteed to be > 0. +// If no appropriate match is found, the function returns 0. +// +// The first four parameters are input parameters which are passed +// directly to BlockHash::FindBestMatch; please see that function +// for a description of their allowable values. +template<bool look_for_target_matches> +inline size_t VCDiffEngine::EncodeCopyForBestMatch( + uint32_t hash_value, + const char* target_candidate_start, + const char* unencoded_target_start, + size_t unencoded_target_size, + const BlockHash* target_hash, + VCDiffCodeTableWriter* coder) const { + // When FindBestMatch() comes up with a match for a candidate block, + // it will populate best_match with the size, source offset, + // and target offset of the match. + BlockHash::Match best_match; + + // First look for a match in the dictionary. + hashed_dictionary_->FindBestMatch(hash_value, + target_candidate_start, + unencoded_target_start, + unencoded_target_size, + &best_match); + // If target matching is enabled, then see if there is a better match + // within the target data that has been encoded so far. + if (look_for_target_matches) { + target_hash->FindBestMatch(hash_value, + target_candidate_start, + unencoded_target_start, + unencoded_target_size, + &best_match); + } + if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) { + return 0; + } + if (best_match.target_offset() > 0) { + // Create an ADD instruction to encode all target bytes + // from the end of the last COPY match, if any, up to + // the beginning of this COPY match. + coder->Add(unencoded_target_start, best_match.target_offset()); + } + coder->Copy(best_match.source_offset(), best_match.size()); + return best_match.target_offset() // ADD size + + best_match.size(); // + COPY size +} + +// Once the encoder loop has finished checking for matches in the target data, +// this function creates an ADD instruction to encode all target bytes +// from the end of the last COPY match, if any, through the end of +// the target data. In the worst case, if no matches were found at all, +// this function will create one big ADD instruction +// for the entire buffer of target data. +inline void VCDiffEngine::AddUnmatchedRemainder( + const char* unencoded_target_start, + size_t unencoded_target_size, + VCDiffCodeTableWriter* coder) const { + if (unencoded_target_size > 0) { + coder->Add(unencoded_target_start, unencoded_target_size); + } +} + +// This helper function tells the coder to finish the encoding and write +// the results into the output string "diff". +inline void VCDiffEngine::FinishEncoding(size_t target_size, + OutputStringInterface* diff, + VCDiffCodeTableWriter* coder) const { + if (target_size != static_cast<size_t>(coder->target_length())) { + LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: " + "original target size (" << target_size + << ") does not match number of bytes processed (" + << coder->target_length() << ")" << LOG_ENDL; + } + coder->Output(diff); +} + +template<bool look_for_target_matches> +void VCDiffEngine::EncodeInternal(const char* target_data, + size_t target_size, + OutputStringInterface* diff, + VCDiffCodeTableWriter* coder) const { + if (!hashed_dictionary_) { + LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() " + "called before VCDiffEngine::Init()" << LOG_ENDL; + return; + } + if (target_size == 0) { + return; // Do nothing for empty target + } + if (!coder->Init(dictionary_size())) { + LOG(DFATAL) << "Internal error: " + "Initialization of VCDiffCodeTableWriter failed" << LOG_ENDL; + return; + } + // Special case for really small input + if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) { + AddUnmatchedRemainder(target_data, target_size, coder); + FinishEncoding(target_size, diff, coder); + return; + } + RollingHash<BlockHash::kBlockSize> hasher; + BlockHash* target_hash = NULL; + if (look_for_target_matches) { + // Check matches against previously encoded target data + // in this same target window, as well as against the dictionary + target_hash = BlockHash::CreateTargetHash(target_data, + target_size, + dictionary_size()); + if (!target_hash) { + LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL; + return; + } + } + const char* const target_end = target_data + target_size; + const char* const start_of_last_block = target_end - BlockHash::kBlockSize; + // Offset of next bytes in string to ADD if NOT copied (i.e., not found in + // dictionary) + const char* next_encode = target_data; + // candidate_pos points to the start of the kBlockSize-byte block that may + // begin a match with the dictionary or previously encoded target data. + const char* candidate_pos = target_data; + uint32_t hash_value = hasher.Hash(candidate_pos); + while (1) { + const size_t bytes_encoded = + EncodeCopyForBestMatch<look_for_target_matches>( + hash_value, + candidate_pos, + next_encode, + (target_end - next_encode), + target_hash, + coder); + if (bytes_encoded > 0) { + next_encode += bytes_encoded; // Advance past COPYed data + candidate_pos = next_encode; + if (candidate_pos > start_of_last_block) { + break; // Reached end of target data + } + // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash + // can't be used to calculate the hash value at its new position. + hash_value = hasher.Hash(candidate_pos); + if (look_for_target_matches) { + // Update the target hash for the ADDed and COPYed data + target_hash->AddAllBlocksThroughIndex( + static_cast<int>(next_encode - target_data)); + } + } else { + // No match, or match is too small to be worth a COPY instruction. + // Move to the next position in the target data. + if ((candidate_pos + 1) > start_of_last_block) { + break; // Reached end of target data + } + if (look_for_target_matches) { + target_hash->AddOneIndexHash( + static_cast<int>(candidate_pos - target_data), + hash_value); + } + hash_value = hasher.UpdateHash(hash_value, + candidate_pos[0], + candidate_pos[BlockHash::kBlockSize]); + ++candidate_pos; + } + } + AddUnmatchedRemainder(next_encode, target_end - next_encode, coder); + FinishEncoding(target_size, diff, coder); + delete target_hash; +} + +void VCDiffEngine::Encode(const char* target_data, + size_t target_size, + bool look_for_target_matches, + OutputStringInterface* diff, + VCDiffCodeTableWriter* coder) const { + if (look_for_target_matches) { + EncodeInternal<true>(target_data, target_size, diff, coder); + } else { + EncodeInternal<false>(target_data, target_size, diff, coder); + } +} + +} // namespace open_vcdiff |