path: root/src/blockhash.cc
// Copyright 2006, 2008 Google Inc.
// Authors: Chandra Chereddi, Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <config.h>
#include "blockhash.h"
#include "compile_assert.h"
#include <stdint.h>  // uint32_t
#include <string.h>  // memcpy, memcmp
#include "logging.h"
#include "rolling_hash.h"

namespace open_vcdiff {

typedef unsigned long uword_t;  // a machine word                         NOLINT

BlockHash::BlockHash(const char* source_data,
                     size_t source_size,
                     int starting_offset)
    : source_data_(source_data),
      source_size_(source_size),
      hash_table_mask_(0),
      starting_offset_(starting_offset),
      last_block_added_(-1) {
}

BlockHash::~BlockHash() { }

// kBlockSize must be at least 2 to be meaningful.  Since it's a compile-time
// constant, check its value at compile time rather than wasting CPU cycles
// on runtime checks.
COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);

// kBlockSize is required to be a power of 2 because multiplication
// (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
// are commonly-used operations.  If kBlockSize is a compile-time
// constant and a power of 2, the compiler can convert these three operations
// into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
// more efficient than executing full integer multiply, divide, or remainder
// instructions.
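// For example, if kBlockSize were 16, the compiler could emit (n << 4) for
// (n * kBlockSize), (n >> 4) for (n / kBlockSize), and (n & 0xF) for
// (n % kBlockSize).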
COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
               kBlockSize_must_be_a_power_of_2);

bool BlockHash::Init(bool populate_hash_table) {
  if (!hash_table_.empty() ||
      !next_block_table_.empty() ||
      !last_block_table_.empty()) {
    LOG(DFATAL) << "Init() called twice for the same BlockHash object"
                << LOG_ENDL;
    return false;
  }
  const size_t table_size = CalcTableSize(source_size_);
  if (table_size == 0) {
    LOG(DFATAL) << "Error finding table size for source size " << source_size_
                << LOG_ENDL;
    return false;
  }
  // Since table_size is a power of 2, (table_size - 1) is a bit mask
  // containing all the bits below table_size.
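  // For example, if table_size == 0x1000 then hash_table_mask_ == 0xFFF,
  // and (hash_value & hash_table_mask_) always lies in [0, table_size).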
  hash_table_mask_ = static_cast<uint32_t>(table_size - 1);
  hash_table_.resize(table_size, -1);
  next_block_table_.resize(GetNumberOfBlocks(), -1);
  last_block_table_.resize(GetNumberOfBlocks(), -1);
  if (populate_hash_table) {
    AddAllBlocks();
  }
  return true;
}

const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
                                                 size_t dictionary_size) {
  BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
                                                 dictionary_size,
                                                 0);
  if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
    delete new_dictionary_hash;
    return NULL;
  } else {
    return new_dictionary_hash;
  }
}

BlockHash* BlockHash::CreateTargetHash(const char* target_data,
                                       size_t target_size,
                                       size_t dictionary_size) {
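  // The starting offset is the dictionary size because, in the VCDIFF address
  // space (RFC 3284), previously decoded target data is addressed immediately
  // after the end of the source segment (the dictionary), so offsets of
  // matches found in the target must be shifted by dictionary_size.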
  BlockHash* new_target_hash = new BlockHash(target_data,
                                             target_size,
                                             static_cast<int>(dictionary_size));
  if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
    delete new_target_hash;
    return NULL;
  } else {
    return new_target_hash;
  }
}

// Returns zero if an error occurs.
size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
  // Overallocate the hash table by making it the same size (in bytes)
  // as the source data.  This is a trade-off between space and time:
  // the empty entries in the hash table reduce the probability of a
  // hash collision to roughly (sizeof(int) / kBlockSize), which saves
  // the time that would otherwise be spent comparing false matches.
  const size_t min_size = (dictionary_size / sizeof(int)) + 1;  // NOLINT
  size_t table_size = 1;
  // Find the smallest power of 2 that is >= min_size, and assign
  // that value to table_size.
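  // For example, assuming sizeof(int) == 4, a dictionary_size of 1000 gives
  // min_size == 251, and the loop below stops at table_size == 256.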
  while (table_size < min_size) {
    table_size <<= 1;
    // Guard against an infinite loop: table_size is unsigned, so if the
    // shift overflows, it wraps around to zero.
    if (table_size == 0) {
      LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
                  << dictionary_size
                  << "): resulting table_size " << table_size
                  << " wrapped around to zero" << LOG_ENDL;
      return 0;
    }
  }
  // Check size sanity
  if ((table_size & (table_size - 1)) != 0) {
    LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
                << dictionary_size
                << "): resulting table_size " << table_size
                << " is not a power of 2" << LOG_ENDL;
    return 0;
  }
  // The loop above tries to find the smallest power of 2 that is >= min_size.
  // That value must lie somewhere between min_size and (min_size * 2),
  // except for the case (dictionary_size == 0, table_size == 1).
  if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
    LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
                << dictionary_size
                << "): resulting table_size " << table_size
                << " is too large" << LOG_ENDL;
    return 0;
  }
  return table_size;
}

// If the hash value is already available from the rolling hash,
// call this function to save time.
void BlockHash::AddBlock(uint32_t hash_value) {
  if (hash_table_.empty()) {
    LOG(DFATAL) << "BlockHash::AddBlock() called before BlockHash::Init()"
                << LOG_ENDL;
    return;
  }
  // The initial value of last_block_added_ is -1.
  int block_number = last_block_added_ + 1;
  const int total_blocks =
      static_cast<int>(source_size_ / kBlockSize);  // round down
  if (block_number >= total_blocks) {
    LOG(DFATAL) << "BlockHash::AddBlock() called"
                   " with block number " << block_number
                << " that is past last block " << (total_blocks - 1)
                << LOG_ENDL;
    return;
  }
  if (next_block_table_[block_number] != -1) {
    LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
                   "block number = " << block_number
                << ", next block should be -1 but is "
                << next_block_table_[block_number] << LOG_ENDL;
    return;
  }
  const uint32_t hash_table_index = GetHashTableIndex(hash_value);
  const int first_matching_block = hash_table_[hash_table_index];
  if (first_matching_block < 0) {
    // This is the first entry with this hash value
    hash_table_[hash_table_index] = block_number;
    last_block_table_[block_number] = block_number;
  } else {
    // Add this entry at the end of the chain of matching blocks
    const int last_matching_block = last_block_table_[first_matching_block];
    if (next_block_table_[last_matching_block] != -1) {
      LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
                     "first matching block = " << first_matching_block
                  << ", last matching block = " << last_matching_block
                  << ", next block should be -1 but is "
                  << next_block_table_[last_matching_block] << LOG_ENDL;
      return;
    }
    next_block_table_[last_matching_block] = block_number;
    last_block_table_[first_matching_block] = block_number;
  }
  last_block_added_ = block_number;
}
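
// Illustration of the chain structure that AddBlock() maintains: suppose that
// blocks 2, 5, and 9 (and no others) hash to the same table index h.  After
// all three have been added,
//     hash_table_[h] == 2                                (head of the chain)
//     next_block_table_[2] == 5, next_block_table_[5] == 9,
//     next_block_table_[9] == -1                         (end of the chain)
//     last_block_table_[2] == 9      (tail pointer, kept only for the head)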

void BlockHash::AddAllBlocks() {
  AddAllBlocksThroughIndex(static_cast<int>(source_size_));
}

void BlockHash::AddAllBlocksThroughIndex(int end_index) {
  if (end_index > static_cast<int>(source_size_)) {
    LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
                   " with index " << end_index
                << " higher than end index " << source_size_ << LOG_ENDL;
    return;
  }
  const int last_index_added = last_block_added_ * kBlockSize;
  if (end_index <= last_index_added) {
    LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
                   " with index " << end_index
                << " <= last index added (" << last_index_added
                << ")" << LOG_ENDL;
    return;
  }
  int end_limit = end_index;
  // Don't allow reading any indices at or past source_size_.
  // The Hash function extends (kBlockSize - 1) bytes past the index,
  // so leave a margin of that size.
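  // For example, if source_size() == 100 and kBlockSize == 16, the last legal
  // hash index is 84, since Hash() reads the bytes at indices [84, 99].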
  int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
  if (end_limit > last_legal_hash_index) {
    end_limit = last_legal_hash_index + 1;
  }
  const char* block_ptr = source_data() + NextIndexToAdd();
  const char* const end_ptr = source_data() + end_limit;
  while (block_ptr < end_ptr) {
    AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
    block_ptr += kBlockSize;
  }
}

COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
               kBlockSize_must_be_a_multiple_of_machine_word_size);

// A recursive template to compare a fixed number
// of (possibly unaligned) machine words starting
// at addresses block1 and block2.  Returns true or false
// depending on whether an exact match was found.
template<int number_of_words>
inline bool CompareWholeWordValues(const char* block1,
                                   const char* block2) {
  return CompareWholeWordValues<1>(block1, block2) &&
         CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
                                                     block2 + sizeof(uword_t));
}

// The base of the recursive template: compare one pair of machine words.
template<>
inline bool CompareWholeWordValues<1>(const char* word1,
                                      const char* word2) {
  uword_t aligned_word1, aligned_word2;
  memcpy(&aligned_word1, word1, sizeof(aligned_word1));
  memcpy(&aligned_word2, word2, sizeof(aligned_word2));
  return aligned_word1 == aligned_word2;
}
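
// For example, with 8-byte machine words and a 32-byte block,
// CompareWholeWordValues<4> expands at compile time into four unaligned word
// loads and comparisons joined by &&, with no loop overhead.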

// A block must be composed of an integral number of machine words
// (uword_t values).  This function takes advantage of that fact
// by comparing the blocks as series of (possibly unaligned) word values.
// A word-sized comparison can be performed as a single
// machine instruction.  Comparing words instead of bytes means that,
// on a 64-bit platform, this function will use 8 times fewer test-and-branch
// instructions than a byte-by-byte comparison.  Even with the extra
// cost of the calls to memcpy, this method is still at least twice as fast
// as memcmp (measured using gcc on a 64-bit platform, with a block size
// of 32).  For blocks with identical contents (a common case), this method
// is over six times faster than memcmp.
inline bool BlockCompareWordsInline(const char* block1, const char* block2) {
  static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
  return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
}

bool BlockHash::BlockCompareWords(const char* block1, const char* block2) {
  return BlockCompareWordsInline(block1, block2);
}

inline bool BlockContentsMatchInline(const char* block1, const char* block2) {
  // Optimize for mismatch in first byte.  Since this function is called only
  // when the hash values of the two blocks match, it is very likely that either
  // the blocks are identical, or else the first byte does not match.
  if (*block1 != *block2) {
    return false;
  }
#ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS
  return BlockCompareWordsInline(block1, block2);
#else  // !VCDIFF_USE_BLOCK_COMPARE_WORDS
  return memcmp(block1, block2, BlockHash::kBlockSize) == 0;
#endif  // VCDIFF_USE_BLOCK_COMPARE_WORDS
}

bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
  return BlockContentsMatchInline(block1, block2);
}

inline int BlockHash::SkipNonMatchingBlocks(int block_number,
                                            const char* block_ptr) const {
  int probes = 0;
  while ((block_number >= 0) &&
         !BlockContentsMatchInline(block_ptr,
                                   &source_data_[block_number * kBlockSize])) {
    if (++probes > kMaxProbes) {
      return -1;  // Avoid too much chaining
    }
    block_number = next_block_table_[block_number];
  }
  return block_number;
}
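
// Continuing the chain example above: if block_ptr has the same hash value as
// blocks 2, 5, and 9, but its contents match only block 5, then
// SkipNonMatchingBlocks(2, block_ptr) skips block 2 and returns 5.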

// Init() must have been called and returned true before using
// FirstMatchingBlock or NextMatchingBlock.  No check is performed
// for this condition; the code will crash if this condition is violated.
inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
                                               const char* block_ptr) const {
  return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
                               block_ptr);
}

int BlockHash::FirstMatchingBlock(uint32_t hash_value,
                                  const char* block_ptr) const {
  return FirstMatchingBlockInline(hash_value, block_ptr);
}

int BlockHash::NextMatchingBlock(int block_number,
                                 const char* block_ptr) const {
  if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
    LOG(DFATAL) << "NextMatchingBlock called for invalid block number "
                << block_number << LOG_ENDL;
    return -1;
  }
  return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
}

// Keep a count of the number of matches found.  This will throttle the
// number of iterations in FindBestMatch.  For example, if the entire
// dictionary is made up of spaces (' ') and the search string is also
// made up of spaces, there will be one match for each block in the
// dictionary.
inline bool BlockHash::TooManyMatches(int* match_counter) {
  ++(*match_counter);
  return (*match_counter) > kMaxMatchesToCheck;
}

// Returns the number of bytes to the left of source_match_start
// that match the corresponding bytes to the left of target_match_start.
// Will not examine more than max_bytes bytes, which is to say that
// the return value will be in the range [0, max_bytes] inclusive.
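// For example, if the three bytes immediately before source_match_start and
// the three bytes immediately before target_match_start are both "abc", but
// the bytes preceding those differ, this function returns 3 (assuming
// max_bytes >= 3).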
int BlockHash::MatchingBytesToLeft(const char* source_match_start,
                                   const char* target_match_start,
                                   int max_bytes) {
  const char* source_ptr = source_match_start;
  const char* target_ptr = target_match_start;
  int bytes_found = 0;
  while (bytes_found < max_bytes) {
    --source_ptr;
    --target_ptr;
    if (*source_ptr != *target_ptr) {
      break;
    }
    ++bytes_found;
  }
  return bytes_found;
}

// Returns the number of bytes starting at source_match_end
// that match the corresponding bytes starting at target_match_end.
// Will not examine more than max_bytes bytes, which is to say that
// the return value will be in the range [0, max_bytes] inclusive.
int BlockHash::MatchingBytesToRight(const char* source_match_end,
                                    const char* target_match_end,
                                    int max_bytes) {
  const char* source_ptr = source_match_end;
  const char* target_ptr = target_match_end;
  int bytes_found = 0;
  while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
    ++bytes_found;
    ++source_ptr;
    ++target_ptr;
  }
  return bytes_found;
}

// No NULL checks are performed on the pointer arguments.  The caller
// must guarantee that none of the arguments is NULL, or a crash will occur.
//
// The vast majority of calls to FindBestMatch enter the loop *zero* times,
// which is to say that most candidate blocks find no matches in the dictionary.
// The important sections for optimization are therefore the code outside the
// loop and the code within the loop conditions; keep the work done there to a
// minimum.
void BlockHash::FindBestMatch(uint32_t hash_value,
                              const char* target_candidate_start,
                              const char* target_start,
                              size_t target_size,
                              Match* best_match) const {
  int match_counter = 0;
  for (int block_number = FirstMatchingBlockInline(hash_value,
                                                   target_candidate_start);
       (block_number >= 0) && !TooManyMatches(&match_counter);
       block_number = NextMatchingBlock(block_number, target_candidate_start)) {
    int source_match_offset = block_number * kBlockSize;
    const int source_match_end = source_match_offset + kBlockSize;

    int target_match_offset =
        static_cast<int>(target_candidate_start - target_start);
    const int target_match_end = target_match_offset + kBlockSize;

    size_t match_size = kBlockSize;
    {
      // Extend match start towards beginning of unencoded data
      const int limit_bytes_to_left = std::min(source_match_offset,
                                               target_match_offset);
      const int matching_bytes_to_left =
          MatchingBytesToLeft(source_data_ + source_match_offset,
                              target_start + target_match_offset,
                              limit_bytes_to_left);
      source_match_offset -= matching_bytes_to_left;
      target_match_offset -= matching_bytes_to_left;
      match_size += matching_bytes_to_left;
    }
    {
      // Extend match end towards end of unencoded data
      const size_t source_bytes_to_right = source_size_ - source_match_end;
      const size_t target_bytes_to_right = target_size - target_match_end;
      const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
                                                   target_bytes_to_right);
      match_size +=
          MatchingBytesToRight(source_data_ + source_match_end,
                               target_start + target_match_end,
                               static_cast<int>(limit_bytes_to_right));
    }
    // Update in/out parameter if the best match found was better
    // than any match already stored in *best_match.
    best_match->ReplaceIfBetterMatch(match_size,
                                     source_match_offset + starting_offset_,
                                     target_match_offset);
  }
}
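
// Illustrative usage sketch (not part of the library): how an encoder might
// look up the best dictionary match for one candidate position in the target.
// The Match accessors size(), source_offset() and target_offset() are assumed
// to be declared in blockhash.h; the caller-side variables are hypothetical.
//
//   const BlockHash* dictionary_hash =
//       BlockHash::CreateDictionaryHash(dictionary_data, dictionary_size);
//   if (dictionary_hash != NULL) {
//     BlockHash::Match best_match;
//     const char* candidate_pos = target_data;  // some position in the target
//     const uint32_t hash_value =
//         RollingHash<BlockHash::kBlockSize>::Hash(candidate_pos);
//     dictionary_hash->FindBestMatch(hash_value, candidate_pos,
//                                    target_data, target_size, &best_match);
//     if (best_match.size() > 0) {
//       // best_match.source_offset() and best_match.target_offset()
//       // describe the extended match that was found.
//     }
//     delete dictionary_hash;
//   }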

}  // namespace open_vcdiff