diff options
author | Zoltan Szabadka <szabadka@google.com> | 2014-03-20 14:32:35 +0100 |
---|---|---|
committer | Zoltan Szabadka <szabadka@google.com> | 2014-03-20 14:32:35 +0100 |
commit | 278b89484fa947fad7fbf8753aadff0c9ce18a8c (patch) | |
tree | 7c6bb974a72f53a8570496915da4d983c3766e7c | |
parent | 7f848593bd2ec83f4537b6d494a5bf55b9bd4456 (diff) | |
download | src-278b89484fa947fad7fbf8753aadff0c9ce18a8c.tar.gz |
Updates to Brotli compression format, decoder and encoder
This commit contains a batch of changes that were made to the Brotli
compression algorithm in the last month. Most important changes:
* Format change: don't push distances representing static dictionary words to the distance cache.
* Fix decoder invalid memory access bug caused by building a non-complete Huffman tree.
* Add a mode parameter to the encoder interface.
* Use different hashers for text and font mode.
* Add a heuristics to the hasher for skipping non-compressible data.
* Exhaustive search of static dictionary during backward reference search.
-rw-r--r-- | brotli/brotlispec.txt | 4 | ||||
-rw-r--r-- | brotli/dec/decode.c | 27 | ||||
-rw-r--r-- | brotli/enc/backward_references.cc | 74 | ||||
-rw-r--r-- | brotli/enc/backward_references.h | 3 | ||||
-rw-r--r-- | brotli/enc/bit_cost.h | 22 | ||||
-rw-r--r-- | brotli/enc/block_splitter.cc | 1 | ||||
-rw-r--r-- | brotli/enc/command.h | 8 | ||||
-rw-r--r-- | brotli/enc/encode.cc | 304 | ||||
-rw-r--r-- | brotli/enc/encode.h | 21 | ||||
-rw-r--r-- | brotli/enc/hash.h | 257 | ||||
-rw-r--r-- | brotli/enc/literal_cost.cc | 33 | ||||
-rw-r--r-- | brotli/enc/literal_cost.h | 4 | ||||
-rw-r--r-- | brotli/enc/prefix.cc | 2 | ||||
-rw-r--r-- | brotli/enc/static_dict.h | 68 |
14 files changed, 547 insertions, 281 deletions
diff --git a/brotli/brotlispec.txt b/brotli/brotlispec.txt index 0010bfb..594517f 100644 --- a/brotli/brotlispec.txt +++ b/brotli/brotlispec.txt @@ -589,7 +589,9 @@ Abstract in the sequence of distances) is not pushed to the ring-buffer of last distances, in other words, the expression "(second, third, fourth) last distance" means the (second, third, fourth) last - distance that was not represented by a 0 distance code. + distance that was not represented by a 0 distance code. Similarly, + distances that represent static dictionary words (see Section 8.) are + not pushed to the ringbuffer of last distances. The next NDIRECT distance codes, from 16 to 15 + NDIRECT, represent distances from 1 to NDIRECT. Neither the distance short codes, nor diff --git a/brotli/dec/decode.c b/brotli/dec/decode.c index fc66e15..5df28e0 100644 --- a/brotli/dec/decode.c +++ b/brotli/dec/decode.c @@ -247,12 +247,23 @@ static int ReadHuffmanCode(int alphabet_size, code_lengths[symbols[0]] = 1; switch (num_symbols) { case 1: + break; case 3: + ok = ((symbols[0] != symbols[1]) && + (symbols[0] != symbols[2]) && + (symbols[1] != symbols[2])); break; case 2: + ok = (symbols[0] != symbols[1]); code_lengths[symbols[1]] = 1; break; case 4: + ok = ((symbols[0] != symbols[1]) && + (symbols[0] != symbols[2]) && + (symbols[0] != symbols[3]) && + (symbols[1] != symbols[2]) && + (symbols[1] != symbols[3]) && + (symbols[2] != symbols[3])); if (BrotliReadBits(br, 1)) { code_lengths[symbols[2]] = 3; code_lengths[symbols[3]] = 3; @@ -266,6 +277,7 @@ static int ReadHuffmanCode(int alphabet_size, int i; uint8_t code_length_code_lengths[CODE_LENGTH_CODES] = { 0 }; int space = 32; + int num_codes = 0; /* Static Huffman code for the code length code lengths */ static const HuffmanCode huff[16] = { {2, 0}, {2, 4}, {2, 3}, {3, 2}, {2, 0}, {2, 4}, {2, 3}, {4, 1}, @@ -283,10 +295,12 @@ static int ReadHuffmanCode(int alphabet_size, BROTLI_LOG_ARRAY_INDEX(code_length_code_lengths, code_len_idx); if (v != 0) { space -= (32 >> v); + ++num_codes; } } - ok = ReadHuffmanCodeLengths(code_length_code_lengths, - alphabet_size, code_lengths, br); + ok = (num_codes == 1 || space == 0) && + ReadHuffmanCodeLengths(code_length_code_lengths, + alphabet_size, code_lengths, br); } if (ok) { table_size = BrotliBuildHuffmanTable(table, HUFFMAN_TABLE_BITS, @@ -961,10 +975,6 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) { ok = 0; goto End; } - if (distance_code > 0) { - dist_rb[dist_rb_idx & 3] = distance; - ++dist_rb_idx; - } BROTLI_LOG_UINT(distance); if (pos < max_backward_distance && @@ -1016,6 +1026,11 @@ int BrotliDecompress(BrotliInput input, BrotliOutput output) { goto End; } } else { + if (distance_code > 0) { + dist_rb[dist_rb_idx & 3] = distance; + ++dist_rb_idx; + } + if (copy_length > meta_block_remaining_len) { printf("Invalid backward reference. pos: %d distance: %d " "len: %d bytes left: %d\n", pos, distance, copy_length, diff --git a/brotli/enc/backward_references.cc b/brotli/enc/backward_references.cc index f76d7d4..837dbf3 100644 --- a/brotli/enc/backward_references.cc +++ b/brotli/enc/backward_references.cc @@ -23,6 +23,7 @@ namespace brotli { +template<typename Hasher> void CreateBackwardReferences(size_t num_bytes, size_t position, const uint8_t* ringbuffer, @@ -38,6 +39,9 @@ void CreateBackwardReferences(size_t num_bytes, const int i_diff = position - i; const size_t i_end = i + num_bytes; + const int random_heuristics_window_size = 512; + int apply_random_heuristics = i + random_heuristics_window_size; + double average_cost = 0.0; for (int k = position; k < position + num_bytes; ++k) { average_cost += literal_cost[k & ringbuffer_mask]; @@ -65,7 +69,8 @@ void CreateBackwardReferences(size_t num_bytes, bool match_found = hasher->FindLongestMatch( ringbuffer, literal_cost, ringbuffer_mask, i + i_diff, i_end - i, max_distance, - &best_len, &best_len_code, &best_dist, &best_score, &in_dictionary); + &best_len, &best_len_code, &best_dist, &best_score, + &in_dictionary); bool best_in_dictionary = in_dictionary; if (match_found) { if (match_found_M1 && best_score_M1 > best_score) { @@ -138,16 +143,20 @@ void CreateBackwardReferences(size_t num_bytes, } } } + apply_random_heuristics = + i + 2 * best_len + random_heuristics_window_size; Command cmd; cmd.insert_length_ = insert_length; cmd.copy_length_ = best_len; cmd.copy_length_code_ = best_len_code; cmd.copy_distance_ = best_dist; commands->push_back(cmd); - hasher->set_last_distance(best_dist); - insert_length = 0; ++i; + if (best_dist <= std::min(i + i_diff, max_backward_limit)) { + hasher->set_last_distance(best_dist); + } + // Copy all copied literals to the hasher, except the last one. // We cannot store the last one yet, otherwise we couldn't find // the possible M1 match. @@ -158,7 +167,8 @@ void CreateBackwardReferences(size_t num_bytes, ++i; } // Prepare M1 match. - if (best_len >= 4 && i + 20 < i_end && !best_in_dictionary) { + if (hasher->HasStaticDictionary() && + best_len >= 4 && i + 20 < i_end && !best_in_dictionary) { max_distance = std::min(i + i_diff, max_backward_limit); match_found_M1 = hasher->FindLongestMatch( ringbuffer, literal_cost, ringbuffer_mask, @@ -185,6 +195,32 @@ void CreateBackwardReferences(size_t num_bytes, ++insert_length; hasher->Store(ringbuffer + i, i + i_diff); ++i; + // If we have not seen matches for a long time, we can skip some + // match lookups. Unsuccessful match lookups are very very expensive + // and this kind of a heuristic speeds up compression quite + // a lot. + if (i > apply_random_heuristics) { + // Going through uncompressible data, jump. + if (i > apply_random_heuristics + 4 * random_heuristics_window_size) { + // It is quite a long time since we saw a copy, so we assume + // that this data is not compressible, and store hashes less + // often. Hashes of non compressible data are less likely to + // turn out to be useful in the future, too, so we store less of + // them to not to flood out the hash table of good compressible + // data. + int i_jump = std::min(i + 16, i_end - 4); + for (; i < i_jump; i += 4) { + hasher->Store(ringbuffer + i, i + i_diff); + insert_length += 4; + } + } else { + int i_jump = std::min(i + 8, i_end - 2); + for (; i < i_jump; i += 2) { + hasher->Store(ringbuffer + i, i + i_diff); + insert_length += 2; + } + } + } } } insert_length += (i_end - i); @@ -198,4 +234,34 @@ void CreateBackwardReferences(size_t num_bytes, } } +void CreateBackwardReferences(size_t num_bytes, + size_t position, + const uint8_t* ringbuffer, + const float* literal_cost, + size_t ringbuffer_mask, + const size_t max_backward_limit, + Hashers* hashers, + Hashers::Type hash_type, + std::vector<Command>* commands) { + switch (hash_type) { + case Hashers::HASH_15_8_4: + CreateBackwardReferences( + num_bytes, position, ringbuffer, literal_cost, + ringbuffer_mask, max_backward_limit, + hashers->hash_15_8_4.get(), + commands); + break; + case Hashers::HASH_15_8_2: + CreateBackwardReferences( + num_bytes, position, ringbuffer, literal_cost, + ringbuffer_mask, max_backward_limit, + hashers->hash_15_8_2.get(), + commands); + break; + default: + break; + } +} + + } // namespace brotli diff --git a/brotli/enc/backward_references.h b/brotli/enc/backward_references.h index f666ef6..bf90c7e 100644 --- a/brotli/enc/backward_references.h +++ b/brotli/enc/backward_references.h @@ -31,7 +31,8 @@ void CreateBackwardReferences(size_t num_bytes, const float* literal_cost, size_t ringbuffer_mask, const size_t max_backward_limit, - Hasher* hasher, + Hashers* hashers, + Hashers::Type hash_type, std::vector<Command>* commands); } // namespace brotli diff --git a/brotli/enc/bit_cost.h b/brotli/enc/bit_cost.h index c2fd3e4..46e6229 100644 --- a/brotli/enc/bit_cost.h +++ b/brotli/enc/bit_cost.h @@ -46,6 +46,7 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) { int max_depth = 1; int histogram[kCodeLengthCodes] = { 0 }; int tail_start = 0; + int prev_value = 8; // compute histogram of compacted huffman tree for (int i = 0; i < length;) { const int value = depth[i]; @@ -57,29 +58,32 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) { ++reps; } i += reps; + if (i == length && value == 0) + break; if (value == 0) { if (reps < 3) { histogram[0] += reps; } else { - reps -= 3; - while (reps >= 0) { + reps -= 2; + while (reps > 0) { ++histogram[17]; reps >>= 3; - --reps; } } } else { tail_start = i; - ++histogram[value]; - --reps; + if (value != prev_value) { + ++histogram[value]; + --reps; + } + prev_value = value; if (reps < 3) { histogram[value] += reps; } else { - reps -= 3; - while (reps >= 0) { + reps -= 2; + while (reps > 0) { ++histogram[16]; reps >>= 2; - --reps; } } } @@ -93,7 +97,7 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) { cost[17] += 3; int tree_size = 0; - int bits = 6 + 2 * max_depth; // huffman tree of huffman tree cost + int bits = 18 + 2 * max_depth; // huffman tree of huffman tree cost for (int i = 0; i < kCodeLengthCodes; ++i) { bits += histogram[i] * cost[i]; // huffman tree bit cost tree_size += histogram[i]; diff --git a/brotli/enc/block_splitter.cc b/brotli/enc/block_splitter.cc index 57c1e90..a24d0fb 100644 --- a/brotli/enc/block_splitter.cc +++ b/brotli/enc/block_splitter.cc @@ -348,6 +348,7 @@ void SplitBlock(const std::vector<Command>& cmds, &insert_and_copy_codes, &distance_prefixes); + SplitByteVector<HistogramLiteral>( literals, kSymbolsPerLiteralHistogram, kMaxLiteralHistograms, diff --git a/brotli/enc/command.h b/brotli/enc/command.h index 7a9f481..f979973 100644 --- a/brotli/enc/command.h +++ b/brotli/enc/command.h @@ -24,9 +24,13 @@ namespace brotli { // Command holds a sequence of literals and a backward reference copy. class Command { public: + // distance_code_ is initialized to 17 because it refers to the distance + // code of a backward distance of 1, this way the last insert-only command + // won't use the last-distance short code, and accordingly distance_prefix_ is + // set to 16 Command() : insert_length_(0), copy_length_(0), copy_length_code_(0), - copy_distance_(0), distance_code_(0), - distance_prefix_(0), command_prefix_(0), + copy_distance_(0), distance_code_(17), + distance_prefix_(16), command_prefix_(0), distance_extra_bits_(0), distance_extra_bits_value_(0) {} uint32_t insert_length_; diff --git a/brotli/enc/encode.cc b/brotli/enc/encode.cc index fa04834..4ae96bc 100644 --- a/brotli/enc/encode.cc +++ b/brotli/enc/encode.cc @@ -168,15 +168,6 @@ void EncodeMetaBlockLength(size_t meta_block_size, } } -template<int kSize> -void EntropyEncode(int val, const EntropyCode<kSize>& code, - int* storage_ix, uint8_t* storage) { - if (code.count_ <= 1) { - return; - }; - WriteBits(code.depth_[val], code.bits_[val], storage_ix, storage); -} - void StoreHuffmanTreeOfHuffmanTreeToBitMask( const uint8_t* code_length_bitdepth, int* storage_ix, uint8_t* storage) { @@ -225,7 +216,9 @@ void StoreHuffmanTreeToBitMask( for (int i = 0; i < huffman_tree_size; ++i) { const int ix = huffman_tree[i]; const int extra_bits = huffman_tree_extra_bits[i]; - EntropyEncode(ix, entropy, storage_ix, storage); + if (entropy.count_ > 1) { + WriteBits(entropy.depth_[ix], entropy.bits_[ix], storage_ix, storage); + } switch (ix) { case 16: WriteBits(2, extra_bits, storage_ix, storage); @@ -240,8 +233,7 @@ void StoreHuffmanTreeToBitMask( template<int kSize> void StoreHuffmanCodeSimple( const EntropyCode<kSize>& code, int alphabet_size, - int max_bits, - int* storage_ix, uint8_t* storage) { + int max_bits, int* storage_ix, uint8_t* storage) { const uint8_t *depth = &code.depth_[0]; int symbols[4]; // Quadratic sort. @@ -304,37 +296,66 @@ void StoreHuffmanCodeComplex( storage_ix, storage); } - template<int kSize> -void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size, - int* storage_ix, uint8_t* storage) { +void BuildAndStoreEntropyCode(const Histogram<kSize>& histogram, + const int tree_limit, + const int alphabet_size, + EntropyCode<kSize>* code, + int* storage_ix, uint8_t* storage) { + memset(code->depth_, 0, sizeof(code->depth_)); + memset(code->bits_, 0, sizeof(code->bits_)); + memset(code->symbols_, 0, sizeof(code->symbols_)); + code->count_ = 0; + int max_bits_counter = alphabet_size - 1; int max_bits = 0; while (max_bits_counter) { max_bits_counter >>= 1; ++max_bits; } - if (code.count_ == 0) { - // Emit a minimal tree for empty cases. - // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0 - WriteBits(4 + max_bits, 0x1, storage_ix, storage); - } else if (code.count_ <= 4) { - StoreHuffmanCodeSimple( - code, alphabet_size, max_bits, - storage_ix, storage); + + for (size_t i = 0; i < alphabet_size; i++) { + if (histogram.data_[i] > 0) { + if (code->count_ < 4) code->symbols_[code->count_] = i; + ++code->count_; + } + } + + if (code->count_ <= 1) { + WriteBits(2, 1, storage_ix, storage); + WriteBits(2, 0, storage_ix, storage); + WriteBits(max_bits, code->symbols_[0], storage_ix, storage); + return; + } + + if (alphabet_size >= 50 && code->count_ >= 16) { + std::vector<int> counts(alphabet_size); + memcpy(&counts[0], histogram.data_, sizeof(counts[0]) * alphabet_size); + OptimizeHuffmanCountsForRle(alphabet_size, &counts[0]); + CreateHuffmanTree(&counts[0], alphabet_size, tree_limit, code->depth_); } else { - StoreHuffmanCodeComplex( - code, alphabet_size, - storage_ix, storage); + CreateHuffmanTree(histogram.data_, alphabet_size, tree_limit, code->depth_); + } + ConvertBitDepthsToSymbols(code->depth_, alphabet_size, code->bits_); + + if (code->count_ <= 4) { + StoreHuffmanCodeSimple(*code, alphabet_size, max_bits, storage_ix, storage); + } else { + StoreHuffmanCodeComplex(*code, alphabet_size, storage_ix, storage); } } template<int kSize> -void StoreHuffmanCodes(const std::vector<EntropyCode<kSize> >& codes, - int alphabet_size, - int* storage_ix, uint8_t* storage) { - for (int i = 0; i < codes.size(); ++i) { - StoreHuffmanCode(codes[i], alphabet_size, storage_ix, storage); +void BuildAndStoreEntropyCodes( + const std::vector<Histogram<kSize> >& histograms, + int alphabet_size, + std::vector<EntropyCode<kSize> >* entropy_codes, + int* storage_ix, uint8_t* storage) { + entropy_codes->resize(histograms.size()); + for (int i = 0; i < histograms.size(); ++i) { + BuildAndStoreEntropyCode(histograms[i], 15, alphabet_size, + &(*entropy_codes)[i], + storage_ix, storage); } } @@ -342,7 +363,7 @@ void EncodeCommand(const Command& cmd, const EntropyCodeCommand& entropy, int* storage_ix, uint8_t* storage) { int code = cmd.command_prefix_; - EntropyEncode(code, entropy, storage_ix, storage); + WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage); if (code >= 128) { code -= 128; } @@ -364,13 +385,15 @@ void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy, int code = cmd.distance_prefix_; int extra_bits = cmd.distance_extra_bits_; uint64_t extra_bits_val = cmd.distance_extra_bits_value_; - EntropyEncode(code, entropy, storage_ix, storage); + WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage); if (extra_bits > 0) { WriteBits(extra_bits, extra_bits_val, storage_ix, storage); } } void ComputeDistanceShortCodes(std::vector<Command>* cmds, + size_t pos, + const size_t max_backward, int* dist_ringbuffer, size_t* ringbuffer_idx) { static const int kIndexOffset[16] = { @@ -380,30 +403,40 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds, 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3 }; for (int i = 0; i < cmds->size(); ++i) { + pos += (*cmds)[i].insert_length_; + size_t max_distance = std::min(pos, max_backward); int cur_dist = (*cmds)[i].copy_distance_; - if (cur_dist == 0) break; int dist_code = cur_dist + 16; - int limits[16] = { 0, 4, 10, 11, - 6, 6, 11, 11, - 11, 11, 11, 11, - 12, 12, 12, 12 }; - for (int k = 0; k < 16; ++k) { - // Only accept more popular choices. - if (cur_dist < limits[k]) { - // Typically unpopular ranges, don't replace a short distance - // with them. - continue; + if (cur_dist <= max_distance) { + if (cur_dist == 0) break; + int limits[16] = { 0, 0, 0, 0, + 6, 6, 11, 11, + 11, 11, 11, 11, + 12, 12, 12, 12 }; + for (int k = 0; k < 16; ++k) { + // Only accept more popular choices. + if (cur_dist < limits[k]) { + // Typically unpopular ranges, don't replace a short distance + // with them. + continue; + } + int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] + + kValueOffset[k]); + if (cur_dist == comp) { + dist_code = k + 1; + break; + } } - int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] + - kValueOffset[k]); - if (cur_dist == comp) { - dist_code = k + 1; - break; + if (dist_code > 1) { + dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist; + ++(*ringbuffer_idx); } - } - if (dist_code > 1) { - dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist; - ++(*ringbuffer_idx); + pos += (*cmds)[i].copy_length_; + } else { + int word_idx = cur_dist - max_distance - 1; + const std::string word = + GetTransformedDictionaryWord((*cmds)[i].copy_length_code_, word_idx); + pos += word.size(); } (*cmds)[i].distance_code_ = dist_code; } @@ -558,18 +591,20 @@ void EncodeContextMap(const std::vector<int>& context_map, for (int i = 0; i < rle_symbols.size(); ++i) { symbol_histogram.Add(rle_symbols[i]); } - EntropyCodeContextMap symbol_code; - BuildEntropyCode(symbol_histogram, 15, num_clusters + max_run_length_prefix, - &symbol_code); bool use_rle = max_run_length_prefix > 0; WriteBits(1, use_rle, storage_ix, storage); if (use_rle) { WriteBits(4, max_run_length_prefix - 1, storage_ix, storage); } - StoreHuffmanCode(symbol_code, num_clusters + max_run_length_prefix, - storage_ix, storage); + EntropyCodeContextMap symbol_code; + BuildAndStoreEntropyCode(symbol_histogram, 15, + num_clusters + max_run_length_prefix, + &symbol_code, + storage_ix, storage); for (int i = 0; i < rle_symbols.size(); ++i) { - EntropyEncode(rle_symbols[i], symbol_code, storage_ix, storage); + WriteBits(symbol_code.depth_[rle_symbols[i]], + symbol_code.bits_[rle_symbols[i]], + storage_ix, storage); if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) { WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage); } @@ -577,16 +612,6 @@ void EncodeContextMap(const std::vector<int>& context_map, WriteBits(1, 1, storage_ix, storage); // use move-to-front } -template<int kSize> -void BuildEntropyCodes(const std::vector<Histogram<kSize> >& histograms, - int alphabet_size, - std::vector<EntropyCode<kSize> >* entropy_codes) { - entropy_codes->resize(histograms.size()); - for (int i = 0; i < histograms.size(); ++i) { - BuildEntropyCode(histograms[i], 15, alphabet_size, &(*entropy_codes)[i]); - } -} - struct BlockSplitCode { EntropyCodeBlockType block_type_code; EntropyCodeBlockLength block_len_code; @@ -598,8 +623,8 @@ void EncodeBlockLength(const EntropyCodeBlockLength& entropy, int len_code = BlockLengthPrefix(length); int extra_bits = BlockLengthExtraBits(len_code); int extra_bits_value = length - BlockLengthOffset(len_code); - EntropyEncode(len_code, entropy, storage_ix, storage); - + WriteBits(entropy.depth_[len_code], entropy.bits_[len_code], + storage_ix, storage); if (extra_bits > 0) { WriteBits(extra_bits, extra_bits_value, storage_ix, storage); } @@ -632,26 +657,25 @@ void BuildAndEncodeBlockSplitCode(const BlockSplit& split, BlockSplitCode* code, int* storage_ix, uint8_t* storage) { EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage); + if (split.num_types_ == 1) { return; } HistogramBlockType type_histo; - for (int i = 0; i < split.type_codes_.size(); ++i) { + for (int i = 1; i < split.type_codes_.size(); ++i) { type_histo.Add(split.type_codes_[i]); } - BuildEntropyCode(type_histo, 15, split.num_types_ + 2, - &code->block_type_code); HistogramBlockLength length_histo; for (int i = 0; i < split.lengths_.size(); ++i) { length_histo.Add(BlockLengthPrefix(split.lengths_[i])); } - BuildEntropyCode(length_histo, 15, kNumBlockLenPrefixes, - &code->block_len_code); - StoreHuffmanCode(code->block_type_code, split.num_types_ + 2, - storage_ix, storage); - StoreHuffmanCode(code->block_len_code, kNumBlockLenPrefixes, - storage_ix, storage); + BuildAndStoreEntropyCode(type_histo, 15, split.num_types_ + 2, + &code->block_type_code, + storage_ix, storage); + BuildAndStoreEntropyCode(length_histo, 15, kNumBlockLenPrefixes, + &code->block_len_code, + storage_ix, storage); EncodeBlockLength(code->block_len_code, split.lengths_[0], storage_ix, storage); } @@ -664,7 +688,9 @@ void MoveAndEncode(const BlockSplitCode& code, it->type_ = it->split_.types_[it->idx_]; it->length_ = it->split_.lengths_[it->idx_]; int type_code = it->split_.type_codes_[it->idx_]; - EntropyEncode(type_code, code.block_type_code, storage_ix, storage); + WriteBits(code.block_type_code.depth_[type_code], + code.block_type_code.bits_[type_code], + storage_ix, storage); EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage); } --it->length_; @@ -773,10 +799,8 @@ void StoreMetaBlock(const MetaBlock& mb, int* storage_ix, uint8_t* storage) { size_t length = MetaBlockLength(mb.cmds); const size_t end_pos = *pos + length; - EncodeMetaBlockLength(length, - is_last, - false, - storage_ix, storage); + EncodeMetaBlockLength(length, is_last, false, storage_ix, storage); + if (length == 0) { return; } @@ -792,26 +816,27 @@ void StoreMetaBlock(const MetaBlock& mb, WriteBits(2, mb.params.distance_postfix_bits, storage_ix, storage); WriteBits(4, mb.params.num_direct_distance_codes >> - mb.params.distance_postfix_bits, storage_ix, storage); + mb.params.distance_postfix_bits, + storage_ix, storage); int num_distance_codes = kNumDistanceShortCodes + mb.params.num_direct_distance_codes + (48 << mb.params.distance_postfix_bits); for (int i = 0; i < mb.literal_split.num_types_; ++i) { WriteBits(2, mb.literal_context_modes[i], storage_ix, storage); } - EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(), storage_ix, storage); - EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(), storage_ix, storage); + EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(), + storage_ix, storage); + EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(), + storage_ix, storage); std::vector<EntropyCodeLiteral> literal_codes; std::vector<EntropyCodeCommand> command_codes; std::vector<EntropyCodeDistance> distance_codes; - BuildEntropyCodes(mb.literal_histograms, 256, &literal_codes); - BuildEntropyCodes(mb.command_histograms, kNumCommandPrefixes, - &command_codes); - BuildEntropyCodes(mb.distance_histograms, num_distance_codes, - &distance_codes); - StoreHuffmanCodes(literal_codes, 256, storage_ix, storage); - StoreHuffmanCodes(command_codes, kNumCommandPrefixes, storage_ix, storage); - StoreHuffmanCodes(distance_codes, num_distance_codes, storage_ix, storage); + BuildAndStoreEntropyCodes(mb.literal_histograms, 256, &literal_codes, + storage_ix, storage); + BuildAndStoreEntropyCodes(mb.command_histograms, kNumCommandPrefixes, + &command_codes, storage_ix, storage); + BuildAndStoreEntropyCodes(mb.distance_histograms, num_distance_codes, + &distance_codes, storage_ix, storage); BlockSplitIterator literal_it(mb.literal_split); BlockSplitIterator command_it(mb.command_split); BlockSplitIterator distance_it(mb.distance_split); @@ -828,8 +853,10 @@ void StoreMetaBlock(const MetaBlock& mb, Context(prev_byte, prev_byte2, mb.literal_context_modes[literal_it.type_])); histogram_idx = mb.literal_context_map[context]; - EntropyEncode(ringbuffer[*pos & mask], - literal_codes[histogram_idx], storage_ix, storage); + int literal = ringbuffer[*pos & mask]; + WriteBits(literal_codes[histogram_idx].depth_[literal], + literal_codes[histogram_idx].bits_[literal], + storage_ix, storage); ++(*pos); } if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) { @@ -845,9 +872,10 @@ void StoreMetaBlock(const MetaBlock& mb, } } -BrotliCompressor::BrotliCompressor() - : window_bits_(kWindowBits), - hasher_(new Hasher), +BrotliCompressor::BrotliCompressor(BrotliParams params) + : params_(params), + window_bits_(kWindowBits), + hashers_(new Hashers()), dist_ringbuffer_idx_(0), input_pos_(0), ringbuffer_(kRingBufferBits, kMetaBlockSizeBits), @@ -859,28 +887,41 @@ BrotliCompressor::BrotliCompressor() dist_ringbuffer_[2] = 11; dist_ringbuffer_[3] = 4; storage_[0] = 0; - StoreDictionaryWordHashes(); + switch (params.mode) { + case BrotliParams::MODE_TEXT: hash_type_ = Hashers::HASH_15_8_4; break; + case BrotliParams::MODE_FONT: hash_type_ = Hashers::HASH_15_8_2; break; + default: break; + } + hashers_->Init(hash_type_); + if (params.mode == BrotliParams::MODE_TEXT) { + StoreDictionaryWordHashes(); + } } BrotliCompressor::~BrotliCompressor() { - delete hasher_; delete[] storage_; } +StaticDictionary *BrotliCompressor::static_dictionary_ = NULL; + void BrotliCompressor::StoreDictionaryWordHashes() { - for (int t = kNumTransforms - 1; t >= 0; --t) { - for (int i = kMaxDictionaryWordLength; i >= 3; --i) { - const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i]; - for (int j = num_words - 1; j >= 0; --j) { - int word_id = t * num_words + j; - std::string word = GetTransformedDictionaryWord(i, word_id); - if (word.size() >= 3) { - hasher_->Store(reinterpret_cast<const uint8_t*>(&word[0]), - (-1) * ((i << 20) + word_id + 1)); + const int num_transforms = kNumTransforms; + if (static_dictionary_ == NULL) { + static_dictionary_ = new StaticDictionary; + for (int t = num_transforms - 1; t >= 0; --t) { + for (int i = kMaxDictionaryWordLength; i >= 3; --i) { + const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i]; + for (int j = num_words - 1; j >= 0; --j) { + int word_id = t * num_words + j; + std::string word = GetTransformedDictionaryWord(i, word_id); + if (word.size() >= 3) { + static_dictionary_->Insert(word, i, word_id); + } } } } } + hashers_->SetStaticDictionary(static_dictionary_); } void BrotliCompressor::WriteStreamHeader() { @@ -908,25 +949,30 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size, input_size, kMinUTF8Ratio); if (utf8_mode) { EstimateBitCostsForLiteralsUTF8(input_pos_, input_size, - kRingBufferMask, ringbuffer_.start(), - &literal_cost_[0]); + kRingBufferMask, kRingBufferMask, + ringbuffer_.start(), &literal_cost_[0]); } else { EstimateBitCostsForLiterals(input_pos_, input_size, - kRingBufferMask, ringbuffer_.start(), - &literal_cost_[0]); - } - CreateBackwardReferences(input_size, input_pos_, - ringbuffer_.start(), - &literal_cost_[0], - kRingBufferMask, kMaxBackwardDistance, - hasher_, - &commands); - ComputeDistanceShortCodes(&commands, dist_ringbuffer_, + kRingBufferMask, kRingBufferMask, + ringbuffer_.start(), &literal_cost_[0]); + } + CreateBackwardReferences( + input_size, input_pos_, + ringbuffer_.start(), + &literal_cost_[0], + kRingBufferMask, kMaxBackwardDistance, + hashers_.get(), + hash_type_, + &commands); + ComputeDistanceShortCodes(&commands, input_pos_, kMaxBackwardDistance, + dist_ringbuffer_, &dist_ringbuffer_idx_); } EncodingParams params; - params.num_direct_distance_codes = 12; - params.distance_postfix_bits = 1; + params.num_direct_distance_codes = + params_.mode == BrotliParams::MODE_FONT ? 12 : 0; + params.distance_postfix_bits = + params_.mode == BrotliParams::MODE_FONT ? 1 : 0; params.literal_context_mode = CONTEXT_SIGNED; const int storage_ix0 = storage_ix_; MetaBlock mb; @@ -935,6 +981,7 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size, StoreMetaBlock(mb, is_last, ringbuffer_.start(), kRingBufferMask, &input_pos_, &storage_ix_, storage_); size_t output_size = is_last ? ((storage_ix_ + 7) >> 3) : (storage_ix_ >> 3); + output_size -= (storage_ix0 >> 3); if (input_size + 4 < output_size) { storage_ix_ = storage_ix0; storage_[storage_ix_ >> 3] &= (1 << (storage_ix_ & 7)) - 1; @@ -968,7 +1015,8 @@ void BrotliCompressor::FinishStream( } -int BrotliCompressBuffer(size_t input_size, +int BrotliCompressBuffer(BrotliParams params, + size_t input_size, const uint8_t* input_buffer, size_t* encoded_size, uint8_t* encoded_buffer) { @@ -978,7 +1026,7 @@ int BrotliCompressBuffer(size_t input_size, return 1; } - BrotliCompressor compressor; + BrotliCompressor compressor(params); compressor.WriteStreamHeader(); const int max_block_size = 1 << kMetaBlockSizeBits; diff --git a/brotli/enc/encode.h b/brotli/enc/encode.h index 49bd5b1..a128f7e 100644 --- a/brotli/enc/encode.h +++ b/brotli/enc/encode.h @@ -23,12 +23,23 @@ #include <vector> #include "./hash.h" #include "./ringbuffer.h" +#include "./static_dict.h" namespace brotli { +struct BrotliParams { + enum Mode { + MODE_TEXT = 0, + MODE_FONT = 1, + }; + Mode mode; + + BrotliParams() : mode(MODE_TEXT) {} +}; + class BrotliCompressor { public: - BrotliCompressor(); + explicit BrotliCompressor(BrotliParams params); ~BrotliCompressor(); // Writes the stream header into the internal output buffer. @@ -53,8 +64,10 @@ class BrotliCompressor { // Initializes the hasher with the hashes of dictionary words. void StoreDictionaryWordHashes(); + BrotliParams params_; int window_bits_; - Hasher* hasher_; + std::unique_ptr<Hashers> hashers_; + Hashers::Type hash_type_; int dist_ringbuffer_[4]; size_t dist_ringbuffer_idx_; size_t input_pos_; @@ -62,12 +75,14 @@ class BrotliCompressor { std::vector<float> literal_cost_; int storage_ix_; uint8_t* storage_; + static StaticDictionary *static_dictionary_; }; // Compresses the data in input_buffer into encoded_buffer, and sets // *encoded_size to the compressed length. // Returns 0 if there was an error and 1 otherwise. -int BrotliCompressBuffer(size_t input_size, +int BrotliCompressBuffer(BrotliParams params, + size_t input_size, const uint8_t* input_buffer, size_t* encoded_size, uint8_t* encoded_buffer); diff --git a/brotli/enc/hash.h b/brotli/enc/hash.h index d6a415b..b07a538 100644 --- a/brotli/enc/hash.h +++ b/brotli/enc/hash.h @@ -24,12 +24,14 @@ #include <sys/types.h> #include <algorithm> #include <cstdlib> +#include <memory> #include <string> #include "./transform.h" #include "./fast_log.h" #include "./find_match_length.h" #include "./port.h" +#include "./static_dict.h" namespace brotli { @@ -41,11 +43,21 @@ namespace brotli { // * The number has been tuned heuristically against compression benchmarks. static const uint32_t kHashMul32 = 0x1e35a7bd; -inline uint32_t Hash3Bytes(const uint8_t *data, const int bits) { - uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32; - // The higher bits contain more mixture from the multiplication, - // so we take our results from there. - return h >> (32 - bits); +template<int kShiftBits, int kMinLength> +inline uint32_t Hash(const uint8_t *data) { + if (kMinLength <= 3) { + // If kMinLength is 2 or 3, we hash the first 3 bytes of data. + uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32; + // The higher bits contain more mixture from the multiplication, + // so we take our results from there. + return h >> (32 - kShiftBits); + } else { + // If kMinLength is at least 4, we hash the first 4 bytes of data. + uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; + // The higher bits contain more mixture from the multiplication, + // so we take our results from there. + return h >> (32 - kShiftBits); + } } // Usually, we always choose the longest backward reference. This function @@ -67,32 +79,35 @@ inline double BackwardReferenceScore(double average_cost, double start_cost3, double start_cost2, int copy_length, - int backward_reference_offset, - int last_distance1, - int last_distance2, - int last_distance3, - int last_distance4) { + int backward_reference_offset) { double retval = 0; switch (copy_length) { case 2: retval = start_cost2; break; case 3: retval = start_cost3; break; default: retval = start_cost4 + (copy_length - 4) * average_cost; break; } - int diff_last1 = abs(backward_reference_offset - last_distance1); - int diff_last2 = abs(backward_reference_offset - last_distance2); - if (diff_last1 == 0) { - retval += 0.6; - } else if (diff_last1 < 4) { - retval -= 0.9 + 0.03 * diff_last1; - } else if (diff_last2 < 4) { - retval -= 0.95 + 0.1 * diff_last2; - } else if (backward_reference_offset == last_distance3) { - retval -= 1.17; - } else if (backward_reference_offset == last_distance4) { - retval -= 1.27; - } else { - retval -= 1.20 * Log2Floor(backward_reference_offset); + retval -= 1.20 * Log2Floor(backward_reference_offset); + return retval; +} + +inline double BackwardReferenceScoreUsingLastDistance(double average_cost, + double start_cost4, + double start_cost3, + double start_cost2, + int copy_length, + int distance_short_code) { + double retval = 0; + switch (copy_length) { + case 2: retval = start_cost2; break; + case 3: retval = start_cost3; break; + default: retval = start_cost4 + (copy_length - 4) * average_cost; break; } + static const double kDistanceShortCodeBitCost[16] = { + -0.6, 0.95, 1.17, 1.27, + 0.93, 0.93, 0.96, 0.96, 0.99, 0.99, + 1.05, 1.05, 1.15, 1.15, 1.25, 1.25 + }; + retval -= kDistanceShortCodeBitCost[distance_short_code]; return retval; } @@ -102,7 +117,7 @@ inline double BackwardReferenceScore(double average_cost, // This is a hash map of fixed size (kBucketSize) to a ring buffer of // fixed size (kBlockSize). The ring buffer contains the last kBlockSize // index positions of the given hash key in the compressed data. -template <int kBucketBits, int kBlockBits> +template <int kBucketBits, int kBlockBits, int kMinLength> class HashLongestMatch { public: HashLongestMatch() @@ -111,17 +126,24 @@ class HashLongestMatch { last_distance3_(15), last_distance4_(16), insert_length_(0), - average_cost_(5.4) { + average_cost_(5.4), + static_dict_(NULL) { Reset(); } void Reset() { std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0); } + void SetStaticDictionary(const StaticDictionary *dict) { + static_dict_ = dict; + } + bool HasStaticDictionary() const { + return static_dict_ != NULL; + } // Look at 3 bytes at data. // Compute a hash from these, and store the value of ix at that position. inline void Store(const uint8_t *data, const int ix) { - const uint32_t key = Hash3Bytes(data, kBucketBits); + const uint32_t key = Hash<kBucketBits, kMinLength>(data); const int minor_ix = num_[key] & kBlockMask; buckets_[key][minor_ix] = ix; ++num_[key]; @@ -218,19 +240,17 @@ class HashLongestMatch { const size_t len = FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], max_length); - if (len >= 3 || (len == 2 && i < 2)) { + if (len >= std::max(kMinLength, 3) || + (kMinLength == 2 && len == 2 && i < 2)) { // Comparing for >= 2 does not change the semantics, but just saves for // a few unnecessary binary logarithms in backward reference score, // since we are not interested in such short matches. - const double score = BackwardReferenceScore(average_cost_, - start_cost4, - start_cost3, - start_cost2, - len, backward, - last_distance1_, - last_distance2_, - last_distance3_, - last_distance4_); + const double score = BackwardReferenceScoreUsingLastDistance( + average_cost_, + start_cost4, + start_cost3, + start_cost2, + len, i); if (best_score < score) { best_score = score; best_len = len; @@ -244,76 +264,41 @@ class HashLongestMatch { } } } - const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits); - const int * __restrict const bucket = &buckets_[key][0]; - const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0; - int stop = int(cur_ix) - 64; - if (stop < 0) { stop = 0; } + if (kMinLength == 2) { + int stop = int(cur_ix) - 64; + if (stop < 0) { stop = 0; } + start_cost2 -= 1.0; + for (int i = cur_ix - 1; i > stop; --i) { + size_t prev_ix = i; + const size_t backward = cur_ix - prev_ix; + if (PREDICT_FALSE(backward > max_backward)) { + break; + } + prev_ix &= ring_buffer_mask; + if (data[cur_ix_masked] != data[prev_ix] || + data[cur_ix_masked + 1] != data[prev_ix + 1]) { + continue; + } + int len = 2; + const double score = start_cost2 - 2.3 * Log2Floor(backward); - start_cost2 -= 1.0; - for (int i = cur_ix - 1; i > stop; --i) { - size_t prev_ix = i; - const size_t backward = cur_ix - prev_ix; - if (PREDICT_FALSE(backward > max_backward)) { - break; - } - prev_ix &= ring_buffer_mask; - if (data[cur_ix_masked] != data[prev_ix] || - data[cur_ix_masked + 1] != data[prev_ix + 1]) { - continue; - } - int len = 2; - const double score = start_cost2 - 2.3 * Log2Floor(backward); - - if (best_score < score) { - best_score = score; - best_len = len; - best_ix = backward; - *best_len_out = best_len; - *best_len_code_out = best_len; - *best_distance_out = best_ix; - match_found = true; + if (best_score < score) { + best_score = score; + best_len = len; + best_ix = backward; + *best_len_out = best_len; + *best_len_code_out = best_len; + *best_distance_out = best_ix; + match_found = true; + } } } + const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]); + const int * __restrict const bucket = &buckets_[key][0]; + const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0; for (int i = num_[key] - 1; i >= down; --i) { int prev_ix = bucket[i & kBlockMask]; - if (prev_ix < 0) { - prev_ix *= -1; - prev_ix -= 1; - int copy_len_code = prev_ix >> 20; - int word_id = prev_ix & 0xfffff; - std::string word = GetTransformedDictionaryWord(copy_len_code, word_id); - int len = word.size(); - const size_t backward = max_backward + word_id + 1; - bool word_matched = (len >= 3 && len <= max_length); - for (int k = 0; k < len && word_matched; ++k) { - if ((uint8_t)(word[k]) != data[cur_ix_masked + k]) { - word_matched = false; - } - } - if (word_matched) { - const double score = BackwardReferenceScore(average_cost_, - start_cost4, - start_cost3, - start_cost2, - len, backward, - last_distance1_, - last_distance2_, - last_distance3_, - last_distance4_); - if (best_score < score) { - best_score = score; - best_len = len; - best_ix = backward; - *best_len_out = best_len; - *best_len_code_out = copy_len_code; - *best_distance_out = best_ix; - *best_score_out = best_score; - match_found = true; - *in_dictionary = true; - } - } - } else { + if (prev_ix >= 0) { const size_t backward = cur_ix - prev_ix; if (PREDICT_FALSE(backward > max_backward)) { break; @@ -327,7 +312,7 @@ class HashLongestMatch { const size_t len = FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], max_length); - if (len >= 3) { + if (len >= std::max(kMinLength, 3)) { // Comparing for >= 3 does not change the semantics, but just saves // for a few unnecessary binary logarithms in backward reference // score, since we are not interested in such short matches. @@ -335,11 +320,7 @@ class HashLongestMatch { start_cost4, start_cost3, start_cost2, - len, backward, - last_distance1_, - last_distance2_, - last_distance3_, - last_distance4_); + len, backward); if (best_score < score) { best_score = score; best_len = len; @@ -354,6 +335,36 @@ class HashLongestMatch { } } } + if (static_dict_ != NULL) { + // We decide based on first 4 bytes how many bytes to test for. + int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]); + int maxlen = static_dict_->GetLength(prefix); + for (int len = std::min<size_t>(maxlen, max_length); + len > best_len && len >= 4; --len) { + std::string snippet((const char *)&data[cur_ix_masked], len); + int copy_len_code; + int word_id; + if (static_dict_->Get(snippet, ©_len_code, &word_id)) { + const size_t backward = max_backward + word_id + 1; + const double score = BackwardReferenceScore(average_cost_, + start_cost4, + start_cost3, + start_cost2, + len, backward); + if (best_score < score) { + best_score = score; + best_len = len; + best_ix = backward; + *best_len_out = best_len; + *best_len_code_out = copy_len_code; + *best_distance_out = best_ix; + *best_score_out = best_score; + match_found = true; + *in_dictionary = true; + } + } + } + } return match_found; } @@ -399,9 +410,37 @@ class HashLongestMatch { int insert_length_; double average_cost_; + + const StaticDictionary *static_dict_; }; -typedef HashLongestMatch<13, 11> Hasher; +struct Hashers { + enum Type { + HASH_15_8_4 = 0, + HASH_15_8_2 = 1, + }; + + void Init(Type type) { + switch (type) { + case HASH_15_8_4: + hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>()); + break; + case HASH_15_8_2: + hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>()); + break; + default: + break; + } + } + + void SetStaticDictionary(const StaticDictionary *dict) { + if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict); + if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict); + } + + std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4; + std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2; +}; } // namespace brotli diff --git a/brotli/enc/literal_cost.cc b/brotli/enc/literal_cost.cc index a944599..9179923 100644 --- a/brotli/enc/literal_cost.cc +++ b/brotli/enc/literal_cost.cc @@ -59,7 +59,8 @@ static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask, } void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, - const uint8_t *data, float *cost) { + size_t cost_mask, const uint8_t *data, + float *cost) { // max_utf8 is 0 (normal ascii single byte modeling), // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling). @@ -110,18 +111,20 @@ void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, if (histo == 0) { histo = 1; } - cost[masked_pos] = log2(static_cast<double>(in_window_utf8[utf8_pos]) - / histo); - cost[masked_pos] += 0.02905; - if (cost[masked_pos] < 1.0) { - cost[masked_pos] *= 0.5; - cost[masked_pos] += 0.5; + float lit_cost = log2(static_cast<double>(in_window_utf8[utf8_pos]) + / histo); + lit_cost += 0.02905; + if (lit_cost < 1.0) { + lit_cost *= 0.5; + lit_cost += 0.5; } + cost[(pos + i) & cost_mask] = lit_cost; } } void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, - const uint8_t *data, float *cost) { + size_t cost_mask, const uint8_t *data, + float *cost) { int histogram[256] = { 0 }; int window_half = 2000; int in_window = std::min(static_cast<size_t>(window_half), len); @@ -143,17 +146,17 @@ void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, ++histogram[data[(pos + i + window_half) & mask]]; ++in_window; } - int masked_pos = (pos + i) & mask; - int histo = histogram[data[masked_pos]]; + int histo = histogram[data[(pos + i) & mask]]; if (histo == 0) { histo = 1; } - cost[masked_pos] = log2(static_cast<double>(in_window) / histo); - cost[masked_pos] += 0.029; - if (cost[masked_pos] < 1.0) { - cost[masked_pos] *= 0.5; - cost[masked_pos] += 0.5; + float lit_cost = log2(static_cast<double>(in_window) / histo); + lit_cost += 0.029; + if (lit_cost < 1.0) { + lit_cost *= 0.5; + lit_cost += 0.5; } + cost[(pos + i) & cost_mask] = lit_cost; } } diff --git a/brotli/enc/literal_cost.h b/brotli/enc/literal_cost.h index ca39a4e..5c1eca4 100644 --- a/brotli/enc/literal_cost.h +++ b/brotli/enc/literal_cost.h @@ -26,11 +26,11 @@ namespace brotli { // ringbuffer (data, mask) will take entropy coded and writes these estimates // to the ringbuffer (cost, mask). void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask, - const uint8_t *data, + size_t cost_mask, const uint8_t *data, float *cost); void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask, - const uint8_t *data, + size_t cost_mask, const uint8_t *data, float *cost); } // namespace brotli diff --git a/brotli/enc/prefix.cc b/brotli/enc/prefix.cc index 3e43501..50b671e 100644 --- a/brotli/enc/prefix.cc +++ b/brotli/enc/prefix.cc @@ -90,7 +90,7 @@ int CopyLengthPrefix(int length) { int CommandPrefix(int insert_length, int copy_length) { if (copy_length == 0) { - copy_length = 3; + copy_length = 4; } int insert_prefix = InsertLengthPrefix(insert_length); int copy_prefix = CopyLengthPrefix(copy_length); diff --git a/brotli/enc/static_dict.h b/brotli/enc/static_dict.h new file mode 100644 index 0000000..1b5a77f --- /dev/null +++ b/brotli/enc/static_dict.h @@ -0,0 +1,68 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Class to model the static dictionary. + +#ifndef BROTLI_ENC_STATIC_DICT_H_ +#define BROTLI_ENC_STATIC_DICT_H_ + +#include <algorithm> +#include <unordered_map> +#include <string> + +namespace brotli { + +class StaticDictionary { + public: + StaticDictionary() {} + void Insert(const std::string &str, int len, int dist) { + int ix = (dist << 6) + len; + std::unordered_map<std::string, int>::const_iterator it = map_.find(str); + if (it != map_.end() && ix >= it->second) { + return; + } + map_[str] = ix; + int v = 0; + for (int i = 0; i < 4 && i < str.size(); ++i) { + v += str[i] << (8 * i); + } + if (prefix_map_[v] < str.size()) { + prefix_map_[v] = str.size(); + } + } + int GetLength(int v) const { + std::unordered_map<int, int>::const_iterator it = prefix_map_.find(v); + if (it == prefix_map_.end()) { + return 0; + } + return it->second; + } + bool Get(const std::string &str, int *len, int *dist) const { + std::unordered_map<std::string, int>::const_iterator it = map_.find(str); + if (it == map_.end()) { + return false; + } + int v = it->second; + *len = v & 63; + *dist = v >> 6; + return true; + } + private: + std::unordered_map<std::string, int> map_; + std::unordered_map<int, int> prefix_map_; +}; + +} // namespace brotli + +#endif // BROTLI_ENC_STATIC_DICT_H_ |