diff options
Diffstat (limited to 'brotli/enc/encode.cc')
-rw-r--r-- | brotli/enc/encode.cc | 189 |
1 files changed, 141 insertions, 48 deletions
diff --git a/brotli/enc/encode.cc b/brotli/enc/encode.cc index b492421..6603eec 100644 --- a/brotli/enc/encode.cc +++ b/brotli/enc/encode.cc @@ -77,6 +77,68 @@ void EncodeVarLenUint8(int n, int* storage_ix, uint8_t* storage) { } } +int ParseAsUTF8(int* symbol, const uint8_t* input, int size) { + // ASCII + if ((input[0] & 0x80) == 0) { + *symbol = input[0]; + if (*symbol > 0) { + return 1; + } + } + // 2-byte UTF8 + if (size > 1 && + (input[0] & 0xe0) == 0xc0 && + (input[1] & 0xc0) == 0x80) { + *symbol = (((input[0] & 0x1f) << 6) | + (input[1] & 0x3f)); + if (*symbol > 0x7f) { + return 2; + } + } + // 3-byte UFT8 + if (size > 2 && + (input[0] & 0xf0) == 0xe0 && + (input[1] & 0xc0) == 0x80 && + (input[2] & 0xc0) == 0x80) { + *symbol = (((input[0] & 0x0f) << 12) | + ((input[1] & 0x3f) << 6) | + (input[2] & 0x3f)); + if (*symbol > 0x7ff) { + return 3; + } + } + // 4-byte UFT8 + if (size > 3 && + (input[0] & 0xf8) == 0xf0 && + (input[1] & 0xc0) == 0x80 && + (input[2] & 0xc0) == 0x80 && + (input[3] & 0xc0) == 0x80) { + *symbol = (((input[0] & 0x07) << 18) | + ((input[1] & 0x3f) << 12) | + ((input[2] & 0x3f) << 6) | + (input[3] & 0x3f)); + if (*symbol > 0xffff && *symbol <= 0x10ffff) { + return 4; + } + } + // Not UTF8, emit a special symbol above the UTF8-code space + *symbol = 0x110000 | input[0]; + return 1; +} + +// Returns true if at least min_fraction of the data is UTF8-encoded. +bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) { + size_t size_utf8 = 0; + size_t pos = 0; + while (pos < length) { + int symbol; + int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos); + pos += bytes_read; + if (symbol < 0x110000) size_utf8 += bytes_read; + } + return size_utf8 > min_fraction * length; +} + void EncodeMetaBlockLength(size_t meta_block_size, bool is_last, bool is_uncompressed, @@ -118,7 +180,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask( const uint8_t* code_length_bitdepth, int* storage_ix, uint8_t* storage) { static const uint8_t kStorageOrder[kCodeLengthCodes] = { - 1, 2, 3, 4, 0, 17, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; // Throw away trailing zeros: int codes_to_store = kCodeLengthCodes; @@ -147,7 +209,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask( WriteBits(2, skip_some, storage_ix, storage); for (int i = skip_some; i < codes_to_store; ++i) { uint8_t len[] = { 2, 4, 3, 2, 2, 4 }; - uint8_t bits[] = { 0, 5, 1, 3, 2, 13 }; + uint8_t bits[] = { 0, 7, 3, 2, 1, 15 }; int v = code_length_bitdepth[kStorageOrder[i]]; WriteBits(len[v], bits[v], storage_ix, storage); } @@ -175,54 +237,49 @@ void StoreHuffmanTreeToBitMask( } template<int kSize> -void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size, - int* storage_ix, uint8_t* storage) { +void StoreHuffmanCodeSimple( + const EntropyCode<kSize>& code, int alphabet_size, + int max_bits, + int* storage_ix, uint8_t* storage) { const uint8_t *depth = &code.depth_[0]; - int max_bits_counter = alphabet_size - 1; - int max_bits = 0; - while (max_bits_counter) { - max_bits_counter >>= 1; - ++max_bits; - } - if (code.count_ == 0) { // emit minimal tree for empty cases - // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0 - WriteBits(4 + max_bits, 0x1, storage_ix, storage); - return; - } - if (code.count_ <= 4) { - int symbols[4]; - // Quadratic sort. - int k, j; - for (k = 0; k < code.count_; ++k) { - symbols[k] = code.symbols_[k]; - } - for (k = 0; k < code.count_; ++k) { - for (j = k + 1; j < code.count_; ++j) { - if (depth[symbols[j]] < depth[symbols[k]]) { - int t = symbols[k]; - symbols[k] = symbols[j]; - symbols[j] = t; - } + int symbols[4]; + // Quadratic sort. + int k, j; + for (k = 0; k < code.count_; ++k) { + symbols[k] = code.symbols_[k]; + } + for (k = 0; k < code.count_; ++k) { + for (j = k + 1; j < code.count_; ++j) { + if (depth[symbols[j]] < depth[symbols[k]]) { + int t = symbols[k]; + symbols[k] = symbols[j]; + symbols[j] = t; } } - // Small tree marker to encode 1-4 symbols. - WriteBits(2, 1, storage_ix, storage); - WriteBits(2, code.count_ - 1, storage_ix, storage); - for (int i = 0; i < code.count_; ++i) { - WriteBits(max_bits, symbols[i], storage_ix, storage); - } - if (code.count_ == 4) { - if (depth[symbols[0]] == 2 && - depth[symbols[1]] == 2 && - depth[symbols[2]] == 2 && - depth[symbols[3]] == 2) { - WriteBits(1, 0, storage_ix, storage); - } else { - WriteBits(1, 1, storage_ix, storage); - } + } + // Small tree marker to encode 1-4 symbols. + WriteBits(2, 1, storage_ix, storage); + WriteBits(2, code.count_ - 1, storage_ix, storage); + for (int i = 0; i < code.count_; ++i) { + WriteBits(max_bits, symbols[i], storage_ix, storage); + } + if (code.count_ == 4) { + if (depth[symbols[0]] == 2 && + depth[symbols[1]] == 2 && + depth[symbols[2]] == 2 && + depth[symbols[3]] == 2) { + WriteBits(1, 0, storage_ix, storage); + } else { + WriteBits(1, 1, storage_ix, storage); } - return; } +} + +template<int kSize> +void StoreHuffmanCodeComplex( + const EntropyCode<kSize>& code, int alphabet_size, + int* storage_ix, uint8_t* storage) { + const uint8_t *depth = &code.depth_[0]; uint8_t huffman_tree[kSize]; uint8_t huffman_tree_extra_bits[kSize]; int huffman_tree_size = 0; @@ -246,6 +303,31 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size, storage_ix, storage); } + +template<int kSize> +void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size, + int* storage_ix, uint8_t* storage) { + int max_bits_counter = alphabet_size - 1; + int max_bits = 0; + while (max_bits_counter) { + max_bits_counter >>= 1; + ++max_bits; + } + if (code.count_ == 0) { + // Emit a minimal tree for empty cases. + // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0 + WriteBits(4 + max_bits, 0x1, storage_ix, storage); + } else if (code.count_ <= 4) { + StoreHuffmanCodeSimple( + code, alphabet_size, max_bits, + storage_ix, storage); + } else { + StoreHuffmanCodeComplex( + code, alphabet_size, + storage_ix, storage); + } +} + template<int kSize> void StoreHuffmanCodes(const std::vector<EntropyCode<kSize> >& codes, int alphabet_size, @@ -798,12 +880,23 @@ void BrotliCompressor::WriteMetaBlock(const size_t input_size, const bool is_last, size_t* encoded_size, uint8_t* encoded_buffer) { + static const double kMinUTF8Ratio = 0.75; + bool utf8_mode = false; std::vector<Command> commands; if (input_size > 0) { ringbuffer_.Write(input_buffer, input_size); - EstimateBitCostsForLiterals(input_pos_, input_size, - kRingBufferMask, ringbuffer_.start(), - &literal_cost_[0]); + utf8_mode = IsMostlyUTF8( + &ringbuffer_.start()[input_pos_ & kRingBufferMask], + input_size, kMinUTF8Ratio); + if (utf8_mode) { + EstimateBitCostsForLiteralsUTF8(input_pos_, input_size, + kRingBufferMask, ringbuffer_.start(), + &literal_cost_[0]); + } else { + EstimateBitCostsForLiterals(input_pos_, input_size, + kRingBufferMask, ringbuffer_.start(), + &literal_cost_[0]); + } CreateBackwardReferences(input_size, input_pos_, ringbuffer_.start(), &literal_cost_[0], |