Updates to Brotli compression format, decoder and encoder

This commit contains a batch of changes that were made to the Brotli compression algorithm in the last three weeks. Most important changes: * Added UTF8 context model for good text compression. * Simplified context modeling by having only 4 context modes. * Per-block context mode selection. * Faster backward copying and bit reading functions. * More efficient histogram coding. * Streaming support for the decoder and encoder.
author: Zoltan Szabadka <szabadka@google.com> 2013-11-15 19:02:17 +0100
committer: Zoltan Szabadka <szabadka@google.com> 2013-11-15 19:02:17 +0100
commit: 1571db36a9b00e895882ee236e9f84d62f8ea226 (patch)
tree: 365fdb3cbd81ab9dc863b4ec39dcba6837616b00
parent: 79e99afe468407e9ff9f0820df7190cb069eabeb (diff)
download: src-1571db36a9b00e895882ee236e9f84d62f8ea226.tar.gz
23 files changed, 1644 insertions, 867 deletions
diff --git a/brotli/dec/bit_reader.c b/brotli/dec/bit_reader.c
index 858afcb..9781248 100644
--- a/brotli/dec/bit_reader.c
+++ b/brotli/dec/bit_reader.c
@@ -15,6 +15,7 @@
 // Bit reading helpers
 
 #include <assert.h>
+#include <stdlib.h>
 
 #include "./bit_reader.h"
 
@@ -22,99 +23,24 @@
 extern "C" {
 #endif
 
-#define MAX_NUM_BIT_READ 25
-
-#define LBITS 64      // Number of bits prefetched.
-#define WBITS 32      // Minimum number of bytes needed after
-                      // BrotliFillBitWindow.
-#define LOG8_WBITS 4  // Number of bytes needed to store WBITS bits.
-
-static const uint32_t kBitMask[MAX_NUM_BIT_READ] = {
-  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
-  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
-};
-
-void BrotliInitBitReader(BrotliBitReader* const br,
-                         const uint8_t* const start,
-                         size_t length) {
+int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input) {
   size_t i;
   assert(br != NULL);
-  assert(start != NULL);
-  assert(length < 0xfffffff8u);   // can't happen with a RIFF chunk.
 
-  br->buf_ = start;
-  br->len_ = length;
+  br->input_ = input;
   br->val_ = 0;
   br->pos_ = 0;
   br->bit_pos_ = 0;
+  br->end_pos_ = 0;
   br->eos_ = 0;
-  br->error_ = 0;
-  for (i = 0; i < sizeof(br->val_) && i < br->len_; ++i) {
-    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
-    ++br->pos_;
+  if (!BrotliReadMoreInput(br)) {
+    return 0;
   }
-}
-
-void BrotliBitReaderSetBuffer(BrotliBitReader* const br,
-                              const uint8_t* const buf, size_t len) {
-  assert(br != NULL);
-  assert(buf != NULL);
-  assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
-  br->eos_ = (br->pos_ >= len);
-  br->buf_ = buf;
-  br->len_ = len;
-}
-
-// If not at EOS, reload up to LBITS byte-by-byte
-static void ShiftBytes(BrotliBitReader* const br) {
-  while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
-    br->val_ >>= 8;
-    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (LBITS - 8);
+  for (i = 0; i < sizeof(br->val_); ++i) {
+    br->val_ |= ((uint64_t)br->buf_[br->pos_]) << (8 * i);
     ++br->pos_;
-    br->bit_pos_ -= 8;
-  }
-}
-
-void BrotliFillBitWindow(BrotliBitReader* const br) {
-  if (br->bit_pos_ >= WBITS) {
-#if (defined(__x86_64__) || defined(_M_X64))
-    if (br->pos_ + sizeof(br->val_) < br->len_) {
-      br->val_ >>= WBITS;
-      br->bit_pos_ -= WBITS;
-      // The expression below needs a little-endian arch to work correctly.
-      // This gives a large speedup for decoding speed.
-      br->val_ |= *(const uint64_t*)(br->buf_ + br->pos_) << (LBITS - WBITS);
-      br->pos_ += LOG8_WBITS;
-      return;
-    }
-#endif
-    ShiftBytes(br);       // Slow path.
-    if (br->pos_ == br->len_ && br->bit_pos_ == LBITS) {
-      br->eos_ = 1;
-    }
-  }
-}
-
-uint32_t BrotliReadBits(BrotliBitReader* const br, int n_bits) {
-  assert(n_bits >= 0);
-  // Flag an error if end_of_stream or n_bits is more than allowed limit.
-  if (n_bits == 0 || (!br->eos_ && n_bits < MAX_NUM_BIT_READ)) {
-    const uint32_t val =
-        (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
-    const int new_bits = br->bit_pos_ + n_bits;
-    br->bit_pos_ = new_bits;
-    // If this read is going to cross the read buffer, set the eos flag.
-    if (br->pos_ == br->len_) {
-      if (new_bits >= LBITS) {
-        br->eos_ = 1;
-      }
-    }
-    ShiftBytes(br);
-    return val;
-  } else {
-    br->error_ = 1;
-    return 0;
   }
+  return (br->end_pos_ > 0);
 }
 
 #if defined(__cplusplus) || defined(c_plusplus)
diff --git a/brotli/dec/bit_reader.h b/brotli/dec/bit_reader.h
index 8ee16e5..c3f84f4 100644
--- a/brotli/dec/bit_reader.h
+++ b/brotli/dec/bit_reader.h
@@ -17,34 +17,39 @@
 #ifndef BROTLI_DEC_BIT_READER_H_
 #define BROTLI_DEC_BIT_READER_H_
 
+#include <string.h>
+#include "./streams.h"
 #include "./types.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
-typedef struct {
-  uint64_t       val_;        // pre-fetched bits
-  const uint8_t* buf_;        // input byte buffer
-  size_t         len_;        // buffer length
-  size_t         pos_;        // byte position in buf_
-  int            bit_pos_;    // current bit-reading position in val_
-  int            eos_;        // bitstream is finished
-  int            error_;      // an error occurred (buffer overflow attempt...)
-} BrotliBitReader;
+#define BROTLI_MAX_NUM_BIT_READ   25
+#define BROTLI_READ_SIZE          4096
+#define BROTLI_IBUF_SIZE          (2 * BROTLI_READ_SIZE + 32)
+#define BROTLI_IBUF_MASK          (2 * BROTLI_READ_SIZE - 1)
 
-void BrotliInitBitReader(BrotliBitReader* const br,
-                         const uint8_t* const start,
-                         size_t length);
+#define UNALIGNED_COPY64(dst, src) *(uint64_t*)(dst) = *(const uint64_t*)(src)
 
-//  Sets a new data buffer.
-void BrotliBitReaderSetBuffer(BrotliBitReader* const br,
-                              const uint8_t* const buffer, size_t length);
+static const uint32_t kBitMask[BROTLI_MAX_NUM_BIT_READ] = {
+  0, 1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767,
+  65535, 131071, 262143, 524287, 1048575, 2097151, 4194303, 8388607, 16777215
+};
 
-// Reads the specified number of bits from Read Buffer.
-// Flags an error in case end_of_stream or n_bits is more than allowed limit.
-// Flags eos if this read attempt is going to cross the read buffer.
-uint32_t BrotliReadBits(BrotliBitReader* const br, int n_bits);
+typedef struct {
+  // Input byte buffer, consist of a ringbuffer and a "slack" region where
+  // bytes from the start of the ringbuffer are copied.
+  uint8_t buf_[BROTLI_IBUF_SIZE];
+  BrotliInput input_;    // input callback
+  uint64_t    val_;      // pre-fetched bits
+  size_t      pos_;      // byte position in stream
+  int         bit_pos_;  // current bit-reading position in val_
+  size_t      end_pos_;  // current end position in stream
+  int         eos_;      // input stream is finished
+} BrotliBitReader;
+
+int BrotliInitBitReader(BrotliBitReader* const br, BrotliInput input);
 
 // Return the prefetched bits, so they can be looked up.
 static BROTLI_INLINE uint32_t BrotliPrefetchBits(BrotliBitReader* const br) {
@@ -57,8 +62,92 @@ static BROTLI_INLINE void BrotliSetBitPos(BrotliBitReader* const br, int val) {
   br->bit_pos_ = val;
 }
 
-// Advances the Read buffer by 4 bytes to make room for reading next 32 bits.
-void BrotliFillBitWindow(BrotliBitReader* const br);
+// Reload up to 64 bits byte-by-byte
+static BROTLI_INLINE void ShiftBytes(BrotliBitReader* const br) {
+  while (br->bit_pos_ >= 8) {
+    br->val_ >>= 8;
+    br->val_ |= ((uint64_t)br->buf_[br->pos_ & BROTLI_IBUF_MASK]) << 56;
+    ++br->pos_;
+    br->bit_pos_ -= 8;
+  }
+}
+
+// Fills up the input ringbuffer by calling the input callback.
+//
+// Does nothing if there are at least 32 bytes present after current position.
+//
+// Returns 0 if either:
+//  - the input callback returned an error, or
+//  - there is no more input and the position is past the end of the stream.
+//
+// After encountering the end of the input stream, 32 additional zero bytes are
+// copied to the ringbuffer, therefore it is safe to call this function after
+// every 32 bytes of input is read.
+static BROTLI_INLINE int BrotliReadMoreInput(BrotliBitReader* const br) {
+  if (br->pos_ + 32 < br->end_pos_) {
+    return 1;
+  } else if (br->eos_) {
+    return (br->pos_ << 3) + br->bit_pos_ <= (br->end_pos_ << 3) + 64;
+  } else {
+    uint8_t* dst = br->buf_ + (br->end_pos_ & BROTLI_IBUF_MASK);
+    int bytes_read = BrotliRead(br->input_, dst, BROTLI_READ_SIZE);
+    if (bytes_read < 0) {
+      return 0;
+    }
+    if (bytes_read < BROTLI_READ_SIZE) {
+      br->eos_ = 1;
+      // Store 32 bytes of zero after the stream end.
+#if (defined(__x86_64__) || defined(_M_X64))
+      *(uint64_t*)(dst + bytes_read) = 0;
+      *(uint64_t*)(dst + bytes_read + 8) = 0;
+      *(uint64_t*)(dst + bytes_read + 16) = 0;
+      *(uint64_t*)(dst + bytes_read + 24) = 0;
+#else
+      memset(dst + bytes_read, 0, 32);
+#endif
+    }
+    if (dst == br->buf_) {
+      // Copy the head of the ringbuffer to the slack region.
+#if (defined(__x86_64__) || defined(_M_X64))
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 32, br->buf_);
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 24, br->buf_ + 8);
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 16, br->buf_ + 16);
+      UNALIGNED_COPY64(br->buf_ + BROTLI_IBUF_SIZE - 8, br->buf_ + 24);
+#else
+      memcpy(br->buf_ + (BROTLI_READ_SIZE << 1), br->buf_, 32);
+#endif
+    }
+    br->end_pos_ += bytes_read;
+    return 1;
+  }
+}
+
+// Advances the Read buffer by 5 bytes to make room for reading next 24 bits.
+static BROTLI_INLINE void BrotliFillBitWindow(BrotliBitReader* const br) {
+  if (br->bit_pos_ >= 40) {
+#if (defined(__x86_64__) || defined(_M_X64))
+    br->val_ >>= 40;
+    br->bit_pos_ -= 40;
+    // The expression below needs a little-endian arch to work correctly.
+    // This gives a large speedup for decoding speed.
+    br->val_ |= *(const uint64_t*)(
+        br->buf_ + (br->pos_ & BROTLI_IBUF_MASK)) << 24;
+    br->pos_ += 5;
+#else
+    ShiftBytes(br);
+#endif
+  }
+}
+
+// Reads the specified number of bits from Read Buffer.
+// Requires that n_bits is positive.
+static BROTLI_INLINE uint32_t BrotliReadBits(
+    BrotliBitReader* const br, int n_bits) {
+  BrotliFillBitWindow(br);
+  const uint32_t val = (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
+  br->bit_pos_ += n_bits;
+  return val;
+}
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
diff --git a/brotli/dec/context.h b/brotli/dec/context.h
index a2e21c0..212a445 100644
--- a/brotli/dec/context.h
+++ b/brotli/dec/context.h
@@ -12,34 +12,154 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-// Lookup tables to map the previous one to three bytes to a context id.
+// Lookup table to map the previous two bytes to a context id.
+//
+// There are four different context modeling modes defined here:
+//   CONTEXT_LSB6: context id is the least significant 6 bits of the last byte,
+//   CONTEXT_MSB6: context id is the most significant 6 bits of the last byte,
+//   CONTEXT_UTF8: second-order context model tuned for UTF8-encoded text,
+//   CONTEXT_SIGNED: second-order context model tuned for signed integers.
+//
+// The context id for the UTF8 context model is calculated as follows. If p1
+// and p2 are the previous two bytes, we calcualte the context as
+//
+//   context = kContextLookup[p1] | kContextLookup[p2 + 256].
+//
+// If the previous two bytes are ASCII characters (i.e. < 128), this will be
+// equivalent to
+//
+//   context = 4 * context1(p1) + context2(p2),
+//
+// where context1 is based on the previous byte in the following way:
+//
+//   0  : non-ASCII control
+//   1  : \t, \n, \r
+//   2  : space
+//   3  : other punctuation
+//   4  : " '
+//   5  : %
+//   6  : ( < [ {
+//   7  : ) > ] }
+//   8  : , ; :
+//   9  : .
+//   10 : =
+//   11 : number
+//   12 : upper-case vowel
+//   13 : upper-case consonant
+//   14 : lower-case vowel
+//   15 : lower-case consonant
+//
+// and context2 is based on the second last byte:
+//
+//   0 : control, space
+//   1 : punctuation
+//   2 : upper-case letter, number
+//   3 : lower-case letter
+//
+// If the last byte is ASCII, and the second last byte is not (in a valid UTF8
+// stream it will be a continuation byte, value between 128 and 191), the
+// context is the same as if the second last byte was an ASCII control or space.
+//
+// If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
+// be a continuation byte and the context id is 2 or 3 depending on the LSB of
+// the last byte and to a lesser extent on the second last byte if it is ASCII.
+//
+// If the last byte is a UTF8 continuation byte, the second last byte can be:
+//   - continuation byte: the next byte is probably ASCII or lead byte (assuming
+//     4-byte UTF8 characters are rare) and the context id is 0 or 1.
+//   - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1
+//   - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3
+//
+// The possible value combinations of the previous two bytes, the range of
+// context ids and the type of the next byte is summarized in the table below:
+//
+// |--------\-----------------------------------------------------------------|
+// |         \                         Last byte                              |
+// | Second   \---------------------------------------------------------------|
+// | last byte \    ASCII            |   cont. byte        |   lead byte      |
+// |            \   (0-127)          |   (128-191)         |   (192-)         |
+// |=============|===================|=====================|==================|
+// |  ASCII      | next: ASCII/lead  |  not valid          |  next: cont.     |
+// |  (0-127)    | context: 4 - 63   |                     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  cont. byte | next: ASCII/lead  |  next: ASCII/lead   |  next: cont.     |
+// |  (128-191)  | context: 4 - 63   |  context: 0 - 1     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: ASCII/lead   |  not valid       |
+// |  (192-207)  |                   |  context: 0 - 1     |                  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: cont.        |  not valid       |
+// |  (208-)     |                   |  context: 2 - 3     |                  |
+// |-------------|-------------------|---------------------|------------------|
+//
+// The context id for the signed context mode is calculated as:
+//
+//   context = (kContextLookup[512 + p1] << 3) | kContextLookup[512 + p2].
+//
+// For any context modeling modes, the context ids can be calculated by |-ing
+// together two lookups from one table using context model dependent offsets:
+//
+//   context = kContextLookup[offset1 + p1] | kContextLookup[offset2 + p2].
+//
+// where offset1 and offset2 are dependent on the context mode.
 
 #ifndef BROTLI_DEC_CONTEXT_H_
 #define BROTLI_DEC_CONTEXT_H_
 
-
 #include "./types.h"
 
-static const int kSigned2BitContextLookup[] = {
+enum ContextType {
+  CONTEXT_LSB6         = 0,
+  CONTEXT_MSB6         = 1,
+  CONTEXT_UTF8         = 2,
+  CONTEXT_SIGNED       = 3
+};
+
+// Common context lookup table for all context modes.
+static const uint8_t kContextLookup[1792] = {
+  // CONTEXT_UTF8, last byte.
+  //
+  // ASCII range.
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+   8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
+  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
+  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
+  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
+  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
+  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,
+  // UTF8 continuation byte range.
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  // UTF8 lead byte range.
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  // CONTEXT_UTF8 second last byte.
+  //
+  // ASCII range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
+  // UTF8 continuation byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // UTF8 lead byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
-};
-
-static const int kSigned3BitContextLookup[] = {
+  // CONTEXT_SIGNED, second last byte.
   0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -56,69 +176,85 @@ static const int kSigned3BitContextLookup[] = {
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
   6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  // CONTEXT_SIGNED, last byte, same as the above values shifted by 3 bits.
+   0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+  40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+  48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 56,
+  // CONTEXT_LSB6, last byte.
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  // CONTEXT_MSB6, last byte.
+   0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+   4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
+   8,  8,  8,  8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11,
+  12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15,
+  16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19,
+  20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
+  24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27,
+  28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31,
+  32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
+  36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39,
+  40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43,
+  44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47,
+  48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51,
+  52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55,
+  56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59,
+  60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63,
+  // CONTEXT_{M,L}SB6, second last byte,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
-static const int kSigned4BitContextLookup[] = {
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 15,
-};
-
-enum ContextType {
-  CONTEXT_FULL        = 0,
-  CONTEXT_MSB7        = 1,
-  CONTEXT_MSB6        = 2,
-  CONTEXT_MSB5        = 3,
-  CONTEXT_MSB4        = 4,
-  CONTEXT_MSB3        = 5,
-  CONTEXT_MSB2        = 6,
-  CONTEXT_MSB1        = 7,
-  CONTEXT_IS_ZERO     = 8,
-  CONTEXT_SIGNED_2BIT = 9,
-  CONTEXT_SIGNED_3BIT = 10,
-  CONTEXT_SIGNED_4BIT = 11,
-  CONTEXT_SIGNED_MIXED_3BYTE = 12
+static const int kContextLookupOffsets[8] = {
+  // CONTEXT_LSB6
+  1024, 1536,
+  // CONTEXT_MSB6
+  1280, 1536,
+  // CONTEXT_UTF8
+  0, 256,
+  // CONTEXT_SIGNED
+  768, 512,
 };
 
-static const int kContextSize[] = {
-  256, 128, 64, 32, 16, 8, 4, 2, 2, 4, 8, 16, 64,
-};
-
-static BROTLI_INLINE int NumContexts(int mode) {
-  return kContextSize[mode];
-}
-
-static BROTLI_INLINE uint8_t Context(uint8_t prev_byte, uint8_t prev_byte2,
-                                     uint8_t prev_byte3, int mode) {
-  switch (mode) {
-    case CONTEXT_IS_ZERO:
-      return prev_byte == 0 ? 0 : 1;
-    case CONTEXT_SIGNED_2BIT:
-      return kSigned2BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_3BIT:
-      return kSigned3BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_4BIT:
-      return kSigned4BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_MIXED_3BYTE:
-      return ((kSigned3BitContextLookup[prev_byte] << 3) +
-              (kSigned2BitContextLookup[prev_byte2] << 1) +
-              (prev_byte3 == 0 ? 0 : 1));
-    default:
-      return prev_byte >> mode;
-  }
-}
-
 #endif  // BROTLI_DEC_CONTEXT_H_
diff --git a/brotli/dec/decode.c b/brotli/dec/decode.c
index 0e61640..a8b4ba3 100644
--- a/brotli/dec/decode.c
+++ b/brotli/dec/decode.c
@@ -14,7 +14,6 @@
 
 #include <stdlib.h>
 #include <stdio.h>
-#include <string.h>
 #include "./bit_reader.h"
 #include "./context.h"
 #include "./decode.h"
@@ -28,10 +27,10 @@ extern "C" {
 
 #ifdef BROTLI_DECODE_DEBUG
 #define BROTLI_LOG_UINT(name)                                    \
-  printf("[%s] %s = %lu\n", __func__, #name, (unsigned long)name)
+  printf("[%s] %s = %lu\n", __func__, #name, (unsigned long)(name))
 #define BROTLI_LOG_ARRAY_INDEX(array_name, idx)                  \
   printf("[%s] %s[%lu] = %lu\n", __func__, #array_name, \
-         (unsigned long)idx, (unsigned long)array_name[idx])
+         (unsigned long)(idx), (unsigned long)array_name[idx])
 #else
 #define BROTLI_LOG_UINT(name)
 #define BROTLI_LOG_ARRAY_INDEX(array_name, idx)
@@ -46,10 +45,12 @@ static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
 static const int kNumLiteralCodes = 256;
 static const int kNumInsertAndCopyCodes = 704;
 static const int kNumBlockLengthCodes = 26;
+static const int kLiteralContextBits = 6;
+static const int kDistanceContextBits = 2;
 
 #define CODE_LENGTH_CODES 19
 static const uint8_t kCodeLengthCodeOrder[CODE_LENGTH_CODES] = {
-  17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  1, 2, 3, 4, 0, 17, 18, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
 };
 
 #define NUM_DISTANCE_SHORT_CODES 16
@@ -61,81 +62,92 @@ static const int kDistanceShortCodeValueOffset[NUM_DISTANCE_SHORT_CODES] = {
   0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
 };
 
-static int DecodeSize(BrotliBitReader* br, size_t* len) {
+static int64_t DecodeSize(BrotliBitReader* br) {
   int size_bytes = BrotliReadBits(br, 3);
   int i = 0;
-  *len = 0;
+  int64_t len = 0;
+  if (size_bytes == 0) {
+    return -1;
+  }
   for (; i < size_bytes; ++i) {
-    *len |= BrotliReadBits(br, 8) << (i * 8);
+    len |= BrotliReadBits(br, 8) << (i * 8);
   }
-  return !br->error_;
+  return len;
 }
 
-static int DecodeMetaBlockLength(int input_size_bits,
-                                 size_t remaining_length,
-                                 BrotliBitReader* br,
-                                 size_t* meta_block_length) {
-  if (BrotliReadBits(br, 1)) {
-    *meta_block_length = remaining_length;
-    return 1;
-  } else {
-    int shift = 0;
+static void DecodeMetaBlockLength(int input_size_bits,
+                                  size_t pos,
+                                  int64_t input_size,
+                                  BrotliBitReader* br,
+                                  size_t* meta_block_length,
+                                  int* input_end) {
+  *input_end = BrotliReadBits(br, 1);
+  if (input_size < 0) {
     *meta_block_length = 0;
-    while (input_size_bits > 0) {
-      *meta_block_length |= BrotliReadBits(br, 8) << shift;
-      input_size_bits -= 8;
-      shift += 8;
+    if (!*input_end) {
+      int size_nibbles = BrotliReadBits(br, 3);
+      int i;
+      for (i = 0; i < size_nibbles; ++i) {
+        *meta_block_length |= BrotliReadBits(br, 4) << (i * 4);
+      }
+      ++(*meta_block_length);
     }
-    if (input_size_bits > 0) {
-      *meta_block_length |= BrotliReadBits(br, input_size_bits) << shift;
+  } else {
+    if (*input_end) {
+      *meta_block_length = (size_t)input_size - pos;
+    } else {
+      int shift = 0;
+      *meta_block_length = 0;
+      while (input_size_bits > 0) {
+        *meta_block_length |= BrotliReadBits(br, 8) << shift;
+        input_size_bits -= 8;
+        shift += 8;
+      }
+      if (input_size_bits > 0) {
+        *meta_block_length |= BrotliReadBits(br, input_size_bits) << shift;
+      }
+      ++(*meta_block_length);
     }
-    ++(*meta_block_length);
-    return !br->error_;
   }
 }
 
 // Decodes the next Huffman code from bit-stream.
-// FillBitWindow(br) needs to be called at minimum every second call
-// to ReadSymbol, in order to pre-fetch enough bits.
 static BROTLI_INLINE int ReadSymbol(const HuffmanTree* tree,
                                     BrotliBitReader* br) {
-  if (tree->fixed_bit_length_ > 0) {
-    return BrotliReadBits(br, tree->fixed_bit_length_);
-  } else {
-    const HuffmanTreeNode* node = tree->root_;
-    uint32_t bits = BrotliPrefetchBits(br);
-    int bitpos = br->bit_pos_;
-    // Check if we find the bit combination from the Huffman lookup table.
-    const int lut_ix = bits & (HUFF_LUT - 1);
-    const int lut_bits = tree->lut_bits_[lut_ix];
-    if (lut_bits <= HUFF_LUT_BITS) {
-      BrotliSetBitPos(br, bitpos + lut_bits);
-      return tree->lut_symbol_[lut_ix];
-    }
-    node += tree->lut_jump_[lut_ix];
-    bitpos += HUFF_LUT_BITS;
-    bits >>= HUFF_LUT_BITS;
-
-    // Decode the value from a binary tree.
-    assert(node != NULL);
-    do {
-      node = HuffmanTreeNextNode(node, bits & 1);
-      bits >>= 1;
-      ++bitpos;
-    } while (HuffmanTreeNodeIsNotLeaf(node));
-    BrotliSetBitPos(br, bitpos);
-    return node->symbol_;
+  const HuffmanTreeNode* node = tree->root_;
+  BrotliFillBitWindow(br);
+  uint32_t bits = BrotliPrefetchBits(br);
+  int bitpos = br->bit_pos_;
+  // Check if we find the bit combination from the Huffman lookup table.
+  const int lut_ix = bits & (HUFF_LUT - 1);
+  const int lut_bits = tree->lut_bits_[lut_ix];
+  if (lut_bits <= HUFF_LUT_BITS) {
+    BrotliSetBitPos(br, bitpos + lut_bits);
+    return tree->lut_symbol_[lut_ix];
   }
+  node += tree->lut_jump_[lut_ix];
+  bitpos += HUFF_LUT_BITS;
+  bits >>= HUFF_LUT_BITS;
+
+  // Decode the value from a binary tree.
+  assert(node != NULL);
+  do {
+    node = HuffmanTreeNextNode(node, bits & 1);
+    bits >>= 1;
+    ++bitpos;
+  } while (HuffmanTreeNodeIsNotLeaf(node));
+  BrotliSetBitPos(br, bitpos);
+  return node->symbol_;
 }
 
-static void PrintIntVector(const int* v, int len) {
+static void PrintUcharVector(const uint8_t* v, int len) {
   while (len-- > 0) printf(" %d", *v++);
   printf("\n");
 }
 
 static int ReadHuffmanCodeLengths(
-    const int* code_length_code_lengths,
-    int num_symbols, int* code_lengths,
+    const uint8_t* code_length_code_lengths,
+    int num_symbols, uint8_t* code_lengths,
     BrotliBitReader* br) {
   int ok = 0;
   int symbol;
@@ -147,10 +159,14 @@ static int ReadHuffmanCodeLengths(
   if (!BrotliHuffmanTreeBuildImplicit(&tree, code_length_code_lengths,
                                       CODE_LENGTH_CODES)) {
     printf("[ReadHuffmanCodeLengths] Building code length tree failed: ");
-    PrintIntVector(code_length_code_lengths, CODE_LENGTH_CODES);
+    PrintUcharVector(code_length_code_lengths, CODE_LENGTH_CODES);
     return 0;
   }
 
+  if (!BrotliReadMoreInput(br)) {
+    printf("[ReadHuffmanCodeLengths] Unexpected end of input.\n");
+    return 0;
+  }
   decode_number_of_code_length_codes = BrotliReadBits(br, 1);
   BROTLI_LOG_UINT(decode_number_of_code_length_codes);
   if (decode_number_of_code_length_codes) {
@@ -171,7 +187,10 @@ static int ReadHuffmanCodeLengths(
   while (symbol < num_symbols) {
     int code_len;
     if (max_symbol-- == 0) break;
-    BrotliFillBitWindow(br);
+    if (!BrotliReadMoreInput(br)) {
+      printf("[ReadHuffmanCodeLengths] Unexpected end of input.\n");
+      goto End;
+    }
     code_len = ReadSymbol(&tree, br);
     BROTLI_LOG_UINT(symbol);
     BROTLI_LOG_UINT(code_len);
@@ -206,128 +225,101 @@ static int ReadHuffmanCodeLengths(
   return ok;
 }
 
-static const int64_t kUnitInterval = 1LL<<30;
-
-static int RepairHuffmanCodeLengths(int num_symbols, int* code_lengths) {
-  int i;
-  int64_t space = kUnitInterval;
-  int max_length = 0;
-  for(i = 0; i < num_symbols; i++)
-    if (code_lengths[i] != 0) {
-      if (code_lengths[i] > max_length)
-        max_length = code_lengths[i];
-      space -= kUnitInterval >> code_lengths[i];
-    }
-  // The code which contains one symbol of length one cannot be made optimal.
-  if (max_length == 1)
-    return 1;
-  if (space < 0) {
-    int count_longest = 0;
-    int new_length = max_length;
-    for(i = 0; i < num_symbols; i++) {
-      if (code_lengths[i] == max_length)
-        count_longest++;
-    }
-    // Substitute all longest codes with sufficiently longer ones, so that all
-    // code words fit into the unit interval. Leftover space will be
-    // redistributed later.
-    space += count_longest * (kUnitInterval >> max_length);
-    if (space < 0)
-      return 0;
-    while (space < count_longest * (kUnitInterval >> new_length))
-      new_length++;
-    space -= count_longest * (kUnitInterval >> new_length);
-    for(i = 0; i < num_symbols; i++) {
-      if (code_lengths[i] == max_length)
-        code_lengths[i] = new_length;
-    }
-  }
-
-  while (space > 0) {
-    // Redistribute leftover space in an approximation of a uniform fashion.
-    for(i = 0; i < num_symbols; i++) {
-      if (code_lengths[i] > 1 && space >= (kUnitInterval >> code_lengths[i])) {
-        space -= kUnitInterval >> code_lengths[i];
-        code_lengths[i]--;
-      }
-      if (space == 0)
-        break;
-    }
-  }
-  return 1;
-}
-
 static int ReadHuffmanCode(int alphabet_size,
                            HuffmanTree* tree,
                            BrotliBitReader* br) {
-  int ok = 0;
-  const int simple_code = BrotliReadBits(br, 1);
-  BROTLI_LOG_UINT(simple_code);
+  int ok = 1;
+  int simple_code;
+  uint8_t* code_lengths = NULL;
 
+  code_lengths =
+      (uint8_t*)BrotliSafeMalloc((uint64_t)alphabet_size,
+                                 sizeof(*code_lengths));
+  if (code_lengths == NULL) {
+    return 0;
+  }
+  if (!BrotliReadMoreInput(br)) {
+    printf("[ReadHuffmanCode] Unexpected end of input.\n");
+    return 0;
+  }
+  simple_code = BrotliReadBits(br, 1);
+  BROTLI_LOG_UINT(simple_code);
   if (simple_code) {  // Read symbols, codes & code lengths directly.
-    int symbols[2] = { 0 };
-    int codes[2];
-    int code_lengths[2];
-    const int num_symbols = BrotliReadBits(br, 1) + 1;
-    const int first_symbol_len_code = BrotliReadBits(br, 1);
-    // The first code is either 1 bit or 8 bit code.
-    symbols[0] = BrotliReadBits(br, (first_symbol_len_code == 0) ? 1 : 8);
-    codes[0] = 0;
-    code_lengths[0] = num_symbols - 1;
-    // The second code (if present), is always 8 bit long.
-    if (num_symbols == 2) {
-      symbols[1] = BrotliReadBits(br, 8);
-      codes[1] = 1;
-      code_lengths[1] = num_symbols - 1;
+    int i;
+    int max_bits_counter = alphabet_size - 1;
+    int max_bits = 0;
+    int symbols[4] = { 0 };
+    const int num_symbols = BrotliReadBits(br, 2) + 1;
+    while (max_bits_counter) {
+      max_bits_counter >>= 1;
+      ++max_bits;
     }
-    BROTLI_LOG_UINT(num_symbols);
-    BROTLI_LOG_UINT(first_symbol_len_code);
-    BROTLI_LOG_UINT(symbols[0]);
-    BROTLI_LOG_UINT(symbols[1]);
-    ok = BrotliHuffmanTreeBuildExplicit(tree, code_lengths, codes, symbols,
-                                        alphabet_size, num_symbols);
-    if (!ok) {
-      printf("[ReadHuffmanCode] HuffmanTreeBuildExplicit failed: ");
-      PrintIntVector(code_lengths, num_symbols);
+    memset(code_lengths, 0, alphabet_size);
+    for (i = 0; i < num_symbols; ++i) {
+      symbols[i] = BrotliReadBits(br, max_bits);
+      code_lengths[symbols[i]] = 2;
     }
+    code_lengths[symbols[0]] = 1;
+    switch (num_symbols) {
+      case 1:
+      case 3:
+        break;
+      case 2:
+        code_lengths[symbols[1]] = 1;
+        break;
+      case 4:
+        if (BrotliReadBits(br, 1)) {
+          code_lengths[symbols[2]] = 3;
+          code_lengths[symbols[3]] = 3;
+        } else {
+          code_lengths[symbols[0]] = 2;
+        }
+        break;
+    }
+    BROTLI_LOG_UINT(num_symbols);
   } else {  // Decode Huffman-coded code lengths.
-    int* code_lengths = NULL;
     int i;
-    int code_length_code_lengths[CODE_LENGTH_CODES] = { 0 };
+    uint8_t code_length_code_lengths[CODE_LENGTH_CODES] = { 0 };
     const int num_codes = BrotliReadBits(br, 4) + 4;
     BROTLI_LOG_UINT(num_codes);
     if (num_codes > CODE_LENGTH_CODES) {
       return 0;
     }
-
-    code_lengths =
-        (int*)BrotliSafeMalloc((uint64_t)alphabet_size, sizeof(*code_lengths));
-    if (code_lengths == NULL) {
-      return 0;
-    }
-
-    for (i = 0; i < num_codes; ++i) {
+    for (i = BrotliReadBits(br, 1) * 2; i < num_codes; ++i) {
       int code_len_idx = kCodeLengthCodeOrder[i];
-      code_length_code_lengths[code_len_idx] = BrotliReadBits(br, 3);
+      int v = BrotliReadBits(br, 2);
+      if (v == 3) {
+        v = BrotliReadBits(br, 1);
+        if (v == 0) {
+          v = 2;
+        } else {
+          v = BrotliReadBits(br, 1);
+          if (v == 0) {
+            v = 1;
+          } else {
+            v = 5;
+          }
+        }
+      } else if (v == 1) {
+        v = 3;
+      } else if (v == 2) {
+        v = 4;
+      }
+      code_length_code_lengths[code_len_idx] = v;
       BROTLI_LOG_ARRAY_INDEX(code_length_code_lengths, code_len_idx);
     }
     ok = ReadHuffmanCodeLengths(code_length_code_lengths, alphabet_size,
-                                code_lengths, br) &&
-         RepairHuffmanCodeLengths(alphabet_size, code_lengths);
-    if (ok) {
-      ok = BrotliHuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size);
-      if (!ok) {
-        printf("[ReadHuffmanCode] HuffmanTreeBuildImplicit failed: ");
-        PrintIntVector(code_lengths, alphabet_size);
-      }
-    }
-    free(code_lengths);
+                                code_lengths, br);
   }
-  ok = ok && !br->error_;
-  if (!ok) {
-    return 0;
+  if (ok) {
+    ok = BrotliHuffmanTreeBuildImplicit(tree, code_lengths, alphabet_size);
+    if (!ok) {
+      printf("[ReadHuffmanCode] HuffmanTreeBuildImplicit failed: ");
+      PrintUcharVector(code_lengths, alphabet_size);
+    }
   }
-  return 1;
+  free(code_lengths);
+  return ok;
 }
 
 static int ReadCopyDistance(const HuffmanTree* tree,
@@ -339,7 +331,6 @@ static int ReadCopyDistance(const HuffmanTree* tree,
   int nbits;
   int postfix;
   int offset;
-  BrotliFillBitWindow(br);
   code = ReadSymbol(tree, br);
   if (code < num_direct_codes) {
     return code;
@@ -357,7 +348,6 @@ static int ReadCopyDistance(const HuffmanTree* tree,
 static int ReadBlockLength(const HuffmanTree* tree, BrotliBitReader* br) {
   int code;
   int nbits;
-  BrotliFillBitWindow(br);
   code = ReadSymbol(tree, br);
   nbits = kBlockLengthPrefixCode[code].nbits;
   return kBlockLengthPrefixCode[code].offset + BrotliReadBits(br, nbits);
@@ -371,8 +361,9 @@ static void ReadInsertAndCopy(const HuffmanTree* tree,
   int code;
   int range_idx;
   int insert_code;
+  int insert_extra_bits;
   int copy_code;
-  BrotliFillBitWindow(br);
+  int copy_extra_bits;
   code = ReadSymbol(tree, br);
   range_idx = code >> 6;
   if (range_idx >= 2) {
@@ -383,27 +374,27 @@ static void ReadInsertAndCopy(const HuffmanTree* tree,
   }
   insert_code = (kInsertRangeLut[range_idx] << 3) + ((code >> 3) & 7);
   copy_code = (kCopyRangeLut[range_idx] << 3) + (code & 7);
-  *insert_len =
-      kInsertLengthPrefixCode[insert_code].offset +
-      BrotliReadBits(br, kInsertLengthPrefixCode[insert_code].nbits);
-  *copy_len =
-      kCopyLengthPrefixCode[copy_code].offset +
-      BrotliReadBits(br, kCopyLengthPrefixCode[copy_code].nbits);
+  *insert_len = kInsertLengthPrefixCode[insert_code].offset;
+  insert_extra_bits = kInsertLengthPrefixCode[insert_code].nbits;
+  if (insert_extra_bits > 0) {
+    *insert_len += BrotliReadBits(br, insert_extra_bits);
+  }
+  *copy_len = kCopyLengthPrefixCode[copy_code].offset;
+  copy_extra_bits = kCopyLengthPrefixCode[copy_code].nbits;
+  if (copy_extra_bits > 0) {
+    *copy_len += BrotliReadBits(br, copy_extra_bits);
+  }
 }
 
-static int TranslateShortCodes(int code, int* ringbuffer, size_t* index) {
+static int TranslateShortCodes(int code, int* ringbuffer, size_t index) {
   int val;
   if (code < NUM_DISTANCE_SHORT_CODES) {
-    int index_offset = kDistanceShortCodeIndexOffset[code];
-    int value_offset = kDistanceShortCodeValueOffset[code];
-    val = ringbuffer[(*index + index_offset) & 3] + value_offset;
+    index += kDistanceShortCodeIndexOffset[code];
+    index &= 3;
+    val = ringbuffer[index] + kDistanceShortCodeValueOffset[code];
   } else {
     val = code - NUM_DISTANCE_SHORT_CODES + 1;
   }
-  if (code > 0) {
-    ringbuffer[*index & 3] = val;
-    ++(*index);
-  }
   return val;
 }
 
@@ -453,41 +444,24 @@ static int HuffmanTreeGroupDecode(HuffmanTreeGroup* group,
                                   BrotliBitReader* br) {
   int i;
   for (i = 0; i < group->num_htrees; ++i) {
-    ReadHuffmanCode(group->alphabet_size, &group->htrees[i], br);
+    if (!ReadHuffmanCode(group->alphabet_size, &group->htrees[i], br)) {
+      return 0;
+    }
   }
   return 1;
 }
 
-static int DecodeContextMap(int num_block_types,
-                            int stream_type,
-                            int* context_mode,
-                            int* contexts_per_block,
+static int DecodeContextMap(int context_map_size,
                             int* num_htrees,
                             uint8_t** context_map,
                             BrotliBitReader* br) {
-  int context_map_size;
-  int use_context = BrotliReadBits(br, 1);
-  if (!use_context) {
-    *context_mode = 0;
-    *contexts_per_block = 1;
-    *context_map = NULL;
-    *num_htrees = num_block_types;
-    return 1;
+  int ok = 1;
+  if (!BrotliReadMoreInput(br)) {
+    printf("[DecodeContextMap] Unexpected end of input.\n");
+    return 0;
   }
-  switch (stream_type) {
-    case 0:
-      *context_mode = BrotliReadBits(br, 4);
-      *contexts_per_block = NumContexts(*context_mode);
-      break;
-    case 2:
-      *context_mode = 1;
-      *contexts_per_block = 4;
-      break;
-  }
-  context_map_size = *contexts_per_block * num_block_types;
   *num_htrees = BrotliReadBits(br, 8) + 1;
 
-  BROTLI_LOG_UINT(*context_mode);
   BROTLI_LOG_UINT(context_map_size);
   BROTLI_LOG_UINT(*num_htrees);
 
@@ -511,13 +485,19 @@ static int DecodeContextMap(int num_block_types,
     if (use_rle_for_zeros) {
       max_run_length_prefix = BrotliReadBits(br, 4) + 1;
     }
-    ReadHuffmanCode(*num_htrees + max_run_length_prefix,
-                    &tree_index_htree, br);
+    if (!ReadHuffmanCode(*num_htrees + max_run_length_prefix,
+                         &tree_index_htree, br)) {
+      return 0;
+    }
     if (use_rle_for_zeros) {
       int i;
       for (i = 0; i < context_map_size;) {
         int code;
-        BrotliFillBitWindow(br);
+        if (!BrotliReadMoreInput(br)) {
+          printf("[DecodeContextMap] Unexpected end of input.\n");
+          ok = 0;
+          goto End;
+        }
         code = ReadSymbol(&tree_index_htree, br);
         if (code == 0) {
           (*context_map)[i] = 0;
@@ -536,16 +516,21 @@ static int DecodeContextMap(int num_block_types,
     } else {
       int i;
       for (i = 0; i < context_map_size; ++i) {
-        BrotliFillBitWindow(br);
+        if (!BrotliReadMoreInput(br)) {
+          printf("[DecodeContextMap] Unexpected end of input.\n");
+          ok = 0;
+          goto End;
+        }
         (*context_map)[i] = ReadSymbol(&tree_index_htree, br);
       }
     }
+   End:
     BrotliHuffmanTreeRelease(&tree_index_htree);
   }
   if (BrotliReadBits(br, 1)) {
     InverseMoveToFrontTransform(*context_map, context_map_size);
   }
-  return 1;
+  return ok;
 }
 
 static BROTLI_INLINE void DecodeBlockType(const HuffmanTree* trees,
@@ -570,39 +555,116 @@ static BROTLI_INLINE void DecodeBlockType(const HuffmanTree* trees,
   ++(*index);
 }
 
+// Copy len bytes from src to dst. It can write up to ten extra bytes
+// after the end of the copy.
+//
+// The main part of this loop is a simple copy of eight bytes at a time until
+// we've copied (at least) the requested amount of bytes.  However, if dst and
+// src are less than eight bytes apart (indicating a repeating pattern of
+// length < 8), we first need to expand the pattern in order to get the correct
+// results. For instance, if the buffer looks like this, with the eight-byte
+// <src> and <dst> patterns marked as intervals:
+//
+//    abxxxxxxxxxxxx
+//    [------]           src
+//      [------]         dst
+//
+// a single eight-byte copy from <src> to <dst> will repeat the pattern once,
+// after which we can move <dst> two bytes without moving <src>:
+//
+//    ababxxxxxxxxxx
+//    [------]           src
+//        [------]       dst
+//
+// and repeat the exercise until the two no longer overlap.
+//
+// This allows us to do very well in the special case of one single byte
+// repeated many times, without taking a big hit for more general cases.
+//
+// The worst case of extra writing past the end of the match occurs when
+// dst - src == 1 and len == 1; the last copy will read from byte positions
+// [0..7] and write to [4..11], whereas it was only supposed to write to
+// position 1. Thus, ten excess bytes.
+static BROTLI_INLINE void IncrementalCopyFastPath(
+    uint8_t* dst, const uint8_t* src, int len) {
+  if (src < dst) {
+    while (dst - src < 8) {
+      UNALIGNED_COPY64(dst, src);
+      len -= dst - src;
+      dst += dst - src;
+    }
+  }
+  while (len > 0) {
+    UNALIGNED_COPY64(dst, src);
+    src += 8;
+    dst += 8;
+    len -= 8;
+  }
+}
+
 int BrotliDecompressedSize(size_t encoded_size,
                            const uint8_t* encoded_buffer,
                            size_t* decoded_size) {
+  BrotliMemInput memin;
+  BrotliInput input = BrotliInitMemInput(encoded_buffer, encoded_size, &memin);
   BrotliBitReader br;
-  BrotliInitBitReader(&br, encoded_buffer, encoded_size);
-  return DecodeSize(&br, decoded_size);
+  if (!BrotliInitBitReader(&br, input)) {
+    return 0;
+  }
+  int64_t size = DecodeSize(&br);
+  if (size < 0) {
+    return 0;
+  }
+  *decoded_size = (size_t)size;
+  return 1;
 }
 
 int BrotliDecompressBuffer(size_t encoded_size,
                            const uint8_t* encoded_buffer,
                            size_t* decoded_size,
                            uint8_t* decoded_buffer) {
+  BrotliMemInput memin;
+  BrotliInput in = BrotliInitMemInput(encoded_buffer, encoded_size, &memin);
+  BrotliMemOutput mout;
+  BrotliOutput out = BrotliInitMemOutput(decoded_buffer, *decoded_size, &mout);
+  int success = BrotliDecompress(in, out);
+  *decoded_size = mout.pos;
+  return success;
+}
+
+int BrotliDecompress(BrotliInput input, BrotliOutput output) {
   int ok = 1;
   int i;
   size_t pos = 0;
-  uint8_t* data = decoded_buffer;
-  int input_size_bits;
+  int64_t decoded_size;
+  int input_size_bits = 0;
+  int input_end = 0;
+  int window_bits = 0;
+  size_t ringbuffer_size;
+  size_t ringbuffer_mask;
+  uint8_t* ringbuffer;
+  uint8_t* ringbuffer_end;
   // This ring buffer holds a few past copy distances that will be used by
   // some special distance codes.
   int dist_rb[4] = { 4, 11, 15, 16 };
   size_t dist_rb_idx = 0;
+  // The previous 2 bytes used for context.
+  uint8_t prev_byte1 = 0;
+  uint8_t prev_byte2 = 0;
   HuffmanTreeGroup hgroup[3];
   BrotliBitReader br;
-  BrotliInitBitReader(&br, encoded_buffer, encoded_size);
 
-  ok = DecodeSize(&br, decoded_size);
-  if (!ok) return 0;
+  if (!BrotliInitBitReader(&br, input)) {
+    return 0;
+  }
 
-  if (*decoded_size == 0) {
+  decoded_size = DecodeSize(&br);
+  if (decoded_size == 0) {
     return 1;
   }
-  {
-    size_t n = *decoded_size;
+
+  if (decoded_size > 0) {
+    size_t n = (size_t)decoded_size;
     input_size_bits = (n == (n &~ (n - 1))) ? -1 : 0;
     while (n) {
       ++input_size_bits;
@@ -610,12 +672,24 @@ int BrotliDecompressBuffer(size_t encoded_size,
     }
   }
 
-  BROTLI_LOG_UINT(*decoded_size);
+  // Decode window size.
+  if ((decoded_size < 0 || input_size_bits > 16) && BrotliReadBits(&br, 1)) {
+    window_bits = 17 + BrotliReadBits(&br, 3);
+  } else {
+    window_bits = 16;
+  }
+
+  ringbuffer_size = 1 << window_bits;
+  ringbuffer_mask = ringbuffer_size - 1;
+  ringbuffer = (uint8_t*)malloc(ringbuffer_size + 16);
+  ringbuffer_end = ringbuffer + ringbuffer_size;
+
+  BROTLI_LOG_UINT(decoded_size);
   BROTLI_LOG_UINT(input_size_bits);
 
-  while (pos < *decoded_size && ok) {
+  while (!input_end && ok) {
     size_t meta_block_len = 0;
-    size_t meta_block_end;
+    size_t meta_block_end_pos;
     size_t block_length[3] = { 0 };
     int block_type[3] = { 0 };
     int num_block_types[3] = { 0 };
@@ -628,12 +702,9 @@ int BrotliDecompressBuffer(size_t encoded_size,
     uint32_t distance_postfix_mask;
     int num_distance_codes;
     uint8_t* context_map = NULL;
-    int context_mode;
-    int contexts_per_block;
+    uint8_t* context_modes = NULL;
     int num_literal_htrees;
     uint8_t* dist_context_map = NULL;
-    int dist_context_mode;
-    int dist_contexts_per_block;
     int num_dist_htrees;
     int context_offset = 0;
     uint8_t* context_map_slice = NULL;
@@ -641,23 +712,41 @@ int BrotliDecompressBuffer(size_t encoded_size,
     int dist_context_offset = 0;
     uint8_t* dist_context_map_slice = NULL;
     uint8_t dist_htree_index = 0;
+    int context_lookup_offset1 = 0;
+    int context_lookup_offset2 = 0;
+    uint8_t context_mode;
 
-    BROTLI_LOG_UINT(pos);
-    if (!DecodeMetaBlockLength(input_size_bits, *decoded_size - pos,
-                               &br, &meta_block_len)) {
-      printf("Could not decode meta-block length.\n");
+    for (i = 0; i < 3; ++i) {
+      hgroup[i].num_htrees = 0;
+      hgroup[i].htrees = NULL;
+      block_type_trees[i].root_ = NULL;
+      block_len_trees[i].root_ = NULL;
+    }
+
+    if (!BrotliReadMoreInput(&br)) {
+      printf("[BrotliDecompress] Unexpected end of input.\n");
       ok = 0;
       goto End;
     }
+    BROTLI_LOG_UINT(pos);
+    DecodeMetaBlockLength(input_size_bits, pos, decoded_size,
+                          &br, &meta_block_len, &input_end);
     BROTLI_LOG_UINT(meta_block_len);
-    meta_block_end = pos + meta_block_len;
+    if (meta_block_len == 0) {
+      goto End;
+    }
+    meta_block_end_pos = pos + meta_block_len;
     for (i = 0; i < 3; ++i) {
       block_type_trees[i].root_ = NULL;
       block_len_trees[i].root_ = NULL;
       if (BrotliReadBits(&br, 1)) {
         num_block_types[i] = BrotliReadBits(&br, 8) + 1;
-        ReadHuffmanCode(num_block_types[i] + 2, &block_type_trees[i], &br);
-        ReadHuffmanCode(kNumBlockLengthCodes, &block_len_trees[i], &br);
+        if (!ReadHuffmanCode(
+                num_block_types[i] + 2, &block_type_trees[i], &br) ||
+            !ReadHuffmanCode(kNumBlockLengthCodes, &block_len_trees[i], &br)) {
+          ok = 0;
+          goto End;
+        }
         block_length[i] = ReadBlockLength(&block_len_trees[i], &br);
         block_type_rb_index[i] = 1;
       } else {
@@ -673,21 +762,32 @@ int BrotliDecompressBuffer(size_t encoded_size,
     BROTLI_LOG_UINT(block_length[1]);
     BROTLI_LOG_UINT(block_length[2]);
 
+    if (!BrotliReadMoreInput(&br)) {
+      printf("[BrotliDecompress] Unexpected end of input.\n");
+      ok = 0;
+      goto End;
+    }
     distance_postfix_bits = BrotliReadBits(&br, 2);
     num_direct_distance_codes = NUM_DISTANCE_SHORT_CODES +
         (BrotliReadBits(&br, 4) << distance_postfix_bits);
     distance_postfix_mask = (1 << distance_postfix_bits) - 1;
     num_distance_codes = (num_direct_distance_codes +
                           (48 << distance_postfix_bits));
+    context_modes = (uint8_t*)malloc(num_block_types[0]);
+    for (i = 0; i < num_block_types[0]; ++i) {
+      context_modes[i] = BrotliReadBits(&br, 2) << 1;
+      BROTLI_LOG_ARRAY_INDEX(context_modes, i);
+    }
     BROTLI_LOG_UINT(num_direct_distance_codes);
     BROTLI_LOG_UINT(distance_postfix_bits);
 
-    DecodeContextMap(num_block_types[0], 0, &context_mode, &contexts_per_block,
-                     &num_literal_htrees, &context_map, &br);
-
-    DecodeContextMap(num_block_types[2], 2, &dist_context_mode,
-                     &dist_contexts_per_block,
-                     &num_dist_htrees, &dist_context_map, &br);
+    if (!DecodeContextMap(num_block_types[0] << kLiteralContextBits,
+                          &num_literal_htrees, &context_map, &br) ||
+        !DecodeContextMap(num_block_types[2] << kDistanceContextBits,
+                          &num_dist_htrees, &dist_context_map, &br)) {
+      ok = 0;
+      goto End;
+    }
 
     HuffmanTreeGroupInit(&hgroup[0], kNumLiteralCodes, num_literal_htrees);
     HuffmanTreeGroupInit(&hgroup[1], kNumInsertAndCopyCodes,
@@ -695,18 +795,32 @@ int BrotliDecompressBuffer(size_t encoded_size,
     HuffmanTreeGroupInit(&hgroup[2], num_distance_codes, num_dist_htrees);
 
     for (i = 0; i < 3; ++i) {
-      HuffmanTreeGroupDecode(&hgroup[i], &br);
+      if (!HuffmanTreeGroupDecode(&hgroup[i], &br)) {
+        ok = 0;
+        goto End;
+      }
     }
 
     context_map_slice = context_map;
     dist_context_map_slice = dist_context_map;
+    context_mode = context_modes[block_type[0]];
+    context_lookup_offset1 = kContextLookupOffsets[context_mode];
+    context_lookup_offset2 = kContextLookupOffsets[context_mode + 1];
 
-    while (pos < meta_block_end) {
+    while (pos < meta_block_end_pos) {
       int insert_length;
       int copy_length;
       int distance_code;
       int distance;
+      uint8_t context;
       int j;
+      const uint8_t* copy_src;
+      uint8_t* copy_dst;
+      if (!BrotliReadMoreInput(&br)) {
+        printf("[BrotliDecompress] Unexpected end of input.\n");
+        ok = 0;
+        goto End;
+      }
       if (block_length[1] == 0) {
         DecodeBlockType(block_type_trees, 1, block_type, block_type_rb,
                         block_type_rb_index, &br);
@@ -719,87 +833,120 @@ int BrotliDecompressBuffer(size_t encoded_size,
       BROTLI_LOG_UINT(copy_length);
       BROTLI_LOG_UINT(distance_code);
       for (j = 0; j < insert_length; ++j) {
+        if (!BrotliReadMoreInput(&br)) {
+          printf("[BrotliDecompress] Unexpected end of input.\n");
+          ok = 0;
+          goto End;
+        }
         if (block_length[0] == 0) {
           DecodeBlockType(block_type_trees, 0, block_type, block_type_rb,
                           block_type_rb_index, &br);
           block_length[0] = ReadBlockLength(&block_len_trees[0], &br);
-          literal_htree_index = block_type[0];
-          context_offset = block_type[0] * contexts_per_block;
+          context_offset = block_type[0] << kLiteralContextBits;
           context_map_slice = context_map + context_offset;
+          context_mode = context_modes[block_type[0]];
+          context_lookup_offset1 = kContextLookupOffsets[context_mode];
+          context_lookup_offset2 = kContextLookupOffsets[context_mode + 1];
         }
+        context = (kContextLookup[context_lookup_offset1 + prev_byte1] |
+                   kContextLookup[context_lookup_offset2 + prev_byte2]);
+        BROTLI_LOG_UINT(context);
+        literal_htree_index = context_map_slice[context];
         --block_length[0];
-        BrotliFillBitWindow(&br);
-        // Figure out htree
-        if (contexts_per_block > 1) {
-          uint8_t prev_byte = pos > 0 ? data[pos - 1] : 0;
-          uint8_t prev_byte2 = pos > 1 ? data[pos - 2] : 0;
-          uint8_t prev_byte3 = pos > 2 ? data[pos - 3] : 0;
-          uint8_t context = Context(prev_byte, prev_byte2, prev_byte3,
-                                    context_mode);
-          BROTLI_LOG_UINT(context);
-          literal_htree_index = context_map_slice[context];
-        }
-        data[pos] = ReadSymbol(&hgroup[0].htrees[literal_htree_index], &br);
+        prev_byte2 = prev_byte1;
+        prev_byte1 = ReadSymbol(&hgroup[0].htrees[literal_htree_index], &br);
+        ringbuffer[pos & ringbuffer_mask] = prev_byte1;
         BROTLI_LOG_UINT(literal_htree_index);
-        BROTLI_LOG_ARRAY_INDEX(data, pos);
+        BROTLI_LOG_ARRAY_INDEX(ringbuffer, pos & ringbuffer_mask);
+        if ((pos & ringbuffer_mask) == ringbuffer_mask) {
+          if (BrotliWrite(output, ringbuffer, ringbuffer_size) < 0) {
+            ok = 0;
+            goto End;
+          }
+        }
         ++pos;
       }
-      if (br.error_) {
-        printf("Read error after decoding literal sequence.\n");
-        ok = 0;
-        goto End;
-      }
-
-      if (pos == meta_block_end) break;
+      if (pos == meta_block_end_pos) break;
 
       if (distance_code < 0) {
+        if (!BrotliReadMoreInput(&br)) {
+          printf("[BrotliDecompress] Unexpected end of input.\n");
+          ok = 0;
+          goto End;
+        }
         if (block_length[2] == 0) {
           DecodeBlockType(block_type_trees, 2, block_type, block_type_rb,
                           block_type_rb_index, &br);
           block_length[2] = ReadBlockLength(&block_len_trees[2], &br);
           dist_htree_index = block_type[2];
-          dist_context_offset = block_type[2] * dist_contexts_per_block;
+          dist_context_offset = block_type[2] << kDistanceContextBits;
           dist_context_map_slice = dist_context_map + dist_context_offset;
         }
         --block_length[2];
-        if (dist_contexts_per_block > 1) {
-          uint8_t context = copy_length > 4 ? 3 : copy_length - 2;
-          dist_htree_index = dist_context_map_slice[context];
-        }
+        uint8_t context = copy_length > 4 ? 3 : copy_length - 2;
+        dist_htree_index = dist_context_map_slice[context];
         distance_code = ReadCopyDistance(&hgroup[2].htrees[dist_htree_index],
                                          num_direct_distance_codes,
                                          distance_postfix_bits,
                                          distance_postfix_mask,
                                          &br);
-        if (br.error_) {
-          printf("Could not read copy distance.\n");
-          ok = 0;
-          goto End;
-        }
       }
 
       // Convert the distance code to the actual distance by possibly looking
       // up past distnaces from the ringbuffer.
-      distance = TranslateShortCodes(distance_code, dist_rb, &dist_rb_idx);
+      distance = TranslateShortCodes(distance_code, dist_rb, dist_rb_idx);
+      if (distance_code > 0) {
+        dist_rb[dist_rb_idx & 3] = distance;
+        ++dist_rb_idx;
+      }
+
       BROTLI_LOG_UINT(distance);
 
-      // Do the actual copy if it is valid.
-      if (distance > 0 && pos >= (size_t)distance &&
-          pos + copy_length <= *decoded_size) {
-        int j;
-        for (j = 0; j < copy_length; ++j) {
-          data[pos + j] = data[pos + j - distance];
-        }
-        pos += copy_length;
-      } else {
-        printf("Invalid backward reference. pos: %lu distance: %d "
-               "len: %d end: %lu\n", (unsigned long)pos, distance, copy_length,
-               (unsigned long)*decoded_size);
+      if (pos < (size_t)distance || pos + copy_length > meta_block_end_pos) {
+        printf("Invalid backward reference. pos: %ld distance: %d "
+               "len: %d end: %lu\n", pos, distance, copy_length,
+               (unsigned long)meta_block_end_pos);
         ok = 0;
         goto End;
       }
+
+      copy_src = &ringbuffer[(pos - distance) & ringbuffer_mask];
+      copy_dst = &ringbuffer[pos & ringbuffer_mask];
+
+#if (defined(__x86_64__) || defined(_M_X64))
+      if (copy_src + copy_length <= ringbuffer_end &&
+          copy_dst + copy_length < ringbuffer_end) {
+        if (copy_length <= 16 && distance >= 8) {
+          UNALIGNED_COPY64(copy_dst, copy_src);
+          UNALIGNED_COPY64(copy_dst + 8, copy_src + 8);
+        } else {
+          IncrementalCopyFastPath(copy_dst, copy_src, copy_length);
+        }
+        pos += copy_length;
+        copy_length = 0;
+      }
+#endif
+
+      for (j = 0; j < copy_length; ++j) {
+        ringbuffer[pos & ringbuffer_mask] =
+            ringbuffer[(pos - distance) & ringbuffer_mask];
+        if ((pos & ringbuffer_mask) == ringbuffer_mask) {
+          if (BrotliWrite(output, ringbuffer, ringbuffer_size) < 0) {
+            ok = 0;
+            goto End;
+          }
+        }
+        ++pos;
+      }
+
+      // When we get here, we must have inserted at least one literal and made
+      // a copy of at least length two, therefore accessing the last 2 bytes is
+      // valid.
+      prev_byte1 = ringbuffer[(pos - 1) & ringbuffer_mask];
+      prev_byte2 = ringbuffer[(pos - 2) & ringbuffer_mask];
     }
  End:
+    free(context_modes);
     free(context_map);
     free(dist_context_map);
     for (i = 0; i < 3; ++i) {
@@ -809,6 +956,10 @@ int BrotliDecompressBuffer(size_t encoded_size,
     }
   }
 
+  if (BrotliWrite(output, ringbuffer, pos & ringbuffer_mask) < 0) {
+    ok = 0;
+  }
+  free(ringbuffer);
   return ok;
 }
 
diff --git a/brotli/dec/decode.h b/brotli/dec/decode.h
index 6d63e3a..760ec79 100644
--- a/brotli/dec/decode.h
+++ b/brotli/dec/decode.h
@@ -17,6 +17,7 @@
 #ifndef BROTLI_DEC_DECODE_H_
 #define BROTLI_DEC_DECODE_H_
 
+#include "./streams.h"
 #include "./types.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
@@ -39,6 +40,10 @@ int BrotliDecompressBuffer(size_t encoded_size,
                            size_t* decoded_size,
                            uint8_t* decoded_buffer);
 
+// Same as above, but uses the specified input and output callbacks instead of
+// reading from and writing to pre-allocated memory buffers.
+int BrotliDecompress(BrotliInput input, BrotliOutput output);
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/brotli/dec/huffman.c b/brotli/dec/huffman.c
index 1e11aec..b24a760 100644
--- a/brotli/dec/huffman.c
+++ b/brotli/dec/huffman.c
@@ -24,10 +24,6 @@
 extern "C" {
 #endif
 
-// Uncomment the following to use look-up table for ReverseBits()
-// (might be faster on some platform)
-// #define USE_LUT_REVERSE_BITS
-
 #define NON_EXISTENT_SYMBOL (-1)
 #define MAX_ALLOWED_CODE_LENGTH      15
 
@@ -55,7 +51,6 @@ static void AssignChildren(HuffmanTree* const tree,
 
 static int TreeInit(HuffmanTree* const tree, int num_leaves) {
   assert(tree != NULL);
-  tree->fixed_bit_length_ = 0;
   if (num_leaves == 0) return 0;
   // We allocate maximum possible nodes in the tree at once.
   // Note that a Huffman tree is a full binary tree; and in a full binary tree
@@ -84,7 +79,7 @@ void BrotliHuffmanTreeRelease(HuffmanTree* const tree) {
 // Utility: converts Huffman code lengths to corresponding Huffman codes.
 // 'huff_codes' should be pre-allocated.
 // Returns false in case of error (memory allocation, invalid codes).
-static int HuffmanCodeLengthsToCodes(const int* const code_lengths,
+static int HuffmanCodeLengthsToCodes(const uint8_t* const code_lengths,
                                      int code_lengths_size,
                                      int* const huff_codes) {
   int symbol;
@@ -133,35 +128,21 @@ static int HuffmanCodeLengthsToCodes(const int* const code_lengths,
   return 1;
 }
 
-#ifndef USE_LUT_REVERSE_BITS
-
-static int ReverseBitsShort(int bits, int num_bits) {
-  int retval = 0;
-  int i;
-  assert(num_bits <= 8);   // Not a hard requirement, just for coherency.
-  for (i = 0; i < num_bits; ++i) {
-    retval <<= 1;
-    retval |= bits & 1;
-    bits >>= 1;
-  }
-  return retval;
-}
-
-#else
-
-static const uint8_t kReversedBits[16] = {  // Pre-reversed 4-bit values.
-  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
-  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
+static const uint8_t kReverse7[128] = {
+  0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
+  4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
+  2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
+  6, 70, 38, 102, 22, 86, 54, 118, 14, 78, 46, 110, 30, 94, 62, 126,
+  1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121,
+  5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125,
+  3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123,
+  7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127
 };
 
 static int ReverseBitsShort(int bits, int num_bits) {
-  const uint8_t v = (kReversedBits[bits & 0xf] << 4) | kReversedBits[bits >> 4];
-  assert(num_bits <= 8);
-  return v >> (8 - num_bits);
+  return kReverse7[bits] >> (7 - num_bits);
 }
 
-#endif
-
 static int TreeAddSymbol(HuffmanTree* const tree,
                          int symbol, int code, int code_length) {
   int step = HUFF_LUT_BITS;
@@ -170,13 +151,14 @@ static int TreeAddSymbol(HuffmanTree* const tree,
   const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
   assert(symbol == (int16_t)symbol);
   if (code_length <= HUFF_LUT_BITS) {
-    int i;
+    int i = 1 << (HUFF_LUT_BITS - code_length);
     base_code = ReverseBitsShort(code, code_length);
-    for (i = 0; i < (1 << (HUFF_LUT_BITS - code_length)); ++i) {
+    do {
+      --i;
       const int idx = base_code | (i << code_length);
       tree->lut_symbol_[idx] = (int16_t)symbol;
       tree->lut_bits_[idx] = code_length;
-    }
+    } while (i > 0);
   } else {
     base_code = ReverseBitsShort((code >> (code_length - HUFF_LUT_BITS)),
                                  HUFF_LUT_BITS);
@@ -206,7 +188,7 @@ static int TreeAddSymbol(HuffmanTree* const tree,
 }
 
 int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
+                                   const uint8_t* const code_lengths,
                                    int code_lengths_size) {
   int symbol;
   int num_symbols = 0;
@@ -264,41 +246,6 @@ int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
   }
 }
 
-int BrotliHuffmanTreeBuildExplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
-                                   const int* const codes,
-                                   const int* const symbols,
-                                   int max_symbol,
-                                   int num_symbols) {
-  int ok = 0;
-  int i;
-
-  assert(tree != NULL);
-  assert(code_lengths != NULL);
-  assert(codes != NULL);
-  assert(symbols != NULL);
-
-  // Initialize the tree. Will fail if num_symbols = 0.
-  if (!TreeInit(tree, num_symbols)) return 0;
-
-  // Add symbols one-by-one.
-  for (i = 0; i < num_symbols; ++i) {
-    if (codes[i] != NON_EXISTENT_SYMBOL) {
-      if (symbols[i] < 0 || symbols[i] >= max_symbol) {
-        goto End;
-      }
-      if (!TreeAddSymbol(tree, symbols[i], codes[i], code_lengths[i])) {
-        goto End;
-      }
-    }
-  }
-  ok = 1;
- End:
-  ok = ok && IsFull(tree);
-  if (!ok) BrotliHuffmanTreeRelease(tree);
-  return ok;
-}
-
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/brotli/dec/huffman.h b/brotli/dec/huffman.h
index be4bd46..f1a671d 100644
--- a/brotli/dec/huffman.h
+++ b/brotli/dec/huffman.h
@@ -43,7 +43,6 @@ struct HuffmanTree {
   HuffmanTreeNode* root_;   // all the nodes, starting at root.
   int max_nodes_;           // max number of nodes
   int num_nodes_;           // number of currently occupied nodes
-  int fixed_bit_length_;     // If non-zero, uses fixed length coding
 };
 
 // Returns true if the given node is not a leaf of the Huffman tree.
@@ -65,19 +64,9 @@ void BrotliHuffmanTreeRelease(HuffmanTree* const tree);
 // Builds Huffman tree assuming code lengths are implicitly in symbol order.
 // Returns false in case of error (invalid tree or memory error).
 int BrotliHuffmanTreeBuildImplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
+                                   const uint8_t* const code_lengths,
                                    int code_lengths_size);
 
-// Build a Huffman tree with explicitly given lists of code lengths, codes
-// and symbols. Verifies that all symbols added are smaller than max_symbol.
-// Returns false in case of an invalid symbol, invalid tree or memory error.
-int BrotliHuffmanTreeBuildExplicit(HuffmanTree* const tree,
-                                   const int* const code_lengths,
-                                   const int* const codes,
-                                   const int* const symbols,
-                                   int max_symbol,
-                                   int num_symbols);
-
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/brotli/dec/streams.c b/brotli/dec/streams.c
new file mode 100644
index 0000000..ac1a55d
--- /dev/null
+++ b/brotli/dec/streams.c
@@ -0,0 +1,106 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Functions for streaming input and output.
+
+#include <string.h>
+#include <unistd.h>
+#include "./streams.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+int BrotliMemInputFunction(void* data, uint8_t* buf, size_t count) {
+  BrotliMemInput* input = (BrotliMemInput*)data;
+  if (input->pos > input->length) {
+    return -1;
+  }
+  if (input->pos + count > input->length) {
+    count = input->length - input->pos;
+  }
+  memcpy(buf, input->buffer + input->pos, count);
+  input->pos += count;
+  return count;
+}
+
+BrotliInput BrotliInitMemInput(const uint8_t* buffer, size_t length,
+                               BrotliMemInput* mem_input) {
+  mem_input->buffer = buffer;
+  mem_input->length = length;
+  mem_input->pos = 0;
+  BrotliInput input;
+  input.cb_ = &BrotliMemInputFunction;
+  input.data_ = mem_input;
+  return input;
+}
+
+int BrotliMemOutputFunction(void* data, const uint8_t* buf, size_t count) {
+  BrotliMemOutput* output = (BrotliMemOutput*)data;
+  if (output->pos + count > output->length) {
+    return -1;
+  }
+  memcpy(output->buffer + output->pos, buf, count);
+  output->pos += count;
+  return count;
+}
+
+BrotliOutput BrotliInitMemOutput(uint8_t* buffer, size_t length,
+                                 BrotliMemOutput* mem_output) {
+  mem_output->buffer = buffer;
+  mem_output->length = length;
+  mem_output->pos = 0;
+  BrotliOutput output;
+  output.cb_ = &BrotliMemOutputFunction;
+  output.data_ = mem_output;
+  return output;
+}
+
+int BrotliStdinInputFunction(void* data, uint8_t* buf, size_t count) {
+  return read(STDIN_FILENO, buf, count);
+}
+
+BrotliInput BrotliStdinInput() {
+  BrotliInput in;
+  in.cb_ = BrotliStdinInputFunction;
+  in.data_ = NULL;
+  return in;
+}
+
+int BrotliStdoutOutputFunction(void* data, const uint8_t* buf, size_t count) {
+  return write(STDOUT_FILENO, buf, count);
+}
+
+BrotliOutput BrotliStdoutOutput() {
+  BrotliOutput out;
+  out.cb_ = BrotliStdoutOutputFunction;
+  out.data_ = NULL;
+  return out;
+}
+
+int BrotliFileOutputFunction(void* data, const uint8_t* buf, size_t count) {
+  return fwrite(buf, 1, count, (FILE*)data);
+}
+
+BrotliOutput BrotliFileOutput(FILE* f) {
+  BrotliOutput out;
+  out.cb_ = BrotliFileOutputFunction;
+  out.data_ = f;
+  return out;
+}
+
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
diff --git a/brotli/dec/streams.h b/brotli/dec/streams.h
new file mode 100644
index 0000000..b055234
--- /dev/null
+++ b/brotli/dec/streams.h
@@ -0,0 +1,102 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Functions for streaming input and output.
+
+#ifndef BROTLI_DEC_STREAMS_H_
+#define BROTLI_DEC_STREAMS_H_
+
+#include <stdio.h>
+#include "./types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// Function pointer type used to read len bytes into buf. Returns the
+// number of bytes read or -1 on error.
+typedef int (*BrotliInputFunction)(void* data, uint8_t* buf, size_t len);
+
+// Input callback function with associated data.
+typedef struct {
+  BrotliInputFunction cb_;
+  void* data_;
+} BrotliInput;
+
+// Reads len bytes into buf, using the in callback.
+static BROTLI_INLINE int BrotliRead(BrotliInput in, uint8_t* buf, size_t len) {
+  return in.cb_(in.data_, buf, len);
+}
+
+// Function pointer type used to write len bytes into buf. Returns the
+// number of bytes written or -1 on error.
+typedef int (*BrotliOutputFunction)(void* data, const uint8_t* buf, size_t len);
+
+// Output callback function with associated data.
+typedef struct {
+  BrotliOutputFunction cb_;
+  void* data_;
+} BrotliOutput;
+
+// Writes len bytes into buf, using the out callback.
+static BROTLI_INLINE int BrotliWrite(BrotliOutput out,
+                                     const uint8_t* buf, size_t len) {
+  return out.cb_(out.data_, buf, len);
+}
+
+// Memory region with position.
+typedef struct {
+  const uint8_t* buffer;
+  size_t length;
+  size_t pos;
+} BrotliMemInput;
+
+// Input callback where *data is a BrotliMemInput struct.
+int BrotliMemInputFunction(void* data, uint8_t* buf, size_t count);
+
+// Returns an input callback that wraps the given memory region.
+BrotliInput BrotliInitMemInput(const uint8_t* buffer, size_t length,
+                               BrotliMemInput* mem_input);
+
+// Output buffer with position.
+typedef struct {
+  uint8_t* buffer;
+  size_t length;
+  size_t pos;
+} BrotliMemOutput;
+
+// Output callback where *data is a BrotliMemOutput struct.
+int BrotliMemOutputFunction(void* data, const uint8_t* buf, size_t count);
+
+// Returns an output callback that wraps the given memory region.
+BrotliOutput BrotliInitMemOutput(uint8_t* buffer, size_t length,
+                                 BrotliMemOutput* mem_output);
+
+// Input callback that reads from standard input.
+int BrotliStdinInputFunction(void* data, uint8_t* buf, size_t count);
+BrotliInput BrotliStdinInput();
+
+// Output callback that writes to standard output.
+int BrotliStdoutOutputFunction(void* data, const uint8_t* buf, size_t count);
+BrotliOutput BrotliStdoutOutput();
+
+// Output callback that writes to a file.
+int BrotliFileOutputFunction(void* data, const uint8_t* buf, size_t count);
+BrotliOutput BrotliFileOutput(FILE* f);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  // BROTLI_DEC_STREAMS_H_
diff --git a/brotli/enc/backward_references.cc b/brotli/enc/backward_references.cc
index 606fb99..5675633 100644
--- a/brotli/enc/backward_references.cc
+++ b/brotli/enc/backward_references.cc
@@ -20,60 +20,64 @@
 #include <vector>
 
 #include "./command.h"
-#include "./hash.h"
-#include "./literal_cost.h"
 
 namespace brotli {
 
-void CreateBackwardReferences(const uint8_t* data,
-                              int length,
+void CreateBackwardReferences(size_t num_bytes,
+                              size_t position,
+                              const uint8_t* ringbuffer,
+                              const float* literal_cost,
+                              size_t ringbuffer_mask,
+                              const size_t max_backward_limit,
+                              Hasher* hasher,
                               std::vector<Command>* commands) {
-  HashLongestMatch<13,11> *hasher = new HashLongestMatch<13,11>;
-  float *literal_cost = new float[length];
-  EstimateBitCostsForLiterals(length, data, literal_cost);
-  hasher->SetLiteralCost(literal_cost);
-
   // Length heuristic that seems to help probably by better selection
   // of lazy matches of similar lengths.
   int insert_length = 0;
-  size_t i = 0;
+  size_t i = position & ringbuffer_mask;
+  const int i_diff = position - i;
+  const size_t i_end = i + num_bytes;
 
   double average_cost = 0.0;
-  for (int i = 0; i < length; ++i) {
-    average_cost += literal_cost[i];
+  for (int k = position; k < position + num_bytes; ++k) {
+    average_cost += literal_cost[k & ringbuffer_mask];
   }
-  average_cost /= length;
+  average_cost /= num_bytes;
   hasher->set_average_cost(average_cost);
 
-  while (i + 2 < length) {
+  while (i + 2 < i_end) {
     size_t best_len = 0;
     size_t best_dist = 0;
     double best_score = 0;
-    const size_t max_distance = std::min(i, 1UL << 24);
+    const size_t max_distance = std::min(i + i_diff, max_backward_limit);
     hasher->set_insert_length(insert_length);
     bool match_found = hasher->FindLongestMatch(
-        data, i, length - i, max_distance,
+        ringbuffer, literal_cost, ringbuffer_mask,
+        i + i_diff, i_end - i, max_distance,
         &best_len, &best_dist, &best_score);
     if (match_found) {
       // Found a match. Let's look for something even better ahead.
       int delayed_backward_references_in_row = 0;
-      while (i + 4 < length &&
+      while (i + 4 < i_end &&
              delayed_backward_references_in_row < 4) {
         size_t best_len_2 = 0;
         size_t best_dist_2 = 0;
         double best_score_2 = 0;
-        hasher->Store(data + i, i);
+        hasher->Store(ringbuffer + i, i + i_diff);
         match_found = hasher->FindLongestMatch(
-            data, i + 1, length - i - 1, max_distance,
+            ringbuffer, literal_cost, ringbuffer_mask,
+            i + i_diff + 1, i_end - i - 1, max_distance,
             &best_len_2, &best_dist_2, &best_score_2);
         double cost_diff_lazy = 0;
         if (best_len >= 4) {
-          cost_diff_lazy += hasher->literal_cost(i + 4) - average_cost;
+          cost_diff_lazy +=
+              literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
         }
         {
           const int tail_length = best_len_2 - best_len + 1;
           for (int k = 0; k < tail_length; ++k) {
-            cost_diff_lazy -= hasher->literal_cost(i + best_len + k) -
+            cost_diff_lazy -=
+                literal_cost[(i + best_len + k) & ringbuffer_mask] -
                 average_cost;
           }
         }
@@ -84,7 +88,7 @@ void CreateBackwardReferences(const uint8_t* data,
         }
         // Add bias to slightly avoid lazy matching.
         cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
-        cost_diff_lazy += 0.04 * hasher->literal_cost(i);
+        cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
 
         if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
           // Ok, let's just write one byte for now and start a match from the
@@ -109,18 +113,18 @@ void CreateBackwardReferences(const uint8_t* data,
       insert_length = 0;
       ++i;
       for (int j = 1; j < best_len; ++j) {
-        if (i + 2 < length) {
-          hasher->Store(data + i, i);
+        if (i + 2 < i_end) {
+          hasher->Store(ringbuffer + i, i + i_diff);
         }
         ++i;
       }
     } else {
       ++insert_length;
-      hasher->Store(data + i, i);
+      hasher->Store(ringbuffer + i, i + i_diff);
       ++i;
     }
   }
-  insert_length += (length - i);
+  insert_length += (i_end - i);
 
   if (insert_length > 0) {
     Command cmd;
@@ -129,9 +133,6 @@ void CreateBackwardReferences(const uint8_t* data,
     cmd.copy_distance_ = 0;
     commands->push_back(cmd);
   }
-
-  delete[] literal_cost;
-  delete hasher;
 }
 
 }  // namespace brotli
diff --git a/brotli/enc/backward_references.h b/brotli/enc/backward_references.h
index 795f613..f666ef6 100644
--- a/brotli/enc/backward_references.h
+++ b/brotli/enc/backward_references.h
@@ -20,12 +20,18 @@
 #include <stdint.h>
 #include <vector>
 
+#include "./hash.h"
 #include "./command.h"
 
 namespace brotli {
 
-void CreateBackwardReferences(const uint8_t* data,
-                              int length,
+void CreateBackwardReferences(size_t num_bytes,
+                              size_t position,
+                              const uint8_t* ringbuffer,
+                              const float* literal_cost,
+                              size_t ringbuffer_mask,
+                              const size_t max_backward_limit,
+                              Hasher* hasher,
                               std::vector<Command>* commands);
 
 }  // namespace brotli
diff --git a/brotli/enc/bit_cost.h b/brotli/enc/bit_cost.h
index fc5381b..c2155aa 100644
--- a/brotli/enc/bit_cost.h
+++ b/brotli/enc/bit_cost.h
@@ -122,26 +122,31 @@ static inline int HuffmanBitCost(const uint8_t* depth, int length) {
 template<int kSize>
 double PopulationCost(const Histogram<kSize>& histogram) {
   if (histogram.total_count_ == 0) {
-    return 4;
+    return 11;
   }
-  int symbols[2] = { 0 };
   int count = 0;
-  for (int i = 0; i < kSize && count < 3; ++i) {
+  for (int i = 0; i < kSize && count < 5; ++i) {
     if (histogram.data_[i] > 0) {
-      if (count < 2) symbols[count] = i;
       ++count;
     }
   }
-  if (count <= 2 && symbols[0] < 256 && symbols[1] < 256) {
-    return ((symbols[0] <= 1 ? 4 : 11) +
-            (count == 2 ? 8 + histogram.total_count_ : 0));
+  if (count == 1) {
+    return 11;
+  }
+  if (count == 2) {
+    return 19 + histogram.total_count_;
   }
   uint8_t depth[kSize] = { 0 };
   CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
-  int bits = HuffmanBitCost(depth, kSize);
+  int bits = 0;
   for (int i = 0; i < kSize; ++i) {
     bits += histogram.data_[i] * depth[i];
   }
+  if (count == 3) {
+    bits += 27;
+  } else {
+    bits += HuffmanBitCost(depth, kSize);
+  }
   return bits;
 }
 
diff --git a/brotli/enc/context.h b/brotli/enc/context.h
index 9bf9227..9b015d2 100644
--- a/brotli/enc/context.h
+++ b/brotli/enc/context.h
@@ -21,25 +21,124 @@
 
 namespace brotli {
 
-static const int kSigned2BitContextLookup[] = {
+// Second-order context lookup table for UTF8 byte streams.
+//
+// If p1 and p2 are the previous two bytes, we calcualte the context as
+//
+//   context = kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256].
+//
+// If the previous two bytes are ASCII characters (i.e. < 128), this will be
+// equivalent to
+//
+//   context = 4 * context1(p1) + context2(p2),
+//
+// where context1 is based on the previous byte in the following way:
+//
+//   0  : non-ASCII control
+//   1  : \t, \n, \r
+//   2  : space
+//   3  : other punctuation
+//   4  : " '
+//   5  : %
+//   6  : ( < [ {
+//   7  : ) > ] }
+//   8  : , ; :
+//   9  : .
+//   10 : =
+//   11 : number
+//   12 : upper-case vowel
+//   13 : upper-case consonant
+//   14 : lower-case vowel
+//   15 : lower-case consonant
+//
+// and context2 is based on the second last byte:
+//
+//   0 : control, space
+//   1 : punctuation
+//   2 : upper-case letter, number
+//   3 : lower-case letter
+//
+// If the last byte is ASCII, and the second last byte is not (in a valid UTF8
+// stream it will be a continuation byte, value between 128 and 191), the
+// context is the same as if the second last byte was an ASCII control or space.
+//
+// If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
+// be a continuation byte and the context id is 2 or 3 depending on the LSB of
+// the last byte and to a lesser extent on the second last byte if it is ASCII.
+//
+// If the last byte is a UTF8 continuation byte, the second last byte can be:
+//   - continuation byte: the next byte is probably ASCII or lead byte (assuming
+//     4-byte UTF8 characters are rare) and the context id is 0 or 1.
+//   - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1
+//   - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3
+//
+// The possible value combinations of the previous two bytes, the range of
+// context ids and the type of the next byte is summarized in the table below:
+//
+// |--------\-----------------------------------------------------------------|
+// |         \                         Last byte                              |
+// | Second   \---------------------------------------------------------------|
+// | last byte \    ASCII            |   cont. byte        |   lead byte      |
+// |            \   (0-127)          |   (128-191)         |   (192-)         |
+// |=============|===================|=====================|==================|
+// |  ASCII      | next: ASCII/lead  |  not valid          |  next: cont.     |
+// |  (0-127)    | context: 4 - 63   |                     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  cont. byte | next: ASCII/lead  |  next: ASCII/lead   |  next: cont.     |
+// |  (128-191)  | context: 4 - 63   |  context: 0 - 1     |  context: 2 - 3  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: ASCII/lead   |  not valid       |
+// |  (192-207)  |                   |  context: 0 - 1     |                  |
+// |-------------|-------------------|---------------------|------------------|
+// |  lead byte  | not valid         |  next: cont.        |  not valid       |
+// |  (208-)     |                   |  context: 2 - 3     |                  |
+// |-------------|-------------------|---------------------|------------------|
+static const uint8_t kUTF8ContextLookup[512] = {
+  // Last byte.
+  //
+  // ASCII range.
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
+   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+   8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
+  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
+  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
+  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
+  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
+  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0,
+  // UTF8 continuation byte range.
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+  // UTF8 lead byte range.
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+  // Second last byte.
+  //
+  // ASCII range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
+  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
+  // UTF8 continuation byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // UTF8 lead byte range.
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
 };
 
+// Context lookup table for small signed integers.
 static const int kSigned3BitContextLookup[] = {
   0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -59,69 +158,25 @@ static const int kSigned3BitContextLookup[] = {
   6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
 };
 
-static const int kSigned4BitContextLookup[] = {
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 14, 15,
-};
-
 enum ContextType {
-  CONTEXT_NONE        = 0,
-  CONTEXT_FULL        = 1,
-  CONTEXT_MSB7        = 2,
-  CONTEXT_MSB6        = 3,
-  CONTEXT_MSB5        = 4,
-  CONTEXT_MSB4        = 5,
-  CONTEXT_MSB3        = 6,
-  CONTEXT_MSB2        = 7,
-  CONTEXT_MSB1        = 8,
-  CONTEXT_IS_ZERO     = 9,
-  CONTEXT_SIGNED_2BIT = 10,
-  CONTEXT_SIGNED_3BIT = 11,
-  CONTEXT_SIGNED_4BIT = 12,
-  CONTEXT_SIGNED_MIXED_3BYTE = 13,
+  CONTEXT_LSB6         = 0,
+  CONTEXT_MSB6         = 1,
+  CONTEXT_UTF8         = 2,
+  CONTEXT_SIGNED       = 3
 };
 
-static const int kContextSize[] = {
-  1, 256, 128, 64, 32, 16, 8, 4, 2, 2, 4, 8, 16, 64,
-};
-
-static inline int NumContexts(int mode) {
-  return kContextSize[mode];
-}
-
-static inline uint8_t Context(uint8_t prev_byte, uint8_t prev_byte2,
-                              uint8_t prev_byte3, int mode) {
+static inline uint8_t Context(uint8_t p1, uint8_t p2, int mode) {
   switch (mode) {
-    case CONTEXT_NONE:
-      return 0;
-    case CONTEXT_IS_ZERO:
-      return prev_byte == 0 ? 0 : 1;
-    case CONTEXT_SIGNED_2BIT:
-      return kSigned2BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_3BIT:
-      return kSigned3BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_4BIT:
-      return kSigned4BitContextLookup[prev_byte];
-    case CONTEXT_SIGNED_MIXED_3BYTE:
-      return ((kSigned3BitContextLookup[prev_byte] << 3) +
-              (kSigned2BitContextLookup[prev_byte2] << 1) +
-              (prev_byte3 == 0 ? 0 : 1));
+    case CONTEXT_LSB6:
+      return p1 & 0x3f;
+    case CONTEXT_MSB6:
+      return p1 >> 2;
+    case CONTEXT_UTF8:
+      return kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256];
+    case CONTEXT_SIGNED:
+      return (kSigned3BitContextLookup[p1] << 3) + kSigned3BitContextLookup[p2];
     default:
-      return prev_byte >> (mode - 1);
+      return 0;
   }
 }
 
diff --git a/brotli/enc/encode.cc b/brotli/enc/encode.cc
index cefc7dc..bb3e3b8 100644
--- a/brotli/enc/encode.cc
+++ b/brotli/enc/encode.cc
@@ -26,7 +26,9 @@
 #include "./context.h"
 #include "./entropy_encode.h"
 #include "./fast_log.h"
+#include "./hash.h"
 #include "./histogram.h"
+#include "./literal_cost.h"
 #include "./prefix.h"
 #include "./write_bits.h"
 
@@ -41,31 +43,39 @@ double Entropy(const std::vector<Histogram<kSize> >& histograms) {
   return retval;
 }
 
+template<int kSize>
+double TotalBitCost(const std::vector<Histogram<kSize> >& histograms) {
+  double retval = 0;
+  for (int i = 0; i < histograms.size(); ++i) {
+    retval += PopulationCost(histograms[i]);
+  }
+  return retval;
+}
+
 void EncodeSize(size_t len, int* storage_ix, uint8_t* storage) {
   std::vector<uint8_t> len_bytes;
-  while (len > 0) {
+  do {
     len_bytes.push_back(len & 0xff);
     len >>= 8;
-  };
+  } while (len > 0);
   WriteBits(3, len_bytes.size(), storage_ix, storage);
   for (int i = 0; i < len_bytes.size(); ++i) {
     WriteBits(8, len_bytes[i], storage_ix, storage);
   }
 }
 
-void EncodeMetaBlockLength(int input_size_bits,
-                           size_t meta_block_size,
-                           bool is_last_meta_block,
+void EncodeMetaBlockLength(size_t meta_block_size,
                            int* storage_ix, uint8_t* storage) {
-  WriteBits(1, is_last_meta_block, storage_ix, storage);
-  if (is_last_meta_block) return;
-  while (input_size_bits > 0) {
-    WriteBits(8, meta_block_size & 0xff, storage_ix, storage);
-    meta_block_size >>= 8;
-    input_size_bits -= 8;
+  WriteBits(1, 0, storage_ix, storage);
+  int num_bits = Log2Floor(meta_block_size) + 1;
+  WriteBits(3, (num_bits + 3) >> 2, storage_ix, storage);
+  while (num_bits > 0) {
+    WriteBits(4, meta_block_size & 0xf, storage_ix, storage);
+    meta_block_size >>= 4;
+    num_bits -= 4;
   }
-  if (input_size_bits > 0) {
-    WriteBits(input_size_bits, meta_block_size, storage_ix, storage);
+  if (num_bits > 0) {
+    WriteBits(num_bits, meta_block_size, storage_ix, storage);
   }
 }
 
@@ -82,7 +92,7 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
     const uint8_t* code_length_bitdepth,
     int* storage_ix, uint8_t* storage) {
   static const uint8_t kStorageOrder[kCodeLengthCodes] = {
-    17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    1, 2, 3, 4, 0, 17, 18, 5, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
   };
   // Throw away trailing zeros:
   int codes_to_store = kCodeLengthCodes;
@@ -92,8 +102,16 @@ void StoreHuffmanTreeOfHuffmanTreeToBitMask(
     }
   }
   WriteBits(4, codes_to_store - 4, storage_ix, storage);
-  for (int i = 0; i < codes_to_store; ++i) {
-    WriteBits(3, code_length_bitdepth[kStorageOrder[i]], storage_ix, storage);
+  const int skip_two_first =
+      code_length_bitdepth[kStorageOrder[0]] == 0 &&
+      code_length_bitdepth[kStorageOrder[1]] == 0;
+  WriteBits(1, skip_two_first, storage_ix, storage);
+
+  for (int i = skip_two_first * 2; i < codes_to_store; ++i) {
+    uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
+    uint8_t bits[] = { 0, 7, 3, 1, 2, 15 };
+    int v = code_length_bitdepth[kStorageOrder[i]];
+    WriteBits(len[v], bits[v], storage_ix, storage);
   }
 }
 
@@ -124,30 +142,49 @@ void StoreHuffmanTreeToBitMask(
 template<int kSize>
 void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
                       int* storage_ix, uint8_t* storage) {
-  const int kMaxBits = 8;
-  const int kMaxSymbol = 1 << kMaxBits;
-
+  const uint8_t *depth = &code.depth_[0];
+  int max_bits_counter = alphabet_size - 1;
+  int max_bits = 0;
+  while (max_bits_counter) {
+    max_bits_counter >>= 1;
+    ++max_bits;
+  }
   if (code.count_ == 0) {   // emit minimal tree for empty cases
-    // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
-    WriteBits(4, 0x01, storage_ix, storage);
+    // bits: small tree marker: 1, count-1: 0, max_bits-sized encoding for 0
+    WriteBits(3 + max_bits, 0x01, storage_ix, storage);
     return;
   }
-  if (code.count_ <= 2 &&
-      code.symbols_[0] < kMaxSymbol &&
-      code.symbols_[1] < kMaxSymbol) {
-    // Small tree marker to encode 1 or 2 symbols.
+  if (code.count_ <= 4) {
+    int symbols[4];
+    // Quadratic sort.
+    int k, j;
+    for (k = 0; k < code.count_; ++k) {
+      symbols[k] = code.symbols_[k];
+    }
+    for (k = 0; k < code.count_; ++k) {
+      for (j = k + 1; j < code.count_; ++j) {
+        if (depth[symbols[j]] < depth[symbols[k]]) {
+          int t = symbols[k];
+          symbols[k] = symbols[j];
+          symbols[j] = t;
+        }
+      }
+    }
+    // Small tree marker to encode 1-4 symbols.
     WriteBits(1, 1, storage_ix, storage);
-    WriteBits(1, code.count_ - 1, storage_ix, storage);
-    if (code.symbols_[0] <= 1) {
-      // Code bit for small (1 bit) symbol value.
-      WriteBits(1, 0, storage_ix, storage);
-      WriteBits(1, code.symbols_[0], storage_ix, storage);
-    } else {
-      WriteBits(1, 1, storage_ix, storage);
-      WriteBits(8, code.symbols_[0], storage_ix, storage);
+    WriteBits(2, code.count_ - 1, storage_ix, storage);
+    for (int i = 0; i < code.count_; ++i) {
+      WriteBits(max_bits, symbols[i], storage_ix, storage);
     }
-    if (code.count_ == 2) {
-      WriteBits(8, code.symbols_[1], storage_ix, storage);
+    if (code.count_ == 4) {
+      if (depth[symbols[0]] == 2 &&
+          depth[symbols[1]] == 2 &&
+          depth[symbols[2]] == 2 &&
+          depth[symbols[3]] == 2) {
+        WriteBits(1, 0, storage_ix, storage);
+      } else {
+        WriteBits(1, 1, storage_ix, storage);
+      }
     }
     return;
   }
@@ -156,7 +193,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
   uint8_t huffman_tree[kSize];
   uint8_t huffman_tree_extra_bits[kSize];
   int huffman_tree_size = 0;
-  WriteHuffmanTree(&code.depth_[0],
+  WriteHuffmanTree(depth,
                    alphabet_size,
                    &huffman_tree[0],
                    &huffman_tree_extra_bits[0],
@@ -167,7 +204,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
     huffman_tree_histogram.Add(huffman_tree[i]);
   }
   EntropyCode<kCodeLengthCodes> huffman_tree_entropy;
-  BuildEntropyCode(huffman_tree_histogram, 7, kCodeLengthCodes,
+  BuildEntropyCode(huffman_tree_histogram, 5, kCodeLengthCodes,
                    &huffman_tree_entropy);
   Histogram<kCodeLengthCodes> trimmed_histogram = huffman_tree_histogram;
   uint8_t* last_code = &huffman_tree[huffman_tree_size - 1];
@@ -178,7 +215,7 @@ void StoreHuffmanCode(const EntropyCode<kSize>& code, int alphabet_size,
   bool write_length = false;
   if (trimmed_size > 1 && trimmed_size < huffman_tree_size) {
     EntropyCode<kCodeLengthCodes> trimmed_entropy;
-    BuildEntropyCode(trimmed_histogram, 7, kCodeLengthCodes, &trimmed_entropy);
+    BuildEntropyCode(trimmed_histogram, 5, kCodeLengthCodes, &trimmed_entropy);
     int huffman_bit_cost = HuffmanTreeBitCost(huffman_tree_histogram,
                                               huffman_tree_entropy);
     int trimmed_bit_cost = HuffmanTreeBitCost(trimmed_histogram,
@@ -247,16 +284,15 @@ void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy,
   }
 }
 
-
-void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
+void ComputeDistanceShortCodes(std::vector<Command>* cmds,
+                               int* dist_ringbuffer,
+                               size_t* ringbuffer_idx) {
   static const int kIndexOffset[16] = {
     3, 2, 1, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2
   };
   static const int kValueOffset[16] = {
     0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
   };
-  int dist_ringbuffer[4] = { 4, 11, 15, 16 };
-  int ringbuffer_idx = 0;
   for (int i = 0; i < cmds->size(); ++i) {
     int cur_dist = (*cmds)[i].copy_distance_;
     if (cur_dist == 0) break;
@@ -268,7 +304,7 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
         // with them.
         continue;
       }
-      int comp = (dist_ringbuffer[(ringbuffer_idx + kIndexOffset[k]) & 3] +
+      int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] +
                   kValueOffset[k]);
       if (cur_dist == comp) {
         dist_code = k + 1;
@@ -276,8 +312,8 @@ void ComputeDistanceShortCodes(std::vector<Command>* cmds) {
       }
     }
     if (dist_code > 1) {
-      dist_ringbuffer[ringbuffer_idx & 3] = cur_dist;
-      ++ringbuffer_idx;
+      dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist;
+      ++(*ringbuffer_idx);
     }
     (*cmds)[i].distance_code_ = dist_code;
   }
@@ -414,19 +450,8 @@ int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
 }
 
 void EncodeContextMap(const std::vector<int>& context_map,
-                      int context_mode,
-                      int context_mode_bits,
                       int num_clusters,
                       int* storage_ix, uint8_t* storage) {
-  if (context_mode == 0) {
-    WriteBits(1, 0, storage_ix, storage);  // no context
-    return;
-  }
-
-  WriteBits(1, 1, storage_ix, storage);  // have context
-  if (context_mode_bits > 0) {
-    WriteBits(context_mode_bits, context_mode - 1, storage_ix, storage);
-  }
   WriteBits(8, num_clusters - 1, storage_ix, storage);
 
   if (num_clusters == 1 || num_clusters == context_map.size()) {
@@ -560,7 +585,6 @@ struct EncodingParams {
   int num_direct_distance_codes;
   int distance_postfix_bits;
   int literal_context_mode;
-  int distance_context_mode;
 };
 
 struct MetaBlock {
@@ -569,6 +593,7 @@ struct MetaBlock {
   BlockSplit literal_split;
   BlockSplit command_split;
   BlockSplit distance_split;
+  std::vector<int> literal_context_modes;
   std::vector<int> literal_context_map;
   std::vector<int> distance_context_map;
   std::vector<HistogramLiteral> literal_histograms;
@@ -578,8 +603,9 @@ struct MetaBlock {
 
 void BuildMetaBlock(const EncodingParams& params,
                     const std::vector<Command>& cmds,
-                    const uint8_t* input_buffer,
-                    size_t pos,
+                    const uint8_t* ringbuffer,
+                    const size_t pos,
+                    const size_t mask,
                     MetaBlock* mb) {
   mb->cmds = cmds;
   mb->params = params;
@@ -587,7 +613,7 @@ void BuildMetaBlock(const EncodingParams& params,
                          mb->params.num_direct_distance_codes,
                          mb->params.distance_postfix_bits);
   SplitBlock(mb->cmds,
-             input_buffer + pos,
+             &ringbuffer[pos & mask],
              &mb->literal_split,
              &mb->command_split,
              &mb->distance_split);
@@ -595,16 +621,14 @@ void BuildMetaBlock(const EncodingParams& params,
   ComputeBlockTypeShortCodes(&mb->command_split);
   ComputeBlockTypeShortCodes(&mb->distance_split);
 
-  int num_literal_contexts_per_block_type =
-      NumContexts(mb->params.literal_context_mode);
+  mb->literal_context_modes.resize(mb->literal_split.num_types_,
+                                   mb->params.literal_context_mode);
+
+
   int num_literal_contexts =
-      mb->literal_split.num_types_ *
-      num_literal_contexts_per_block_type;
-  int num_distance_contexts_per_block_type =
-      (mb->params.distance_context_mode > 0 ? 4 : 1);
+      mb->literal_split.num_types_ << kLiteralContextBits;
   int num_distance_contexts =
-      mb->distance_split.num_types_ *
-      num_distance_contexts_per_block_type;
+      mb->distance_split.num_types_ << kDistanceContextBits;
   std::vector<HistogramLiteral> literal_histograms(num_literal_contexts);
   mb->command_histograms.resize(mb->command_split.num_types_);
   std::vector<HistogramDistance> distance_histograms(num_distance_contexts);
@@ -612,10 +636,10 @@ void BuildMetaBlock(const EncodingParams& params,
                   mb->literal_split,
                   mb->command_split,
                   mb->distance_split,
-                  input_buffer,
+                  ringbuffer,
                   pos,
-                  mb->params.literal_context_mode,
-                  mb->params.distance_context_mode,
+                  mask,
+                  mb->literal_context_modes,
                   &literal_histograms,
                   &mb->command_histograms,
                   &distance_histograms);
@@ -625,24 +649,20 @@ void BuildMetaBlock(const EncodingParams& params,
   static const int kMaxNumberOfHistograms = 240;
 
   mb->literal_histograms = literal_histograms;
-  if (mb->params.literal_context_mode > 0) {
-    ClusterHistograms(literal_histograms,
-                      num_literal_contexts_per_block_type,
-                      mb->literal_split.num_types_,
-                      kMaxNumberOfHistograms,
-                      &mb->literal_histograms,
-                      &mb->literal_context_map);
-  }
+  ClusterHistograms(literal_histograms,
+                    1 << kLiteralContextBits,
+                    mb->literal_split.num_types_,
+                    kMaxNumberOfHistograms,
+                    &mb->literal_histograms,
+                    &mb->literal_context_map);
 
   mb->distance_histograms = distance_histograms;
-  if (mb->params.distance_context_mode > 0) {
-    ClusterHistograms(distance_histograms,
-                      num_distance_contexts_per_block_type,
-                      mb->distance_split.num_types_,
-                      kMaxNumberOfHistograms,
-                      &mb->distance_histograms,
-                      &mb->distance_context_map);
-  }
+  ClusterHistograms(distance_histograms,
+                    1 << kDistanceContextBits,
+                    mb->distance_split.num_types_,
+                    kMaxNumberOfHistograms,
+                    &mb->distance_histograms,
+                    &mb->distance_context_map);
 }
 
 size_t MetaBlockLength(const std::vector<Command>& cmds) {
@@ -655,14 +675,13 @@ size_t MetaBlockLength(const std::vector<Command>& cmds) {
 }
 
 void StoreMetaBlock(const MetaBlock& mb,
-                    const uint8_t* input_buffer,
-                    int input_size_bits,
-                    bool is_last,
+                    const uint8_t* ringbuffer,
+                    const size_t mask,
                     size_t* pos,
                     int* storage_ix, uint8_t* storage) {
   size_t length = MetaBlockLength(mb.cmds);
   const size_t end_pos = *pos + length;
-  EncodeMetaBlockLength(input_size_bits, length - 1, is_last,
+  EncodeMetaBlockLength(length - 1,
                         storage_ix, storage);
   BlockSplitCode literal_split_code;
   BlockSplitCode command_split_code;
@@ -680,10 +699,11 @@ void StoreMetaBlock(const MetaBlock& mb,
   int num_distance_codes =
       kNumDistanceShortCodes + mb.params.num_direct_distance_codes +
       (48 << mb.params.distance_postfix_bits);
-  EncodeContextMap(mb.literal_context_map, mb.params.literal_context_mode, 4,
-                   mb.literal_histograms.size(), storage_ix, storage);
-  EncodeContextMap(mb.distance_context_map, mb.params.distance_context_mode, 0,
-                   mb.distance_histograms.size(), storage_ix, storage);
+  for (int i = 0; i < mb.literal_split.num_types_; ++i) {
+    WriteBits(2, mb.literal_context_modes[i], storage_ix, storage);
+  }
+  EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(), storage_ix, storage);
+  EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(), storage_ix, storage);
   std::vector<EntropyCodeLiteral> literal_codes;
   std::vector<EntropyCodeCommand> command_codes;
   std::vector<EntropyCodeDistance> distance_codes;
@@ -705,27 +725,22 @@ void StoreMetaBlock(const MetaBlock& mb,
     for (int j = 0; j < cmd.insert_length_; ++j) {
       MoveAndEncode(literal_split_code, &literal_it, storage_ix, storage);
       int histogram_idx = literal_it.type_;
-      if (mb.params.literal_context_mode > 0) {
-        uint8_t prev_byte = *pos > 0 ? input_buffer[*pos - 1] : 0;
-        uint8_t prev_byte2 = *pos > 1 ? input_buffer[*pos - 2] : 0;
-        uint8_t prev_byte3 = *pos > 2 ? input_buffer[*pos - 3] : 0;
-        int context = (literal_it.type_ *
-                       NumContexts(mb.params.literal_context_mode) +
-                       Context(prev_byte, prev_byte2, prev_byte3,
-                               mb.params.literal_context_mode));
-        histogram_idx = mb.literal_context_map[context];
-      }
-      EntropyEncode(input_buffer[(*pos)++],
+      uint8_t prev_byte = *pos > 0 ? ringbuffer[(*pos - 1) & mask] : 0;
+      uint8_t prev_byte2 = *pos > 1 ? ringbuffer[(*pos - 2) & mask] : 0;
+      int context = ((literal_it.type_ << kLiteralContextBits) +
+                     Context(prev_byte, prev_byte2,
+                             mb.literal_context_modes[literal_it.type_]));
+      histogram_idx = mb.literal_context_map[context];
+      EntropyEncode(ringbuffer[*pos & mask],
                     literal_codes[histogram_idx], storage_ix, storage);
+      ++(*pos);
     }
     if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) {
       MoveAndEncode(distance_split_code, &distance_it, storage_ix, storage);
       int histogram_index = distance_it.type_;
-      if (mb.params.distance_context_mode > 0) {
-        int context = distance_it.type_ << 2;
-        context += (cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2;
-        histogram_index = mb.distance_context_map[context];
-      }
+      int context = (distance_it.type_ << 2) +
+          ((cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2);
+      histogram_index = mb.distance_context_map[context];
       EncodeCopyDistance(cmd, distance_codes[histogram_index],
                          storage_ix, storage);
     }
@@ -733,45 +748,123 @@ void StoreMetaBlock(const MetaBlock& mb,
   }
 }
 
+static const int kWindowBits = 22;
+// To make decoding faster, we allow the decoder to write 16 bytes ahead in
+// its ringbuffer, therefore the encoder has to decrease max distance by this
+// amount.
+static const int kDecoderRingBufferWriteAheadSlack = 16;
+static const int kMaxBackwardDistance =
+    (1 << kWindowBits) - kDecoderRingBufferWriteAheadSlack;
+
+static const int kMetaBlockSizeBits = 21;
+static const int kRingBufferBits = 23;
+static const int kRingBufferMask = (1 << kRingBufferBits) - 1;
+
+BrotliCompressor::BrotliCompressor()
+    : hasher_(new Hasher),
+      dist_ringbuffer_idx_(0),
+      input_pos_(0),
+      ringbuffer_(kRingBufferBits, kMetaBlockSizeBits),
+      literal_cost_(1 << kRingBufferBits),
+      storage_ix_(0),
+      storage_(new uint8_t[2 << kMetaBlockSizeBits]) {
+    dist_ringbuffer_[0] = 4;
+    dist_ringbuffer_[1] = 11;
+    dist_ringbuffer_[2] = 15;
+    dist_ringbuffer_[3] = 16;
+    storage_[0] = 0;
+  }
+
+BrotliCompressor::~BrotliCompressor() {
+  delete hasher_;
+  delete[] storage_;
+}
+
+void BrotliCompressor::WriteStreamHeader() {
+  // Don't encode input size.
+  WriteBits(3, 0, &storage_ix_, storage_);
+  // Encode window size.
+  WriteBits(1, 1, &storage_ix_, storage_);
+  WriteBits(3, kWindowBits - 17, &storage_ix_, storage_);
+}
+
+void BrotliCompressor::WriteMetaBlock(const size_t input_size,
+                                      const uint8_t* input_buffer,
+                                      size_t* encoded_size,
+                                      uint8_t* encoded_buffer) {
+  ringbuffer_.Write(input_buffer, input_size);
+  EstimateBitCostsForLiterals(input_pos_, input_size,
+                              kRingBufferMask, ringbuffer_.start(),
+                              &literal_cost_[0]);
+  std::vector<Command> commands;
+  CreateBackwardReferences(input_size, input_pos_,
+                           ringbuffer_.start(),
+                           &literal_cost_[0],
+                           kRingBufferMask, kMaxBackwardDistance,
+                           hasher_,
+                           &commands);
+  ComputeDistanceShortCodes(&commands, dist_ringbuffer_,
+                            &dist_ringbuffer_idx_);
+  EncodingParams params;
+  params.num_direct_distance_codes = 12;
+  params.distance_postfix_bits = 1;
+  params.literal_context_mode = CONTEXT_SIGNED;
+  MetaBlock mb;
+  BuildMetaBlock(params, commands, ringbuffer_.start(), input_pos_,
+                 kRingBufferMask, &mb);
+  StoreMetaBlock(mb, ringbuffer_.start(), kRingBufferMask,
+                 &input_pos_, &storage_ix_, storage_);
+  size_t output_size = storage_ix_ >> 3;
+  memcpy(encoded_buffer, storage_, output_size);
+  *encoded_size = output_size;
+  storage_ix_ -= output_size << 3;
+  storage_[storage_ix_ >> 3] = storage_[output_size];
+}
+
+void BrotliCompressor::FinishStream(
+    size_t* encoded_size, uint8_t* encoded_buffer) {
+  WriteBits(1, 1, &storage_ix_, storage_);
+  *encoded_size = (storage_ix_ + 7) >> 3;
+  memcpy(encoded_buffer, storage_, *encoded_size);
+}
+
+
 int BrotliCompressBuffer(size_t input_size,
                          const uint8_t* input_buffer,
                          size_t* encoded_size,
                          uint8_t* encoded_buffer) {
-  int storage_ix = 0;
-  uint8_t* storage = encoded_buffer;
-  WriteBitsPrepareStorage(storage_ix, storage);
-  EncodeSize(input_size, &storage_ix, storage);
-
   if (input_size == 0) {
-    *encoded_size = (storage_ix + 7) >> 3;
+    encoded_buffer[0] = 1;
+    encoded_buffer[1] = 0;
+    *encoded_size = 2;
     return 1;
   }
-  int input_size_bits = Log2Ceiling(input_size);
 
-  std::vector<Command> all_commands;
-  CreateBackwardReferences(input_buffer, input_size, &all_commands);
-  ComputeDistanceShortCodes(&all_commands);
+  BrotliCompressor compressor;
+  compressor.WriteStreamHeader();
 
-  std::vector<std::vector<Command> > meta_block_commands;
-  SplitBlockByTotalLength(all_commands, input_size, 2 << 20,
-                          &meta_block_commands);
+  const int max_block_size = 1 << kMetaBlockSizeBits;
+  size_t max_output_size = *encoded_size;
+  const uint8_t* input_end = input_buffer + input_size;
+  *encoded_size = 0;
 
-  size_t pos = 0;
-  for (int block_idx = 0; block_idx < meta_block_commands.size(); ++block_idx) {
-    const std::vector<Command>& commands = meta_block_commands[block_idx];
-    bool is_last_meta_block = (block_idx + 1 == meta_block_commands.size());
-    EncodingParams params;
-    params.num_direct_distance_codes = 12;
-    params.distance_postfix_bits = 1;
-    params.literal_context_mode = CONTEXT_SIGNED_MIXED_3BYTE;
-    params.distance_context_mode = 1;
-    MetaBlock mb;
-    BuildMetaBlock(params, commands, input_buffer, pos, &mb);
-    StoreMetaBlock(mb, input_buffer, input_size_bits, is_last_meta_block,
-                   &pos, &storage_ix, storage);
+  while (input_buffer < input_end) {
+    int block_size = max_block_size;
+    if (block_size >= input_end - input_buffer) {
+      block_size = input_end - input_buffer;
+    }
+    size_t output_size = max_output_size;
+    compressor.WriteMetaBlock(block_size, input_buffer,
+                              &output_size, &encoded_buffer[*encoded_size]);
+    input_buffer += block_size;
+    *encoded_size += output_size;
+    max_output_size -= output_size;
   }
 
-  *encoded_size = (storage_ix + 7) >> 3;
+  size_t output_size = max_output_size;
+  compressor.FinishStream(&output_size, &encoded_buffer[*encoded_size]);
+  *encoded_size += output_size;
+
   return 1;
 }
 
diff --git a/brotli/enc/encode.h b/brotli/enc/encode.h
index 6e4044d..d2fb18e 100644
--- a/brotli/enc/encode.h
+++ b/brotli/enc/encode.h
@@ -20,9 +20,45 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string>
+#include <vector>
+#include "./hash.h"
+#include "./ringbuffer.h"
 
 namespace brotli {
 
+class BrotliCompressor {
+ public:
+  BrotliCompressor();
+  ~BrotliCompressor();
+
+  // Writes the stream header into the internal output buffer.
+  void WriteStreamHeader();
+
+  // Encodes the data in input_buffer as a meta-block and writes it to
+  // encoded_buffer and sets *encoded_size to the number of bytes that was
+  // written.
+  void WriteMetaBlock(const size_t input_size,
+                      const uint8_t* input_buffer,
+                      size_t* encoded_size,
+                      uint8_t* encoded_buffer);
+
+  // Writes a zero-length meta-block with end-of-input bit set to the
+  // internal output buffer and copies the output buffer to encoded_buffer and
+  // sets *encoded_size to the number of bytes written.
+  void FinishStream(size_t* encoded_size, uint8_t* encoded_buffer);
+
+
+ private:
+  Hasher* hasher_;
+  int dist_ringbuffer_[4];
+  size_t dist_ringbuffer_idx_;
+  size_t input_pos_;
+  RingBuffer ringbuffer_;
+  std::vector<float> literal_cost_;
+  int storage_ix_;
+  uint8_t* storage_;
+};
+
 // Compresses the data in input_buffer into encoded_buffer, and sets
 // *encoded_size to the compressed length.
 // Returns 0 if there was an error and 1 otherwise.
diff --git a/brotli/enc/entropy_encode.cc b/brotli/enc/entropy_encode.cc
index 9a4d3e4..37d0d9e 100644
--- a/brotli/enc/entropy_encode.cc
+++ b/brotli/enc/entropy_encode.cc
@@ -43,6 +43,9 @@ HuffmanTree::HuffmanTree() {}
 
 // Sort the root nodes, least popular first.
 bool SortHuffmanTree(const HuffmanTree &v0, const HuffmanTree &v1) {
+  if (v0.total_count_ == v1.total_count_) {
+    return v0.index_right_or_value_ > v1.index_right_or_value_;
+  }
   return v0.total_count_ < v1.total_count_;
 }
 
@@ -276,7 +279,7 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
   }
   // 3) Let's replace those population counts that lead to more rle codes.
   stride = 0;
-  limit = counts[0];
+  limit = (counts[0] + counts[1] + counts[2]) / 3 + 1;
   sum = 0;
   for (i = 0; i < length + 1; ++i) {
     if (i == length || good_for_rle[i] ||
@@ -301,11 +304,10 @@ int OptimizeHuffmanCountsForRle(int length, int* counts) {
       }
       stride = 0;
       sum = 0;
-      if (i < length - 3) {
+      if (i < length - 2) {
         // All interesting strides have a count of at least 4,
         // at least when non-zeros.
-        limit = (counts[i] + counts[i + 1] +
-                 counts[i + 2] + counts[i + 3] + 2) / 4;
+        limit = (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 1;
       } else if (i < length) {
         limit = counts[i];
       } else {
@@ -329,7 +331,7 @@ void WriteHuffmanTree(const uint8_t* depth, const int length,
                       uint8_t* tree,
                       uint8_t* extra_bits_data,
                       int* huffman_tree_size) {
-  int previous_value = 0;
+  int previous_value = 8;
   for (uint32_t i = 0; i < length;) {
     const int value = depth[i];
     int reps = 1;
diff --git a/brotli/enc/entropy_encode.h b/brotli/enc/entropy_encode.h
index e5993e6..6f08e9a 100644
--- a/brotli/enc/entropy_encode.h
+++ b/brotli/enc/entropy_encode.h
@@ -66,8 +66,8 @@ struct EntropyCode {
   uint16_t bits_[kSize];
   // How many non-zero depth.
   int count_;
-  // First two symbols with non-zero depth.
-  int symbols_[2];
+  // First four symbols with non-zero depth.
+  int symbols_[4];
 };
 
 template<int kSize>
@@ -82,7 +82,7 @@ void BuildEntropyCode(const Histogram<kSize>& histogram,
   if (histogram.total_count_ == 0) return;
   for (int i = 0; i < kSize; ++i) {
     if (histogram.data_[i] > 0) {
-      if (code->count_ < 2) code->symbols_[code->count_] = i;
+      if (code->count_ < 4) code->symbols_[code->count_] = i;
       ++code->count_;
     }
   }
diff --git a/brotli/enc/hash.h b/brotli/enc/hash.h
index 9cacf7e..9b9bbab 100644
--- a/brotli/enc/hash.h
+++ b/brotli/enc/hash.h
@@ -103,8 +103,7 @@ template <int kBucketBits, int kBlockBits>
 class HashLongestMatch {
  public:
   HashLongestMatch()
-      : literal_cost_(NULL),
-        last_distance1_(4),
+      : last_distance1_(4),
         last_distance2_(11),
         last_distance3_(15),
         last_distance4_(16),
@@ -115,10 +114,6 @@ class HashLongestMatch {
   void Reset() {
     std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
   }
-  void SetLiteralCost(float *cost) {
-    literal_cost_ = cost;
-  }
-  double literal_cost(int i) const { return literal_cost_[i]; }
 
   // Look at 3 bytes at data.
   // Compute a hash from these, and store the value of ix at that position.
@@ -146,25 +141,27 @@ class HashLongestMatch {
   // into best_distance_out.
   // Write the score of the best match into best_score_out.
   bool FindLongestMatch(const uint8_t * __restrict data,
+                        const float * __restrict literal_cost,
+                        const size_t ring_buffer_mask,
                         const uint32_t cur_ix,
                         uint32_t max_length,
                         const uint32_t max_backward,
                         size_t * __restrict best_len_out,
                         size_t * __restrict best_distance_out,
                         double * __restrict best_score_out) {
-    const double start_cost4 = literal_cost_ == NULL ? 20 :
-        literal_cost_[cur_ix] +
-        literal_cost_[cur_ix + 1] +
-        literal_cost_[cur_ix + 2] +
-        literal_cost_[cur_ix + 3];
-
-    const double start_cost3 = literal_cost_ == NULL ? 15 :
-        literal_cost_[cur_ix] +
-        literal_cost_[cur_ix + 1] +
-        literal_cost_[cur_ix + 2] + 0.3;
-    double start_cost2 = literal_cost_ == NULL ? 10 :
-        literal_cost_[cur_ix] +
-        literal_cost_[cur_ix + 1] + 1.2;
+    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
+    const double start_cost4 = literal_cost == NULL ? 20 :
+        literal_cost[cur_ix_masked] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
+        literal_cost[(cur_ix + 2) & ring_buffer_mask] +
+        literal_cost[(cur_ix + 3) & ring_buffer_mask];
+    const double start_cost3 = literal_cost == NULL ? 15 :
+        literal_cost[cur_ix_masked] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] +
+        literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
+    double start_cost2 = literal_cost == NULL ? 10 :
+        literal_cost[cur_ix_masked] +
+        literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
     bool match_found = false;
     // Don't accept a short copy from far away.
     double best_score = 8.25;
@@ -177,7 +174,7 @@ class HashLongestMatch {
     size_t best_ix = 1;
     // Try last distance first.
     for (int i = 0; i < 16; ++i) {
-      int prev_ix = cur_ix;
+      size_t prev_ix = cur_ix;
       switch(i) {
         case 0: prev_ix -= last_distance1_; break;
         case 1: prev_ix -= last_distance2_; break;
@@ -205,11 +202,13 @@ class HashLongestMatch {
       if (PREDICT_FALSE(backward > max_backward)) {
         continue;
       }
-      if (data[cur_ix + best_len] != data[prev_ix + best_len]) {
+      prev_ix &= ring_buffer_mask;
+      if (data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
         continue;
       }
       const size_t len =
-          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix], max_length);
+          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
+                                   max_length);
       if (len >= 3 || (len == 2 && i < 2)) {
         // Comparing for >= 2 does not change the semantics, but just saves for
         // a few unnecessary binary logarithms in backward reference score,
@@ -234,7 +233,7 @@ class HashLongestMatch {
         }
       }
     }
-    const uint32_t key = Hash3Bytes(&data[cur_ix], kBucketBits);
+    const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits);
     const uint32_t * __restrict const bucket = &buckets_[key][0];
     const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
     int stop = int(cur_ix) - 64;
@@ -247,8 +246,9 @@ class HashLongestMatch {
       if (PREDICT_FALSE(backward > max_backward)) {
         break;
       }
-      if (data[cur_ix] != data[prev_ix] ||
-          data[cur_ix + 1] != data[prev_ix + 1]) {
+      prev_ix &= ring_buffer_mask;
+      if (data[cur_ix_masked] != data[prev_ix] ||
+          data[cur_ix_masked + 1] != data[prev_ix + 1]) {
         continue;
       }
       int len = 2;
@@ -269,11 +269,13 @@ class HashLongestMatch {
       if (PREDICT_FALSE(backward > max_backward)) {
         break;
       }
-      if (data[cur_ix + best_len] != data[prev_ix + best_len]) {
+      prev_ix &= ring_buffer_mask;
+      if (data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
         continue;
       }
       const size_t len =
-          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix], max_length);
+          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
+                                   max_length);
       if (len >= 3) {
         // Comparing for >= 3 does not change the semantics, but just saves for
         // a few unnecessary binary logarithms in backward reference score,
@@ -333,10 +335,6 @@ class HashLongestMatch {
   // Buckets containing kBlockSize of backward references.
   uint32_t buckets_[kBucketSize][kBlockSize];
 
-  // Model of how much the ith literal costs to encode using
-  // the entropy model.
-  float *literal_cost_;
-
   int last_distance1_;
   int last_distance2_;
   int last_distance3_;
@@ -349,6 +347,8 @@ class HashLongestMatch {
   double average_cost_;
 };
 
+typedef HashLongestMatch<13, 11> Hasher;
+
 }  // namespace brotli
 
 #endif  // BROTLI_ENC_HASH_H_
diff --git a/brotli/enc/histogram.cc b/brotli/enc/histogram.cc
index 32d7730..fcffd1f 100644
--- a/brotli/enc/histogram.cc
+++ b/brotli/enc/histogram.cc
@@ -31,10 +31,10 @@ void BuildHistograms(
     const BlockSplit& literal_split,
     const BlockSplit& insert_and_copy_split,
     const BlockSplit& dist_split,
-    const uint8_t* input_buffer,
+    const uint8_t* ringbuffer,
     size_t pos,
-    int context_mode,
-    int distance_context_mode,
+    size_t mask,
+    const std::vector<int>& context_modes,
     std::vector<HistogramLiteral>* literal_histograms,
     std::vector<HistogramCommand>* insert_and_copy_histograms,
     std::vector<HistogramDistance>* copy_dist_histograms) {
@@ -48,25 +48,47 @@ void BuildHistograms(
         cmd.command_prefix_);
     for (int j = 0; j < cmd.insert_length_; ++j) {
       literal_it.Next();
-      uint8_t prev_byte = pos > 0 ? input_buffer[pos - 1] : 0;
-      uint8_t prev_byte2 = pos > 1 ? input_buffer[pos - 2] : 0;
-      uint8_t prev_byte3 = pos > 2 ? input_buffer[pos - 3] : 0;
-      int context = (literal_it.type_ * NumContexts(context_mode) +
-                     Context(prev_byte, prev_byte2, prev_byte3, context_mode));
-      (*literal_histograms)[context].Add(input_buffer[pos]);
+      uint8_t prev_byte = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
+      uint8_t prev_byte2 = pos > 1 ? ringbuffer[(pos - 2) & mask] : 0;
+      int context = (literal_it.type_ << kLiteralContextBits) +
+          Context(prev_byte, prev_byte2, context_modes[literal_it.type_]);
+      (*literal_histograms)[context].Add(ringbuffer[pos & mask]);
       ++pos;
     }
     pos += cmd.copy_length_;
     if (cmd.copy_length_ > 0 && cmd.distance_prefix_ != 0xffff) {
       dist_it.Next();
-      int context = dist_it.type_;
-      if (distance_context_mode > 0) {
-        context <<= 2;
-        context += (cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2;
-      }
+      int context = (dist_it.type_ << kDistanceContextBits) +
+          ((cmd.copy_length_ > 4) ? 3 : cmd.copy_length_ - 2);
       (*copy_dist_histograms)[context].Add(cmd.distance_prefix_);
     }
   }
 }
 
+void BuildLiteralHistogramsForBlockType(
+    const std::vector<Command>& cmds,
+    const BlockSplit& literal_split,
+    const uint8_t* ringbuffer,
+    size_t pos,
+    size_t mask,
+    int block_type,
+    int context_mode,
+    std::vector<HistogramLiteral>* histograms) {
+  BlockSplitIterator literal_it(literal_split);
+  for (int i = 0; i < cmds.size(); ++i) {
+    const Command &cmd = cmds[i];
+    for (int j = 0; j < cmd.insert_length_; ++j) {
+      literal_it.Next();
+      if (literal_it.type_ == block_type) {
+        uint8_t prev_byte = pos > 0 ? ringbuffer[(pos - 1) & mask] : 0;
+        uint8_t prev_byte2 = pos > 1 ? ringbuffer[(pos - 2) & mask] : 0;
+        int context = Context(prev_byte, prev_byte2, context_mode);
+        (*histograms)[context].Add(ringbuffer[pos & mask]);
+      }
+      ++pos;
+    }
+    pos += cmd.copy_length_;
+  }
+}
+
 }  // namespace brotli
diff --git a/brotli/enc/histogram.h b/brotli/enc/histogram.h
index 69f8b9a..d8012f9 100644
--- a/brotli/enc/histogram.h
+++ b/brotli/enc/histogram.h
@@ -79,19 +79,32 @@ typedef Histogram<kNumCommandPrefixes> HistogramCommand;
 typedef Histogram<kNumDistancePrefixes> HistogramDistance;
 typedef Histogram<kNumBlockLenPrefixes> HistogramBlockLength;
 
+static const int kLiteralContextBits = 6;
+static const int kDistanceContextBits = 2;
+
 void BuildHistograms(
     const std::vector<Command>& cmds,
     const BlockSplit& literal_split,
     const BlockSplit& insert_and_copy_split,
     const BlockSplit& dist_split,
-    const uint8_t* input_buffer,
+    const uint8_t* ringbuffer,
     size_t pos,
-    int context_mode,
-    int distance_context_mode,
+    size_t mask,
+    const std::vector<int>& context_modes,
     std::vector<HistogramLiteral>* literal_histograms,
     std::vector<HistogramCommand>* insert_and_copy_histograms,
     std::vector<HistogramDistance>* copy_dist_histograms);
 
+void BuildLiteralHistogramsForBlockType(
+    const std::vector<Command>& cmds,
+    const BlockSplit& literal_split,
+    const uint8_t* ringbuffer,
+    size_t pos,
+    size_t mask,
+    int block_type,
+    int context_mode,
+    std::vector<HistogramLiteral>* histograms);
+
 }  // namespace brotli
 
 #endif  // BROTLI_ENC_HISTOGRAM_H_
diff --git a/brotli/enc/literal_cost.cc b/brotli/enc/literal_cost.cc
index 0dac1a6..2a388d7 100644
--- a/brotli/enc/literal_cost.cc
+++ b/brotli/enc/literal_cost.cc
@@ -22,37 +22,39 @@
 
 namespace brotli {
 
-void EstimateBitCostsForLiterals(size_t len, const uint8_t *data, float *cost) {
+void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
+                                 const uint8_t *data, float *cost) {
   int histogram[256] = { 0 };
   int window_half = 2000;
   int in_window = std::min(static_cast<size_t>(window_half), len);
 
   // Bootstrap histogram.
   for (int i = 0; i < in_window; ++i) {
-    ++histogram[data[i]];
+    ++histogram[data[(pos + i) & mask]];
   }
 
   // Compute bit costs with sliding window.
   for (int i = 0; i < len; ++i) {
     if (i - window_half >= 0) {
       // Remove a byte in the past.
-      --histogram[data[i - window_half]];
+      --histogram[data[(pos + i - window_half) & mask]];
       --in_window;
     }
     if (i + window_half < len) {
       // Add a byte in the future.
-      ++histogram[data[i + window_half]];
+      ++histogram[data[(pos + i + window_half) & mask]];
       ++in_window;
     }
-    int histo = histogram[data[i]];
+    int masked_pos = (pos + i) & mask;
+    int histo = histogram[data[masked_pos]];
     if (histo == 0) {
       histo = 1;
     }
-    cost[i] = log2(static_cast<double>(in_window) / histo);
-    cost[i] += 0.03;
-    if (cost[i] < 1.0) {
-      cost[i] *= 0.5;
-      cost[i] += 0.5;
+    cost[masked_pos] = log2(static_cast<double>(in_window) / histo);
+    cost[masked_pos] += 0.03;
+    if (cost[masked_pos] < 1.0) {
+      cost[masked_pos] *= 0.5;
+      cost[masked_pos] += 0.5;
     }
   }
 }
diff --git a/brotli/enc/literal_cost.h b/brotli/enc/literal_cost.h
index 0dbbc3f..fd7f325 100644
--- a/brotli/enc/literal_cost.h
+++ b/brotli/enc/literal_cost.h
@@ -22,9 +22,11 @@
 
 namespace brotli {
 
-// Input: length of data, and the bytes.
-// Output: estimate of how many bits the literal will take entropy coded.
-void EstimateBitCostsForLiterals(size_t len, const uint8_t *data, float *cost);
+// Estimates how many bits the literals in the interval [pos, pos + len) in the
+// ringbuffer (data, mask) will take entropy coded and writes these estimates
+// to the ringbuffer (cost, mask).
+void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
+                                 const uint8_t *data, float *cost);
 
 }  // namespace brotli
 
diff --git a/brotli/enc/ringbuffer.h b/brotli/enc/ringbuffer.h
new file mode 100644
index 0000000..d88f2ca
--- /dev/null
+++ b/brotli/enc/ringbuffer.h
@@ -0,0 +1,89 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Sliding window over the input data.
+
+#ifndef BROTLI_ENC_RINGBUFFER_H_
+#define BROTLI_ENC_RINGBUFFER_H_
+
+// A RingBuffer(window_bits, tail_bits) contains `1 << window_bits' bytes of
+// data in a circular manner: writing a byte writes it to
+// `position() % (1 << window_bits)'. For convenience, the RingBuffer array
+// contains another copy of the first `1 << tail_bits' bytes:
+// buffer_[i] == buffer_[i + (1 << window_bits)] if i < (1 << tail_bits).
+class RingBuffer {
+ public:
+  RingBuffer(int window_bits, int tail_bits)
+      : window_bits_(window_bits), tail_bits_(tail_bits), pos_(0) {
+    static const int kSlackForThreeByteHashingEverywhere = 2;
+    const int buflen = (1 << window_bits_) + (1 << tail_bits_);
+    buffer_ = new uint8_t[buflen + kSlackForThreeByteHashingEverywhere];
+    for (int i = 0; i < kSlackForThreeByteHashingEverywhere; ++i) {
+      buffer_[buflen + i] = 0;
+    }
+  }
+  ~RingBuffer() {
+    delete [] buffer_;
+  }
+
+  // Push bytes into the ring buffer.
+  void Write(const uint8_t *bytes, size_t n) {
+    const size_t masked_pos = pos_ & ((1 << window_bits_) - 1);
+    // The length of the writes is limited so that we do not need to worry
+    // about a write
+    WriteTail(bytes, n);
+    if (masked_pos + n <= (1 << window_bits_)) {
+      // A single write fits.
+      memcpy(&buffer_[masked_pos], bytes, n);
+    } else {
+      // Split into two writes.
+      // Copy into the end of the buffer, including the tail buffer.
+      memcpy(&buffer_[masked_pos], bytes,
+             std::min(n,
+                      ((1 << window_bits_) + (1 << tail_bits_)) - masked_pos));
+      // Copy into the begining of the buffer
+      memcpy(&buffer_[0], bytes + ((1 << window_bits_) - masked_pos),
+             n - ((1 << window_bits_) - masked_pos));
+    }
+    pos_ += n;
+  }
+
+  // Logical cursor position in the ring buffer.
+  size_t position() const { return pos_; }
+
+  uint8_t *start() { return &buffer_[0]; }
+  const uint8_t *start() const { return &buffer_[0]; }
+
+ private:
+  void WriteTail(const uint8_t *bytes, size_t n) {
+    const size_t masked_pos = pos_ & ((1 << window_bits_) - 1);
+    if (masked_pos < (1 << tail_bits_)) {
+      // Just fill the tail buffer with the beginning data.
+      const size_t p = (1 << window_bits_) + masked_pos;
+      memcpy(&buffer_[p], bytes, std::min(n, (1 << tail_bits_) - masked_pos));
+    }
+  }
+
+  // Size of the ringbuffer is (1 << window_bits) + (1 << tail_bits).
+  const int window_bits_;
+  const int tail_bits_;
+
+  // Position to write in the ring buffer.
+  size_t pos_;
+  // The actual ring buffer containing the data and the copy of the beginning
+  // as a tail.
+  uint8_t *buffer_;
+};
+
+#endif  // BROTLI_ENC_RINGBUFFER_H_
author	Zoltan Szabadka <szabadka@google.com>	2013-11-15 19:02:17 +0100
committer	Zoltan Szabadka <szabadka@google.com>	2013-11-15 19:02:17 +0100
commit	1571db36a9b00e895882ee236e9f84d62f8ea226 (patch)
tree	365fdb3cbd81ab9dc863b4ec39dcba6837616b00
parent	79e99afe468407e9ff9f0820df7190cb069eabeb (diff)
download	src-1571db36a9b00e895882ee236e9f84d62f8ea226.tar.gz