From ec487d3265cdc5387b13b5242045b87ea8880e76 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Thu, 11 Aug 2022 17:48:35 -0700
Subject: faster CLI decompression speed for frames with -BD4 setting

lz4frame favors the faster prefix mode
when decompressing a frame with linked blocks.

This significantly improved CLI decompression on files compressed with -BD4 setting.
On my laptop, decompressing `enwik9` went from 0.89s to 0.52s.

This improvement is only for linked blocks.
It's more visible for small block sizes.
---
 lib/lz4frame.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/lib/lz4frame.c b/lib/lz4frame.c
index cba5744e..174f9ae4 100644
--- a/lib/lz4frame.c
+++ b/lib/lz4frame.c
@@ -1310,8 +1310,7 @@ static size_t LZ4F_decodeHeader(LZ4F_dctx* dctx, const void* src, size_t srcSize
         } else {
             dctx->dStage = dstage_getSFrameSize;
             return 4;
-        }
-    }
+    }   }
 
     /* control magic number */
 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
@@ -1371,8 +1370,7 @@ static size_t LZ4F_decodeHeader(LZ4F_dctx* dctx, const void* src, size_t srcSize
     dctx->frameInfo.blockSizeID = (LZ4F_blockSizeID_t)blockSizeID;
     dctx->maxBlockSize = LZ4F_getBlockSize((LZ4F_blockSizeID_t)blockSizeID);
     if (contentSizeFlag)
-        dctx->frameRemainingSize =
-            dctx->frameInfo.contentSize = LZ4F_readLE64(srcPtr+6);
+        dctx->frameRemainingSize = dctx->frameInfo.contentSize = LZ4F_readLE64(srcPtr+6);
     if (dictIDFlag)
         dctx->frameInfo.dictID = LZ4F_readLE32(srcPtr + frameHeaderSize - 5);
 
@@ -1467,16 +1465,14 @@ LZ4F_errorCode_t LZ4F_getFrameInfo(LZ4F_dctx* dctx,
 
 /* LZ4F_updateDict() :
  * only used for LZ4F_blockLinked mode
- * Condition : dstPtr != NULL
+ * Condition : @dstPtr != NULL
  */
 static void LZ4F_updateDict(LZ4F_dctx* dctx,
                       const BYTE* dstPtr, size_t dstSize, const BYTE* dstBufferStart,
                       unsigned withinTmp)
 {
     assert(dstPtr != NULL);
-    if (dctx->dictSize==0) {
-        dctx->dict = (const BYTE*)dstPtr;   /* priority to prefix mode */
-    }
+    if (dctx->dictSize==0) dctx->dict = (const BYTE*)dstPtr;  /* will lead to prefix mode */
     assert(dctx->dict != NULL);
 
     if (dctx->dict + dctx->dictSize == dstPtr) {  /* prefix mode, everything within dstBuffer */
@@ -1813,7 +1809,10 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
             }
 
             /* At this stage, input is large enough to decode a block */
+
+            /* First, decode and control block checksum if it exists */
             if (dctx->frameInfo.blockChecksumFlag) {
+                assert(dctx->tmpInTarget >= 4);
                 dctx->tmpInTarget -= 4;
                 assert(selectedIn != NULL);  /* selectedIn is defined at this stage (either srcPtr, or dctx->tmpIn) */
                 {   U32 const readBlockCrc = LZ4F_readLE32(selectedIn + dctx->tmpInTarget);
@@ -1826,17 +1825,22 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
 #endif
             }   }
 
-            if ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize) {
+            /* decode directly into destination buffer if there is enough room */
+            if ( ((size_t)(dstEnd-dstPtr) >= dctx->maxBlockSize)
+                 /* unless the dictionary is stored in tmpOut:
+                  * in which case it's faster to decode within tmpOut
+                  * to benefit from prefix speedup */
+              && !(dctx->dict!= NULL && (const BYTE*)dctx->dict + dctx->dictSize == dctx->tmpOut) )
+            {
                 const char* dict = (const char*)dctx->dict;
                 size_t dictSize = dctx->dictSize;
                 int decodedSize;
                 assert(dstPtr != NULL);
                 if (dict && dictSize > 1 GB) {
-                    /* the dictSize param is an int, avoid truncation / sign issues */
+                    /* overflow control : dctx->dictSize is an int, avoid truncation / sign issues */
                     dict += dictSize - 64 KB;
                     dictSize = 64 KB;
                 }
-                /* enough capacity in `dst` to decompress directly there */
                 decodedSize = LZ4_decompress_safe_usingDict(
                         (const char*)selectedIn, (char*)dstPtr,
                         (int)dctx->tmpInTarget, (int)dctx->maxBlockSize,
@@ -1853,25 +1857,27 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
                 }
 
                 dstPtr += decodedSize;
-                dctx->dStage = dstage_getBlockHeader;
+                dctx->dStage = dstage_getBlockHeader;  /* end of block, let's get another one */
                 break;
             }
 
             /* not enough place into dst : decode into tmpOut */
-            /* ensure enough place for tmpOut */
+
+            /* manage dictionary */
             if (dctx->frameInfo.blockMode == LZ4F_blockLinked) {
                 if (dctx->dict == dctx->tmpOutBuffer) {
+                    /* truncate dictionary to 64 KB if too big */
                     if (dctx->dictSize > 128 KB) {
                         memcpy(dctx->tmpOutBuffer, dctx->dict + dctx->dictSize - 64 KB, 64 KB);
                         dctx->dictSize = 64 KB;
                     }
                     dctx->tmpOut = dctx->tmpOutBuffer + dctx->dictSize;
-                } else {  /* dict not within tmp */
+                } else {  /* dict not within tmpOut */
                     size_t const reservedDictSpace = MIN(dctx->dictSize, 64 KB);
                     dctx->tmpOut = dctx->tmpOutBuffer + reservedDictSpace;
             }   }
 
-            /* Decode block */
+            /* Decode block into tmpOut */
             {   const char* dict = (const char*)dctx->dict;
                 size_t dictSize = dctx->dictSize;
                 int decodedSize;
@@ -2014,7 +2020,7 @@ size_t LZ4F_decompress(LZ4F_dctx* dctx,
         }   /* switch (dctx->dStage) */
     }   /* while (doAnotherStage) */
 
-    /* preserve history within tmp whenever necessary */
+    /* preserve history within tmpOut whenever necessary */
     LZ4F_STATIC_ASSERT((unsigned)dstage_init == 2);
     if ( (dctx->frameInfo.blockMode==LZ4F_blockLinked)  /* next block will use up to 64KB from previous ones */
       && (dctx->dict != dctx->tmpOutBuffer)             /* dictionary is not already within tmp */
-- 
cgit v1.2.3