Implement multi-threaded ASTC CPU decompression.

On my machine, this results in a 5x speedup with 8 threads. Average decompression speed goes from 25 to 126 Mpixels/s for Summoners Wars: Chronicle. This also makes ASTC CPU decompression 3x faster than our current compute shader approach. I measured the loading times of the Aztec Ruins benchmark, which uses a lot of ASTC decompression before starting: - Compute shader decompression: 32 seconds; - Multi-threaded CPU decompression: 10.5 seconds. Change-Id: Id0fea5e3287231f13ad05ddbfc9bcc2a3bee5c1f
author: Greg Schlomoff <gregschlom@google.com> 2022-09-21 15:49:25 -0700
committer: Greg Schlomoff <gregschlom@google.com> 2022-10-10 17:02:38 +0000
commit: 1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c (patch)
tree: 521368eba452f6abdf47a7b93188be7e25cf1677
parent: 5634045841901f4c3585fdae018fe2b2855d4aa3 (diff)
download: vulkan-cereal-1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c.tar.gz
2 files changed, 131 insertions, 45 deletions
diff --git a/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp b/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp
index 69f82b09..5bf6ffb2 100644
--- a/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp
+++ b/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp
@@ -12,15 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <array>
+#include <future>
 #include <unordered_map>
 
 #include "AstcCpuDecompressor.h"
 #include "astcenc.h"
-#include "host-common/logging.h"
 
 namespace goldfish_vk {
 namespace {
 
+constexpr uint32_t kNumThreads = 2;
+
 const astcenc_swizzle kSwizzle = {ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A};
 
 // Used by std::unique_ptr to release the context when the pointer is destroyed
@@ -30,6 +33,40 @@ struct AstcencContextDeleter {
 
 using AstcencContextUniquePtr = std::unique_ptr<astcenc_context, AstcencContextDeleter>;
 
+// Creates a new astcenc_context and wraps it in a smart pointer.
+// There is no need to call astcenc_context_free() on the returned pointer.
+// blockWidth, blockHeight: ASTC block size for the context
+// error: (output param) Where to put the error status. Must not be null.
+// Returns nullptr in case of error.
+AstcencContextUniquePtr makeDecoderContext(uint32_t blockWidth, uint32_t blockHeight,
+                                           astcenc_error* error) {
+    astcenc_config config = {};
+    *error =
+        // TODO(gregschlom): Do we need to pass ASTCENC_PRF_LDR_SRGB here?
+        astcenc_config_init(ASTCENC_PRF_LDR, blockWidth, blockHeight, 1, ASTCENC_PRE_FASTEST,
+                            ASTCENC_FLG_DECOMPRESS_ONLY, &config);
+    if (*error != ASTCENC_SUCCESS) {
+        return nullptr;
+    }
+
+    astcenc_context* context;
+    *error = astcenc_context_alloc(&config, kNumThreads, &context);
+    if (*error != ASTCENC_SUCCESS) {
+        return nullptr;
+    }
+    return AstcencContextUniquePtr(context);
+}
+
+// Returns whether the ASTC decoder can be used on this machine. It might not be available if the
+// CPU doesn't support AVX2 instructions for example. Since this call is a bit expensive and never
+// changes, the result should be cached.
+bool isAstcDecoderAvailable() {
+    astcenc_error error;
+    // Try getting an arbitrary context. If it works, the decoder is available.
+    auto context = makeDecoderContext(5, 5, &error);
+    return context != nullptr;
+}
+
 // Caches and manages astcenc_context objects.
 //
 // Each context is fairly large (around 30 MB) and takes a while to construct, so it's important to
@@ -41,23 +78,17 @@ using AstcencContextUniquePtr = std::unique_ptr<astcenc_context, AstcencContextD
 //
 // Currently, there is no eviction strategy. Each cache could grow to a maximum of ~400 MB in size
 // since they are 13 possible ASTC block sizes.
+//
+// Thread-safety: not thread safe.
 class AstcDecoderContextCache {
    public:
-    // Returns the singleton instance of this class.
-    // The singleton is thread-local: each thread gets its own instance. Having a separate cache for
-    // each thread avoids needing to synchronize access to the context objects.
-    static AstcDecoderContextCache& instance() {
-        static thread_local AstcDecoderContextCache instance;
-        return instance;
-    }
-
     // Returns a context object for a given ASTC block size, along with the error code if the
     // context initialization failed.
     // In this case, the context will be null, and the status code will be non-zero.
     std::pair<astcenc_context*, astcenc_error> get(uint32_t blockWidth, uint32_t blockHeight) {
         Value& value = mContexts[{blockWidth, blockHeight}];
-        if (value.context == nullptr && value.error == ASTCENC_SUCCESS) {
-            value = makeDecoderContext(blockWidth, blockHeight);
+        if (value.context == nullptr) {
+            value.context = makeDecoderContext(blockWidth, blockHeight, &value.error);
         }
         return {value.context.get(), value.error};
     }
@@ -86,46 +117,88 @@ class AstcDecoderContextCache {
         }
     };
 
-    // Creates a new astcenc_context and wraps it in a smart pointer, so that we don't need to
-    // manually call astcenc_context_free.
-    Value makeDecoderContext(uint32_t blockWidth, uint32_t blockHeight) const {
-        astcenc_config config = {};
-        astcenc_error status =
-            // TODO(gregschlom): Do we need to pass ASTCENC_PRF_LDR_SRGB here?
-            astcenc_config_init(ASTCENC_PRF_LDR, blockWidth, blockHeight, 1, ASTCENC_PRE_FASTEST,
-                                ASTCENC_FLG_DECOMPRESS_ONLY, &config);
-        if (status != ASTCENC_SUCCESS) {
-            WARN("ASTC decoder: astcenc_config_init() failed: %s",
-                 astcenc_get_error_string(status));
-            return {nullptr, status};
-        }
+    std::unordered_map<Key, Value, KeyHash> mContexts;
+};
+
+// Thread-safety: all public methods are thread-safe
+class WorkerThread {
+   public:
+    explicit WorkerThread() : mThread(&WorkerThread::main, this) {}
+
+    // Terminates the thread. Call wait() to wait until the thread fully exits.
+    void terminate() {
+        std::lock_guard lock(mWorkerMutex);
+        mTerminated = true;
+        mWorkerCondition.notify_one();
+    }
 
-        astcenc_context* context;
-        status = astcenc_context_alloc(&config, /*thread_count=*/1, &context);
-        if (status != ASTCENC_SUCCESS) {
-            WARN("ASTC decoder: astcenc_context_alloc() failed: %s",
-                 astcenc_get_error_string(status));
-            return {nullptr, status};
+    // Blocks until the thread exits.
+    void wait() { mThread.join(); }
+
+    std::future<astcenc_error> decompress(astcenc_context* context, uint32_t threadIndex,
+                                          const uint8_t* data, size_t dataLength,
+                                          astcenc_image* image) {
+        std::lock_guard lock(mWorkerMutex);
+        mTask = std::packaged_task<astcenc_error()>{[=] {
+            return astcenc_decompress_image(context, data, dataLength, image, &kSwizzle,
+                                            threadIndex);
+        }};
+        mWorkerCondition.notify_one();
+        return mTask.get_future();
+    }
+
+   private:
+    // Thread's main loop
+    void main() {
+        while (true) {
+            std::packaged_task<astcenc_error()> task;
+            {
+                std::unique_lock lock(mWorkerMutex);
+                mWorkerCondition.wait(lock, [this] { return mTask.valid() || mTerminated; });
+                if (mTerminated) return;
+                task = std::move(mTask);
+            }
+            task();
         }
-        return {AstcencContextUniquePtr(context), ASTCENC_SUCCESS};
     }
 
-    std::unordered_map<Key, Value, KeyHash> mContexts;
+    bool mTerminated = false;
+    std::condition_variable mWorkerCondition = {};  // Signals availability of work
+    std::mutex mWorkerMutex = {};                   // Mutex used with mWorkerCondition.
+    std::packaged_task<astcenc_error()> mTask = {};
+    std::thread mThread = {};
 };
 
 // Performs ASTC decompression of an image on the CPU
 class AstcCpuDecompressorImpl : public AstcCpuDecompressor {
    public:
+    AstcCpuDecompressorImpl()
+        : AstcCpuDecompressor(), mContextCache(std::make_unique<AstcDecoderContextCache>()) {}
+
+    ~AstcCpuDecompressorImpl() override {
+        // Stop the worker threads, otherwise the process would hang upon exit.
+        std::lock_guard global_lock(mMutex);
+        for (auto& worker : mWorkerThreads) {
+            worker.terminate();
+            worker.wait();
+        }
+    }
+
     bool available() const override {
-        // Try getting an arbitrary context. This checks that we have all the pre-requisites
-        // (e.g. the CPU supports AVX2 instructions, etc.)
-        auto [context, status] = AstcDecoderContextCache::instance().get(5, 5);
-        return status == ASTCENC_SUCCESS;
+        static bool available = isAstcDecoderAvailable();
+        return available;
     }
 
     int32_t decompress(const uint32_t imgWidth, const uint32_t imgHeight, const uint32_t blockWidth,
                        const uint32_t blockHeight, const uint8_t* astcData, size_t astcDataLength,
                        uint8_t* output) override {
+        std::array<std::future<astcenc_error>, kNumThreads> futures;
+
+        std::lock_guard global_lock(mMutex);
+
+        auto [context, context_status] = mContextCache->get(blockWidth, blockHeight);
+        if (context_status != ASTCENC_SUCCESS) return context_status;
+
         astcenc_image image = {
             .dim_x = imgWidth,
             .dim_y = imgHeight,
@@ -134,16 +207,34 @@ class AstcCpuDecompressorImpl : public AstcCpuDecompressor {
             .data = reinterpret_cast<void**>(&output),
         };
 
-        auto [context, status] = AstcDecoderContextCache::instance().get(blockWidth, blockHeight);
-        if (status != ASTCENC_SUCCESS) return status;
+        for (uint32_t i = 0; i < kNumThreads; ++i) {
+            futures[i] = mWorkerThreads[i].decompress(context, i, astcData, astcDataLength, &image);
+        }
+
+        astcenc_error result = ASTCENC_SUCCESS;
+
+        // Wait for all threads to be done
+        for (auto& future : futures) {
+            astcenc_error status = future.get();
+            if (status != ASTCENC_SUCCESS) {
+                result = status;
+            }
+        }
+
+        astcenc_decompress_reset(context);
 
-        return astcenc_decompress_image(context, astcData, astcDataLength, &image, &kSwizzle, 0);
+        return result;
     }
 
     const char* getStatusString(int32_t statusCode) const override {
         const char* msg = astcenc_get_error_string((astcenc_error)statusCode);
         return msg ? msg : "ASTCENC_UNKNOWN_STATUS";
     }
+
+   private:
+    std::unique_ptr<AstcDecoderContextCache> mContextCache;
+    std::mutex mMutex;  // Locked while calling `decompress()`
+    std::array<WorkerThread, kNumThreads> mWorkerThreads;
 };
 
 }  // namespace
@@ -153,4 +244,4 @@ AstcCpuDecompressor& AstcCpuDecompressor::get() {
     return instance;
 }
 
-}  // namespace goldfish_vk
-\ No newline at end of file
+}  // namespace goldfish_vk
diff --git a/stream-servers/compressedTextureFormats/CMakeLists.txt b/stream-servers/compressedTextureFormats/CMakeLists.txt
index 4b9c1cef..e6b8f21f 100644
--- a/stream-servers/compressedTextureFormats/CMakeLists.txt
+++ b/stream-servers/compressedTextureFormats/CMakeLists.txt
@@ -8,11 +8,6 @@ add_library(
     gfxstream-compressedTextures
     ${astc-cpu-decompressor-sources}
     etc.cpp)
-target_include_directories(
-    gfxstream-compressedTextures PUBLIC ../)
-
-target_link_libraries(gfxstream-compressedTextures PRIVATE
-    gfxstream-base.headers)
 
 if(ASTC_CPU_DECODING)
     target_link_libraries(gfxstream-compressedTextures PRIVATE astcdec-avx2-static)
author	Greg Schlomoff <gregschlom@google.com>	2022-09-21 15:49:25 -0700
committer	Greg Schlomoff <gregschlom@google.com>	2022-10-10 17:02:38 +0000
commit	1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c (patch)
tree	521368eba452f6abdf47a7b93188be7e25cf1677
parent	5634045841901f4c3585fdae018fe2b2855d4aa3 (diff)
download	vulkan-cereal-1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c.tar.gz