diff options
author | Greg Schlomoff <gregschlom@google.com> | 2022-09-21 15:49:25 -0700 |
---|---|---|
committer | Greg Schlomoff <gregschlom@google.com> | 2022-10-10 17:02:38 +0000 |
commit | 1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c (patch) | |
tree | 521368eba452f6abdf47a7b93188be7e25cf1677 | |
parent | 5634045841901f4c3585fdae018fe2b2855d4aa3 (diff) | |
download | vulkan-cereal-1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c.tar.gz |
Implement multi-threaded ASTC CPU decompression.
On my machine, this results in a 5x speedup with 8 threads. Average decompression speed goes from 25 to 126 Mpixels/s for Summoners Wars: Chronicle.
This also makes ASTC CPU decompression 3x faster than our current compute shader approach. I measured the loading times of the Aztec Ruins benchmark, which uses a lot of ASTC decompression before starting:
- Compute shader decompression: 32 seconds
- Multi-threaded CPU decompression: 10.5 seconds
Change-Id: Id0fea5e3287231f13ad05ddbfc9bcc2a3bee5c1f
-rw-r--r-- | stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp | 171 | ||||
-rw-r--r-- | stream-servers/compressedTextureFormats/CMakeLists.txt | 5 |
2 files changed, 131 insertions, 45 deletions
diff --git a/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp b/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp index 69f82b09..5bf6ffb2 100644 --- a/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp +++ b/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp @@ -12,15 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include <array> +#include <future> #include <unordered_map> #include "AstcCpuDecompressor.h" #include "astcenc.h" -#include "host-common/logging.h" namespace goldfish_vk { namespace { +constexpr uint32_t kNumThreads = 2; + const astcenc_swizzle kSwizzle = {ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A}; // Used by std::unique_ptr to release the context when the pointer is destroyed @@ -30,6 +33,40 @@ struct AstcencContextDeleter { using AstcencContextUniquePtr = std::unique_ptr<astcenc_context, AstcencContextDeleter>; +// Creates a new astcenc_context and wraps it in a smart pointer. +// It is not needed to call astcenc_context_free() on the returned pointer. +// blockWith, blockSize: ASTC block size for the context +// Error: (output param) Where to put the error status. Must not be null. +// Returns nullptr in case of error. +AstcencContextUniquePtr makeDecoderContext(uint32_t blockWidth, uint32_t blockHeight, + astcenc_error* error) { + astcenc_config config = {}; + *error = + // TODO(gregschlom): Do we need to pass ASTCENC_PRF_LDR_SRGB here? + astcenc_config_init(ASTCENC_PRF_LDR, blockWidth, blockHeight, 1, ASTCENC_PRE_FASTEST, + ASTCENC_FLG_DECOMPRESS_ONLY, &config); + if (*error != ASTCENC_SUCCESS) { + return nullptr; + } + + astcenc_context* context; + *error = astcenc_context_alloc(&config, kNumThreads, &context); + if (*error != ASTCENC_SUCCESS) { + return nullptr; + } + return AstcencContextUniquePtr(context); +} + +// Returns whether the ASTC decoder can be used on this machine. 
It might not be available if the +// CPU doesn't support AVX2 instructions for example. Since this call is a bit expensive and never +// changes, the result should be cached. +bool isAstcDecoderAvailable() { + astcenc_error error; + // Try getting an arbitrary context. If it works, the decoder is available. + auto context = makeDecoderContext(5, 5, &error); + return context != nullptr; +} + // Caches and manages astcenc_context objects. // // Each context is fairly large (around 30 MB) and takes a while to construct, so it's important to @@ -41,23 +78,17 @@ using AstcencContextUniquePtr = std::unique_ptr<astcenc_context, AstcencContextD // // Currently, there is no eviction strategy. Each cache could grow to a maximum of ~400 MB in size // since they are 13 possible ASTC block sizes. +// +// Thread-safety: not thread safe. class AstcDecoderContextCache { public: - // Returns the singleton instance of this class. - // The singleton is thread-local: each thread gets its own instance. Having a separate cache for - // each thread avoids needing to synchronize access to the context objects. - static AstcDecoderContextCache& instance() { - static thread_local AstcDecoderContextCache instance; - return instance; - } - // Returns a context object for a given ASTC block size, along with the error code if the // context initialization failed. // In this case, the context will be null, and the status code will be non-zero. 
std::pair<astcenc_context*, astcenc_error> get(uint32_t blockWidth, uint32_t blockHeight) { Value& value = mContexts[{blockWidth, blockHeight}]; - if (value.context == nullptr && value.error == ASTCENC_SUCCESS) { - value = makeDecoderContext(blockWidth, blockHeight); + if (value.context == nullptr) { + value.context = makeDecoderContext(blockWidth, blockHeight, &value.error); } return {value.context.get(), value.error}; } @@ -86,46 +117,88 @@ class AstcDecoderContextCache { } }; - // Creates a new astcenc_context and wraps it in a smart pointer, so that we don't need to - // manually call astcenc_context_free. - Value makeDecoderContext(uint32_t blockWidth, uint32_t blockHeight) const { - astcenc_config config = {}; - astcenc_error status = - // TODO(gregschlom): Do we need to pass ASTCENC_PRF_LDR_SRGB here? - astcenc_config_init(ASTCENC_PRF_LDR, blockWidth, blockHeight, 1, ASTCENC_PRE_FASTEST, - ASTCENC_FLG_DECOMPRESS_ONLY, &config); - if (status != ASTCENC_SUCCESS) { - WARN("ASTC decoder: astcenc_config_init() failed: %s", - astcenc_get_error_string(status)); - return {nullptr, status}; - } + std::unordered_map<Key, Value, KeyHash> mContexts; +}; + +// Thread-safety: all public methods are thread-safe +class WorkerThread { + public: + explicit WorkerThread() : mThread(&WorkerThread::main, this) {} + + // Terminates the thread. Call wait() to wait until the thread fully exits. + void terminate() { + std::lock_guard lock(mWorkerMutex); + mTerminated = true; + mWorkerCondition.notify_one(); + } - astcenc_context* context; - status = astcenc_context_alloc(&config, /*thread_count=*/1, &context); - if (status != ASTCENC_SUCCESS) { - WARN("ASTC decoder: astcenc_context_alloc() failed: %s", - astcenc_get_error_string(status)); - return {nullptr, status}; + // Blocks until the thread exits. 
+ void wait() { mThread.join(); } + + std::future<astcenc_error> decompress(astcenc_context* context, uint32_t threadIndex, + const uint8_t* data, size_t dataLength, + astcenc_image* image) { + std::lock_guard lock(mWorkerMutex); + mTask = std::packaged_task<astcenc_error()>{[=] { + return astcenc_decompress_image(context, data, dataLength, image, &kSwizzle, + threadIndex); + }}; + mWorkerCondition.notify_one(); + return mTask.get_future(); + } + + private: + // Thread's main loop + void main() { + while (true) { + std::packaged_task<astcenc_error()> task; + { + std::unique_lock lock(mWorkerMutex); + mWorkerCondition.wait(lock, [this] { return mTask.valid() || mTerminated; }); + if (mTerminated) return; + task = std::move(mTask); + } + task(); } - return {AstcencContextUniquePtr(context), ASTCENC_SUCCESS}; } - std::unordered_map<Key, Value, KeyHash> mContexts; + bool mTerminated = false; + std::condition_variable mWorkerCondition = {}; // Signals availability of work + std::mutex mWorkerMutex = {}; // Mutex used with mWorkerCondition. + std::packaged_task<astcenc_error()> mTask = {}; + std::thread mThread = {}; }; // Performs ASTC decompression of an image on the CPU class AstcCpuDecompressorImpl : public AstcCpuDecompressor { public: + AstcCpuDecompressorImpl() + : AstcCpuDecompressor(), mContextCache(std::make_unique<AstcDecoderContextCache>()) {} + + ~AstcCpuDecompressorImpl() override { + // Stop the worker threads, otherwise the process would hang upon exit. + std::lock_guard global_lock(mMutex); + for (auto& worker : mWorkerThreads) { + worker.terminate(); + worker.wait(); + } + } + bool available() const override { - // Try getting an arbitrary context. This checks that we have all the pre-requisites - // (e.g. the CPU supports AVX2 instructions, etc.) 
- auto [context, status] = AstcDecoderContextCache::instance().get(5, 5); - return status == ASTCENC_SUCCESS; + static bool available = isAstcDecoderAvailable(); + return available; } int32_t decompress(const uint32_t imgWidth, const uint32_t imgHeight, const uint32_t blockWidth, const uint32_t blockHeight, const uint8_t* astcData, size_t astcDataLength, uint8_t* output) override { + std::array<std::future<astcenc_error>, kNumThreads> futures; + + std::lock_guard global_lock(mMutex); + + auto [context, context_status] = mContextCache->get(blockWidth, blockHeight); + if (context_status != ASTCENC_SUCCESS) return context_status; + astcenc_image image = { .dim_x = imgWidth, .dim_y = imgHeight, @@ -134,16 +207,34 @@ class AstcCpuDecompressorImpl : public AstcCpuDecompressor { .data = reinterpret_cast<void**>(&output), }; - auto [context, status] = AstcDecoderContextCache::instance().get(blockWidth, blockHeight); - if (status != ASTCENC_SUCCESS) return status; + for (uint32_t i = 0; i < kNumThreads; ++i) { + futures[i] = mWorkerThreads[i].decompress(context, i, astcData, astcDataLength, &image); + } + + astcenc_error result = ASTCENC_SUCCESS; + + // Wait for all threads to be done + for (auto& future : futures) { + astcenc_error status = future.get(); + if (status != ASTCENC_SUCCESS) { + result = status; + } + } + + astcenc_decompress_reset(context); - return astcenc_decompress_image(context, astcData, astcDataLength, &image, &kSwizzle, 0); + return result; } const char* getStatusString(int32_t statusCode) const override { const char* msg = astcenc_get_error_string((astcenc_error)statusCode); return msg ? msg : "ASTCENC_UNKNOWN_STATUS"; } + + private: + std::unique_ptr<AstcDecoderContextCache> mContextCache; + std::mutex mMutex; // Locked while calling `decompress()` + std::array<WorkerThread, kNumThreads> mWorkerThreads; }; } // namespace @@ -153,4 +244,4 @@ AstcCpuDecompressor& AstcCpuDecompressor::get() { return instance; } -} // namespace goldfish_vk
\ No newline at end of file +} // namespace goldfish_vk diff --git a/stream-servers/compressedTextureFormats/CMakeLists.txt b/stream-servers/compressedTextureFormats/CMakeLists.txt index 4b9c1cef..e6b8f21f 100644 --- a/stream-servers/compressedTextureFormats/CMakeLists.txt +++ b/stream-servers/compressedTextureFormats/CMakeLists.txt @@ -8,11 +8,6 @@ add_library( gfxstream-compressedTextures ${astc-cpu-decompressor-sources} etc.cpp) -target_include_directories( - gfxstream-compressedTextures PUBLIC ../) - -target_link_libraries(gfxstream-compressedTextures PRIVATE - gfxstream-base.headers) if(ASTC_CPU_DECODING) target_link_libraries(gfxstream-compressedTextures PRIVATE astcdec-avx2-static) |