summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGreg Schlomoff <gregschlom@google.com>2022-09-21 15:49:25 -0700
committerGreg Schlomoff <gregschlom@google.com>2022-10-10 17:02:38 +0000
commit1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c (patch)
tree521368eba452f6abdf47a7b93188be7e25cf1677
parent5634045841901f4c3585fdae018fe2b2855d4aa3 (diff)
downloadvulkan-cereal-1c3fe20fc7f7e67d4c53b3906cb9487f64eeff5c.tar.gz
Implement multi-threaded ASTC CPU decompression.
On my machine, this results in a 5x speedup with 8 threads. Average decompression speed goes from 25 to 126 Mpixels/s for Summoners War: Chronicles. This also makes ASTC CPU decompression 3x faster than our current compute shader approach. I measured the loading times of the Aztec Ruins benchmark, which performs a lot of ASTC decompression before starting: compute shader decompression takes 32 seconds; multi-threaded CPU decompression takes 10.5 seconds. Change-Id: Id0fea5e3287231f13ad05ddbfc9bcc2a3bee5c1f
-rw-r--r--stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp171
-rw-r--r--stream-servers/compressedTextureFormats/CMakeLists.txt5
2 files changed, 131 insertions, 45 deletions
diff --git a/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp b/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp
index 69f82b09..5bf6ffb2 100644
--- a/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp
+++ b/stream-servers/compressedTextureFormats/AstcCpuDecompressorImpl.cpp
@@ -12,15 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <array>
+#include <future>
#include <unordered_map>
#include "AstcCpuDecompressor.h"
#include "astcenc.h"
-#include "host-common/logging.h"
namespace goldfish_vk {
namespace {
+constexpr uint32_t kNumThreads = 2;
+
const astcenc_swizzle kSwizzle = {ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A};
// Used by std::unique_ptr to release the context when the pointer is destroyed
@@ -30,6 +33,40 @@ struct AstcencContextDeleter {
using AstcencContextUniquePtr = std::unique_ptr<astcenc_context, AstcencContextDeleter>;
+// Creates a new astcenc_context and wraps it in a smart pointer.
+// There is no need to call astcenc_context_free() on the returned pointer.
+// blockWidth, blockHeight: ASTC block dimensions for the context
+// error: (output param) Where to put the error status. Must not be null.
+// Returns nullptr in case of error.
+AstcencContextUniquePtr makeDecoderContext(uint32_t blockWidth, uint32_t blockHeight,
+ astcenc_error* error) {
+ astcenc_config config = {};
+ *error =
+ // TODO(gregschlom): Do we need to pass ASTCENC_PRF_LDR_SRGB here?
+ astcenc_config_init(ASTCENC_PRF_LDR, blockWidth, blockHeight, 1, ASTCENC_PRE_FASTEST,
+ ASTCENC_FLG_DECOMPRESS_ONLY, &config);
+ if (*error != ASTCENC_SUCCESS) {
+ return nullptr;
+ }
+
+ astcenc_context* context;
+ *error = astcenc_context_alloc(&config, kNumThreads, &context);
+ if (*error != ASTCENC_SUCCESS) {
+ return nullptr;
+ }
+ return AstcencContextUniquePtr(context);
+}
+
+// Returns whether the ASTC decoder can be used on this machine. It might not be available if the
+// CPU doesn't support AVX2 instructions for example. Since this call is a bit expensive and never
+// changes, the result should be cached.
+bool isAstcDecoderAvailable() {
+ astcenc_error error;
+ // Try getting an arbitrary context. If it works, the decoder is available.
+ auto context = makeDecoderContext(5, 5, &error);
+ return context != nullptr;
+}
+
// Caches and manages astcenc_context objects.
//
// Each context is fairly large (around 30 MB) and takes a while to construct, so it's important to
@@ -41,23 +78,17 @@ using AstcencContextUniquePtr = std::unique_ptr<astcenc_context, AstcencContextD
//
// Currently, there is no eviction strategy. Each cache could grow to a maximum of ~400 MB in size
// since there are 13 possible ASTC block sizes.
+//
+// Thread-safety: not thread safe.
class AstcDecoderContextCache {
public:
- // Returns the singleton instance of this class.
- // The singleton is thread-local: each thread gets its own instance. Having a separate cache for
- // each thread avoids needing to synchronize access to the context objects.
- static AstcDecoderContextCache& instance() {
- static thread_local AstcDecoderContextCache instance;
- return instance;
- }
-
// Returns a context object for a given ASTC block size, along with the error code if the
// context initialization failed.
// In this case, the context will be null, and the status code will be non-zero.
std::pair<astcenc_context*, astcenc_error> get(uint32_t blockWidth, uint32_t blockHeight) {
Value& value = mContexts[{blockWidth, blockHeight}];
- if (value.context == nullptr && value.error == ASTCENC_SUCCESS) {
- value = makeDecoderContext(blockWidth, blockHeight);
+ if (value.context == nullptr) {
+ value.context = makeDecoderContext(blockWidth, blockHeight, &value.error);
}
return {value.context.get(), value.error};
}
@@ -86,46 +117,88 @@ class AstcDecoderContextCache {
}
};
- // Creates a new astcenc_context and wraps it in a smart pointer, so that we don't need to
- // manually call astcenc_context_free.
- Value makeDecoderContext(uint32_t blockWidth, uint32_t blockHeight) const {
- astcenc_config config = {};
- astcenc_error status =
- // TODO(gregschlom): Do we need to pass ASTCENC_PRF_LDR_SRGB here?
- astcenc_config_init(ASTCENC_PRF_LDR, blockWidth, blockHeight, 1, ASTCENC_PRE_FASTEST,
- ASTCENC_FLG_DECOMPRESS_ONLY, &config);
- if (status != ASTCENC_SUCCESS) {
- WARN("ASTC decoder: astcenc_config_init() failed: %s",
- astcenc_get_error_string(status));
- return {nullptr, status};
- }
+ std::unordered_map<Key, Value, KeyHash> mContexts;
+};
+
+// Thread-safety: all public methods are thread-safe
+class WorkerThread {
+ public:
+ explicit WorkerThread() : mThread(&WorkerThread::main, this) {}
+
+ // Terminates the thread. Call wait() to wait until the thread fully exits.
+ void terminate() {
+ std::lock_guard lock(mWorkerMutex);
+ mTerminated = true;
+ mWorkerCondition.notify_one();
+ }
- astcenc_context* context;
- status = astcenc_context_alloc(&config, /*thread_count=*/1, &context);
- if (status != ASTCENC_SUCCESS) {
- WARN("ASTC decoder: astcenc_context_alloc() failed: %s",
- astcenc_get_error_string(status));
- return {nullptr, status};
+ // Blocks until the thread exits.
+ void wait() { mThread.join(); }
+
+ std::future<astcenc_error> decompress(astcenc_context* context, uint32_t threadIndex,
+ const uint8_t* data, size_t dataLength,
+ astcenc_image* image) {
+ std::lock_guard lock(mWorkerMutex);
+ mTask = std::packaged_task<astcenc_error()>{[=] {
+ return astcenc_decompress_image(context, data, dataLength, image, &kSwizzle,
+ threadIndex);
+ }};
+ mWorkerCondition.notify_one();
+ return mTask.get_future();
+ }
+
+ private:
+ // Thread's main loop
+ void main() {
+ while (true) {
+ std::packaged_task<astcenc_error()> task;
+ {
+ std::unique_lock lock(mWorkerMutex);
+ mWorkerCondition.wait(lock, [this] { return mTask.valid() || mTerminated; });
+ if (mTerminated) return;
+ task = std::move(mTask);
+ }
+ task();
}
- return {AstcencContextUniquePtr(context), ASTCENC_SUCCESS};
}
- std::unordered_map<Key, Value, KeyHash> mContexts;
+ bool mTerminated = false;
+ std::condition_variable mWorkerCondition = {}; // Signals availability of work
+ std::mutex mWorkerMutex = {}; // Mutex used with mWorkerCondition.
+ std::packaged_task<astcenc_error()> mTask = {};
+ std::thread mThread = {};
};
// Performs ASTC decompression of an image on the CPU
class AstcCpuDecompressorImpl : public AstcCpuDecompressor {
public:
+ AstcCpuDecompressorImpl()
+ : AstcCpuDecompressor(), mContextCache(std::make_unique<AstcDecoderContextCache>()) {}
+
+ ~AstcCpuDecompressorImpl() override {
+ // Stop the worker threads, otherwise the process would hang upon exit.
+ std::lock_guard global_lock(mMutex);
+ for (auto& worker : mWorkerThreads) {
+ worker.terminate();
+ worker.wait();
+ }
+ }
+
bool available() const override {
- // Try getting an arbitrary context. This checks that we have all the pre-requisites
- // (e.g. the CPU supports AVX2 instructions, etc.)
- auto [context, status] = AstcDecoderContextCache::instance().get(5, 5);
- return status == ASTCENC_SUCCESS;
+ static bool available = isAstcDecoderAvailable();
+ return available;
}
int32_t decompress(const uint32_t imgWidth, const uint32_t imgHeight, const uint32_t blockWidth,
const uint32_t blockHeight, const uint8_t* astcData, size_t astcDataLength,
uint8_t* output) override {
+ std::array<std::future<astcenc_error>, kNumThreads> futures;
+
+ std::lock_guard global_lock(mMutex);
+
+ auto [context, context_status] = mContextCache->get(blockWidth, blockHeight);
+ if (context_status != ASTCENC_SUCCESS) return context_status;
+
astcenc_image image = {
.dim_x = imgWidth,
.dim_y = imgHeight,
@@ -134,16 +207,34 @@ class AstcCpuDecompressorImpl : public AstcCpuDecompressor {
.data = reinterpret_cast<void**>(&output),
};
- auto [context, status] = AstcDecoderContextCache::instance().get(blockWidth, blockHeight);
- if (status != ASTCENC_SUCCESS) return status;
+ for (uint32_t i = 0; i < kNumThreads; ++i) {
+ futures[i] = mWorkerThreads[i].decompress(context, i, astcData, astcDataLength, &image);
+ }
+
+ astcenc_error result = ASTCENC_SUCCESS;
+
+ // Wait for all threads to be done
+ for (auto& future : futures) {
+ astcenc_error status = future.get();
+ if (status != ASTCENC_SUCCESS) {
+ result = status;
+ }
+ }
+
+ astcenc_decompress_reset(context);
- return astcenc_decompress_image(context, astcData, astcDataLength, &image, &kSwizzle, 0);
+ return result;
}
const char* getStatusString(int32_t statusCode) const override {
const char* msg = astcenc_get_error_string((astcenc_error)statusCode);
return msg ? msg : "ASTCENC_UNKNOWN_STATUS";
}
+
+ private:
+ std::unique_ptr<AstcDecoderContextCache> mContextCache;
+ std::mutex mMutex; // Locked while calling `decompress()`
+ std::array<WorkerThread, kNumThreads> mWorkerThreads;
};
} // namespace
@@ -153,4 +244,4 @@ AstcCpuDecompressor& AstcCpuDecompressor::get() {
return instance;
}
-} // namespace goldfish_vk \ No newline at end of file
+} // namespace goldfish_vk
diff --git a/stream-servers/compressedTextureFormats/CMakeLists.txt b/stream-servers/compressedTextureFormats/CMakeLists.txt
index 4b9c1cef..e6b8f21f 100644
--- a/stream-servers/compressedTextureFormats/CMakeLists.txt
+++ b/stream-servers/compressedTextureFormats/CMakeLists.txt
@@ -8,11 +8,6 @@ add_library(
gfxstream-compressedTextures
${astc-cpu-decompressor-sources}
etc.cpp)
-target_include_directories(
- gfxstream-compressedTextures PUBLIC ../)
-
-target_link_libraries(gfxstream-compressedTextures PRIVATE
- gfxstream-base.headers)
if(ASTC_CPU_DECODING)
target_link_libraries(gfxstream-compressedTextures PRIVATE astcdec-avx2-static)