diff options
author | Austin Annestrand <a.annestrand@samsung.com> | 2024-04-03 17:46:38 -0500 |
---|---|---|
committer | Angle LUCI CQ <angle-scoped@luci-project-accounts.iam.gserviceaccount.com> | 2024-04-18 20:36:23 +0000 |
commit | d4abe62268f302e0df2d51cb655408a81361aa38 (patch) | |
tree | 906fc611f765676d54e306c58b8dbe747f8a90f5 | |
parent | 4813295059014a39fb75d6a9dd031debb079c69e (diff) | |
download | angle-d4abe62268f302e0df2d51cb655408a81361aa38.tar.gz |
CL/VK: Implement enqueue NDRangeKernel & Task
Adding support for:
clEnqueueNDRangeKernel
clEnqueueTask
Bug: angleproject:8631
Change-Id: If57002be3ea00a55215e89ca47ab8fe9a422c6e7
Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/5406614
Reviewed-by: Shahbaz Youssefi <syoussefi@chromium.org>
Commit-Queue: Austin Annestrand <a.annestrand@samsung.com>
Reviewed-by: Geoff Lang <geofflang@chromium.org>
-rw-r--r-- | src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp | 280 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/CLCommandQueueVk.h | 31 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/CLKernelVk.cpp | 90 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/CLKernelVk.h | 15 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/CLProgramVk.cpp | 4 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/CLProgramVk.h | 15 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp | 2 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/UtilsVk.cpp | 2 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/vk_helpers.cpp | 8 | ||||
-rw-r--r-- | src/libANGLE/renderer/vulkan/vk_helpers.h | 4 | ||||
-rw-r--r-- | src/libANGLE/validationCL.cpp | 20 |
11 files changed, 433 insertions, 38 deletions
diff --git a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp index 258acb1c76..33f0de41f4 100644 --- a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp +++ b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp @@ -8,12 +8,21 @@ #include "libANGLE/renderer/vulkan/CLCommandQueueVk.h" #include "libANGLE/renderer/vulkan/CLContextVk.h" #include "libANGLE/renderer/vulkan/CLDeviceVk.h" +#include "libANGLE/renderer/vulkan/CLKernelVk.h" +#include "libANGLE/renderer/vulkan/CLMemoryVk.h" +#include "libANGLE/renderer/vulkan/CLProgramVk.h" +#include "libANGLE/renderer/vulkan/cl_types.h" #include "libANGLE/renderer/vulkan/vk_renderer.h" +#include "libANGLE/CLBuffer.h" #include "libANGLE/CLCommandQueue.h" #include "libANGLE/CLContext.h" +#include "libANGLE/CLEvent.h" +#include "libANGLE/CLKernel.h" #include "libANGLE/cl_utils.h" +#include "spirv/unified1/NonSemanticClspvReflection.h" + namespace rx { @@ -21,18 +30,10 @@ CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue) : CLCommandQueueImpl(commandQueue), mContext(&commandQueue.getContext().getImpl<CLContextVk>()), mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()), - mComputePassCommands(nullptr) + mComputePassCommands(nullptr), + mCurrentQueueSerialIndex(kInvalidQueueSerialIndex) {} -CLCommandQueueVk::~CLCommandQueueVk() -{ - VkDevice vkDevice = mContext->getDevice(); - - // Recycle the current command buffers - mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands); - mCommandPool.outsideRenderPassPool.destroy(vkDevice); -} - angle::Result CLCommandQueueVk::init() { ANGLE_CL_IMPL_TRY_ERROR( @@ -46,9 +47,36 @@ angle::Result CLCommandQueueVk::init() &mOutsideRenderPassCommandsAllocator, &mComputePassCommands), CL_OUT_OF_RESOURCES); + // Generate initial QueueSerial for command buffer helper + ANGLE_CL_IMPL_TRY_ERROR( + mContext->getRenderer()->allocateQueueSerialIndex(&mCurrentQueueSerialIndex), + CL_OUT_OF_RESOURCES); + mComputePassCommands->setQueueSerial( + mCurrentQueueSerialIndex, + mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex)); + + // Initialize serials to be valid but appear submitted and finished. + mLastFlushedQueueSerial = QueueSerial(mCurrentQueueSerialIndex, Serial()); + mLastSubmittedQueueSerial = mLastFlushedQueueSerial; + return angle::Result::Continue; } +CLCommandQueueVk::~CLCommandQueueVk() +{ + VkDevice vkDevice = mContext->getDevice(); + + if (mCurrentQueueSerialIndex != kInvalidQueueSerialIndex) + { + mContext->getRenderer()->releaseQueueSerialIndex(mCurrentQueueSerialIndex); + mCurrentQueueSerialIndex = kInvalidQueueSerialIndex; + } + + // Recycle the current command buffers + mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands); + mCommandPool.outsideRenderPassPool.destroy(vkDevice); +} + angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable) { // NOTE: "clSetCommandQueueProperty" has been deprecated as of OpenCL 1.1 @@ -280,16 +308,41 @@ angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel, const cl::EventPtrs &waitEvents, CLEventImpl::CreateFunc *eventCreateFunc) { - UNIMPLEMENTED(); - ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); + std::scoped_lock<std::mutex> sl(mCommandQueueMutex); + + ANGLE_TRY(processWaitlist(waitEvents)); + + cl::WorkgroupCount workgroupCount; + vk::PipelineCacheAccess pipelineCache; + vk::PipelineHelper *pipelineHelper = nullptr; + CLKernelVk &kernelImpl = kernel.getImpl<CLKernelVk>(); + + ANGLE_TRY(processKernelResources(kernelImpl, ndrange)); + + // Fetch or create compute pipeline (if we miss in cache) + ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache), + CL_OUT_OF_RESOURCES); + ANGLE_TRY(kernelImpl.getOrCreateComputePipeline( + &pipelineCache, ndrange, mCommandQueue.getDevice(), &pipelineHelper, &workgroupCount)); + + mComputePassCommands->retainResource(pipelineHelper); + mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline()); + mComputePassCommands->getCommandBuffer().dispatch(workgroupCount[0], workgroupCount[1], + workgroupCount[2]); + + ANGLE_TRY(createEvent(eventCreateFunc)); + + return angle::Result::Continue; } angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel, const cl::EventPtrs &waitEvents, CLEventImpl::CreateFunc *eventCreateFunc) { - UNIMPLEMENTED(); - ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); + constexpr size_t globalWorkSize[3] = {1, 0, 0}; + constexpr size_t localWorkSize[3] = {1, 0, 0}; + cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize); + return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc); } angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc, @@ -348,4 +401,203 @@ angle::Result CLCommandQueueVk::finish() ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); } +angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk, + const cl::NDRange &ndrange) +{ + bool needsBarrier = false; + UpdateDescriptorSetsBuilder updateDescriptorSetsBuilder; + const CLProgramVk::DeviceProgramData *devProgramData = + kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative()); + ASSERT(devProgramData != nullptr); + + // Allocate descriptor set + VkDescriptorSet descriptorSet{VK_NULL_HANDLE}; + ANGLE_TRY(kernelVk.getProgram()->allocateDescriptorSet( + kernelVk.getDescriptorSetLayouts()[DescriptorSetIndex::ShaderResource].get(), + &descriptorSet)); + + // Push global offset data + const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange(); + if (globalOffsetRange != nullptr) + { + mComputePassCommands->getCommandBuffer().pushConstants( + kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT, + globalOffsetRange->offset, globalOffsetRange->size, ndrange.globalWorkOffset.data()); + } + + // Push global size data + const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange(); + if (globalSizeRange != nullptr) + { + mComputePassCommands->getCommandBuffer().pushConstants( + kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT, + globalSizeRange->offset, globalSizeRange->size, ndrange.globalWorkSize.data()); + } + + // Process each kernel argument/resource + for (const auto &arg : kernelVk.getArgs()) + { + switch (arg.type) + { + case NonSemanticClspvReflectionArgumentUniform: + case NonSemanticClspvReflectionArgumentStorageBuffer: + { + cl::Memory *clMem = cl::Buffer::Cast(*static_cast<const cl_mem *>(arg.handle)); + CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>(); + + // Retain this resource until its associated dispatch completes + mMemoryCaptures.emplace_back(clMem); + + // Handle possible resource RAW hazard + if (arg.type != NonSemanticClspvReflectionArgumentUniform) + { + if (mDependencyTracker.contains(clMem) || + mDependencyTracker.size() == kMaxDependencyTrackerSize) + { + needsBarrier = true; + mDependencyTracker.clear(); + } + mDependencyTracker.insert(clMem); + } + + // Update buffer/descriptor info + VkDescriptorBufferInfo &bufferInfo = + updateDescriptorSetsBuilder.allocDescriptorBufferInfo(); + bufferInfo.range = clMem->getSize(); + bufferInfo.offset = clMem->getOffset(); + bufferInfo.buffer = vkMem.isSubBuffer() + ? vkMem.getParent()->getBuffer().getBuffer().getHandle() + : vkMem.getBuffer().getBuffer().getHandle(); + VkWriteDescriptorSet &writeDescriptorSet = + updateDescriptorSetsBuilder.allocWriteDescriptorSet(); + writeDescriptorSet.descriptorCount = 1; + writeDescriptorSet.descriptorType = + arg.type == NonSemanticClspvReflectionArgumentUniform + ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER + : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writeDescriptorSet.pBufferInfo = &bufferInfo; + writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSet.dstSet = descriptorSet; + writeDescriptorSet.dstBinding = arg.descriptorBinding; + break; + } + case NonSemanticClspvReflectionArgumentPodPushConstant: + { + mComputePassCommands->getCommandBuffer().pushConstants( + kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT, + arg.pushConstOffset, arg.pushConstantSize, arg.handle); + break; + } + case NonSemanticClspvReflectionArgumentSampler: + case NonSemanticClspvReflectionArgumentPodUniform: + case NonSemanticClspvReflectionArgumentStorageImage: + case NonSemanticClspvReflectionArgumentSampledImage: + case NonSemanticClspvReflectionArgumentPointerUniform: + case NonSemanticClspvReflectionArgumentPodStorageBuffer: + case NonSemanticClspvReflectionArgumentUniformTexelBuffer: + case NonSemanticClspvReflectionArgumentStorageTexelBuffer: + case NonSemanticClspvReflectionArgumentPointerPushConstant: + default: + { + UNIMPLEMENTED(); + break; + } + } + } + + if (needsBarrier) + { + VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, + VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT}; + mComputePassCommands->getCommandBuffer().pipelineBarrier( + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, + &memoryBarrier, 0, nullptr, 0, nullptr); + } + + mContext->getPerfCounters().writeDescriptorSets = + updateDescriptorSetsBuilder.flushDescriptorSetUpdates(mContext->getRenderer()->getDevice()); + + mComputePassCommands->getCommandBuffer().bindDescriptorSets( + kernelVk.getPipelineLayout().get(), VK_PIPELINE_BIND_POINT_COMPUTE, + DescriptorSetIndex::Internal, 1, &descriptorSet, 0, nullptr); + + return angle::Result::Continue; +} + +angle::Result CLCommandQueueVk::flushComputePassCommands() +{ + UNIMPLEMENTED(); + ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); +} + +angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents) +{ + if (!waitEvents.empty()) + { + bool insertedBarrier = false; + for (const cl::EventPtr &event : waitEvents) + { + if (event->getImpl<CLEventVk>().isUserEvent() || + event->getCommandQueue() != &mCommandQueue) + { + // We cannot use a barrier in these cases, therefore defer the event + // handling till submission time + // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s) + // https://anglebug.com/8670 + mDependantEvents.push_back(event); + } + else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier) + { + // As long as there is at least one dependant command in same queue, + // we just need to insert one execution barrier + VkMemoryBarrier memoryBarrier = { + VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_SHADER_WRITE_BIT, + VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT}; + mComputePassCommands->getCommandBuffer().pipelineBarrier( + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, + 1, &memoryBarrier, 0, nullptr, 0, nullptr); + + insertedBarrier = true; + } + } + } + return angle::Result::Continue; +} + +angle::Result CLCommandQueueVk::submitCommands() +{ + UNIMPLEMENTED(); + ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); +} + +angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc) +{ + if (createFunc != nullptr) + { + *createFunc = [this](const cl::Event &event) { + auto eventVk = new (std::nothrow) CLEventVk(event); + if (eventVk == nullptr) + { + ERR() << "Failed to create event obj!"; + ANGLE_CL_SET_ERROR(CL_OUT_OF_HOST_MEMORY); + return CLEventImpl::Ptr(nullptr); + } + eventVk->setQueueSerial(mComputePassCommands->getQueueSerial()); + + // Save a reference to this event + mAssociatedEvents.push_back(cl::EventPtr{&eventVk->getFrontendObject()}); + + return CLEventImpl::Ptr(eventVk); + }; + } + return angle::Result::Continue; +} + +angle::Result CLCommandQueueVk::finishInternal() +{ + UNIMPLEMENTED(); + ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES); +} + } // namespace rx diff --git a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h index 2d53190ef9..0238a98545 100644 --- a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h +++ b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h @@ -12,7 +12,10 @@ #include <vector> #include "libANGLE/renderer/vulkan/CLContextVk.h" +#include "libANGLE/renderer/vulkan/CLEventVk.h" +#include "libANGLE/renderer/vulkan/CLKernelVk.h" #include "libANGLE/renderer/vulkan/DisplayVk.h" +#include "libANGLE/renderer/vulkan/ShareGroupVk.h" #include "libANGLE/renderer/vulkan/cl_types.h" #include "libANGLE/renderer/vulkan/vk_command_buffer_utils.h" #include "libANGLE/renderer/vulkan/vk_helpers.h" @@ -222,16 +225,42 @@ class CLCommandQueueVk : public CLCommandQueueImpl CLPlatformVk *getPlatform() { return mContext->getPlatform(); } private: + static constexpr size_t kMaxDependencyTrackerSize = 64; + vk::ProtectionType getProtectionType() const { return vk::ProtectionType::Unprotected; } + // Create-update-bind the kernel's descriptor set, put push-constants in cmd buffer, capture + // kernel resources, and handle kernel execution dependencies + angle::Result processKernelResources(CLKernelVk &kernelVk, const cl::NDRange &ndrange); + + angle::Result submitCommands(); + angle::Result finishInternal(); + angle::Result flushComputePassCommands(); + angle::Result processWaitlist(const cl::EventPtrs &waitEvents); + angle::Result createEvent(CLEventImpl::CreateFunc *createFunc); + CLContextVk *mContext; const CLDeviceVk *mDevice; vk::SecondaryCommandPools mCommandPool; vk::OutsideRenderPassCommandBufferHelper *mComputePassCommands; vk::SecondaryCommandMemoryAllocator mOutsideRenderPassCommandsAllocator; + SerialIndex mCurrentQueueSerialIndex; + QueueSerial mLastSubmittedQueueSerial; + QueueSerial mLastFlushedQueueSerial; + std::mutex mCommandQueueMutex; + + // Created event objects associated with this command queue + cl::EventPtrs mAssociatedEvents; + + // Dependant event(s) that this queue has to wait on + cl::EventPtrs mDependantEvents; + + // Keep track of kernel resources on prior kernel enqueues + angle::HashSet<cl::Object *> mDependencyTracker; - std::vector<std::string> mCommandBufferDiagnostics; + // Resource reference capturing during execution + cl::MemoryPtrs mMemoryCaptures; }; } // namespace rx diff --git a/src/libANGLE/renderer/vulkan/CLKernelVk.cpp b/src/libANGLE/renderer/vulkan/CLKernelVk.cpp index c0224410a2..fb743afed3 100644 --- a/src/libANGLE/renderer/vulkan/CLKernelVk.cpp +++ b/src/libANGLE/renderer/vulkan/CLKernelVk.cpp @@ -28,7 +28,10 @@ CLKernelVk::CLKernelVk(const cl::Kernel &kernel, mName(name), mAttributes(attributes), mArgs(args) -{} +{ + mShaderProgramHelper.setShader(gl::ShaderType::Compute, + mKernel.getProgram().getImpl<CLProgramVk>().getShaderModule()); +} CLKernelVk::~CLKernelVk() { @@ -36,6 +39,13 @@ CLKernelVk::~CLKernelVk() { dsLayouts.reset(); } + + mPipelineLayout.reset(); + for (auto &pipelineHelper : mComputePipelineCache) + { + pipelineHelper.destroy(mContext->getDevice()); + } + mShaderProgramHelper.destroy(mContext->getRenderer()); } angle::Result CLKernelVk::setArg(cl_uint argIndex, size_t argSize, const void *argValue) @@ -89,12 +99,12 @@ angle::Result CLKernelVk::createInfo(CLKernelImpl::Info *info) const workGroup.prefWorkGroupSizeMultiple = 16u; workGroup.globalWorkSize = {0, 0, 0}; - if (deviceProgramData->reflectionData.kernelCompileWGS.contains(mName)) + if (deviceProgramData->reflectionData.kernelCompileWorkgroupSize.contains(mName)) { workGroup.compileWorkGroupSize = { - deviceProgramData->reflectionData.kernelCompileWGS.at(mName)[0], - deviceProgramData->reflectionData.kernelCompileWGS.at(mName)[1], - deviceProgramData->reflectionData.kernelCompileWGS.at(mName)[2]}; + deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[0], + deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[1], + deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[2]}; } else { @@ -105,4 +115,74 @@ angle::Result CLKernelVk::createInfo(CLKernelImpl::Info *info) const return angle::Result::Continue; } +angle::Result CLKernelVk::getOrCreateComputePipeline(vk::PipelineCacheAccess *pipelineCache, + const cl::NDRange &ndrange, + const cl::Device &device, + vk::PipelineHelper **pipelineOut, + cl::WorkgroupCount *workgroupCountOut) +{ + uint32_t constantDataOffset = 0; + angle::FixedVector<size_t, 3> specConstantData; + angle::FixedVector<VkSpecializationMapEntry, 3> mapEntries; + const CLProgramVk::DeviceProgramData *devProgramData = + getProgram()->getDeviceProgramData(device.getNative()); + ASSERT(devProgramData != nullptr); + + // Start with Workgroup size (WGS) from kernel attribute (if available) + cl::WorkgroupSize workgroupSize = devProgramData->getCompiledWorkgroupSize(getKernelName()); + + if (workgroupSize == kEmptyWorkgroupSize) + { + if (ndrange.nullLocalWorkSize) + { + // NULL value was passed, in which case the OpenCL implementation will determine + // how to be break the global work-items into appropriate work-group instances. + workgroupSize = device.getImpl<CLDeviceVk>().selectWorkGroupSize(ndrange); + } + else + { + // Local work size (LWS) was valid, use that as WGS + workgroupSize = ndrange.localWorkSize; + } + + // If at least one of the kernels does not use the reqd_work_group_size attribute, the + // Vulkan SPIR-V produced by the compiler will contain specialization constants + const std::array<uint32_t, 3> &specConstantWorkgroupSizeIDs = + devProgramData->reflectionData.specConstantWorkgroupSizeIDs; + ASSERT(ndrange.workDimensions <= 3); + for (cl_uint i = 0; i < ndrange.workDimensions; ++i) + { + mapEntries.push_back( + VkSpecializationMapEntry{.constantID = specConstantWorkgroupSizeIDs.at(i), + .offset = constantDataOffset, + .size = sizeof(uint32_t)}); + constantDataOffset += sizeof(uint32_t); + specConstantData.push_back(workgroupSize[i]); + } + } + + // Calculate the workgroup count + // TODO: Add support for non-uniform WGS + // http://angleproject:8631 + ASSERT(workgroupSize[0] != 0); + ASSERT(workgroupSize[1] != 0); + ASSERT(workgroupSize[2] != 0); + (*workgroupCountOut)[0] = static_cast<uint32_t>((ndrange.globalWorkSize[0] / workgroupSize[0])); + (*workgroupCountOut)[1] = static_cast<uint32_t>((ndrange.globalWorkSize[1] / workgroupSize[1])); + (*workgroupCountOut)[2] = static_cast<uint32_t>((ndrange.globalWorkSize[2] / workgroupSize[2])); + + VkSpecializationInfo computeSpecializationInfo{ + .mapEntryCount = static_cast<uint32_t>(mapEntries.size()), + .pMapEntries = mapEntries.data(), + .dataSize = specConstantData.size() * sizeof(specConstantData[0]), + .pData = specConstantData.data(), + }; + + // Now get or create (on compute pipeline cache miss) compute pipeline and return it + return mShaderProgramHelper.getOrCreateComputePipeline( + mContext, &mComputePipelineCache, pipelineCache, getPipelineLayout().get(), + vk::ComputePipelineFlags{}, PipelineSource::Draw, pipelineOut, mName.c_str(), + &computeSpecializationInfo); +} + } // namespace rx diff --git a/src/libANGLE/renderer/vulkan/CLKernelVk.h b/src/libANGLE/renderer/vulkan/CLKernelVk.h index 75964db203..34fb5cff7b 100644 --- a/src/libANGLE/renderer/vulkan/CLKernelVk.h +++ b/src/libANGLE/renderer/vulkan/CLKernelVk.h @@ -10,6 +10,7 @@ #include "libANGLE/renderer/vulkan/cl_types.h" #include "libANGLE/renderer/vulkan/vk_cache_utils.h" +#include "libANGLE/renderer/vulkan/vk_helpers.h" #include "libANGLE/renderer/vulkan/vk_utils.h" #include "libANGLE/renderer/CLKernelImpl.h" @@ -76,20 +77,28 @@ class CLKernelVk : public CLKernelImpl angle::Result createInfo(CLKernelImpl::Info *infoOut) const override; - const CLProgramVk *getProgram() { return mProgram; } + CLProgramVk *getProgram() { return mProgram; } const std::string &getKernelName() { return mName; } const CLKernelArguments &getArgs() { return mArgs; } - VkDescriptorSet &getDescriptorSet() { return mDescriptorSet; } vk::AtomicBindingPointer<vk::PipelineLayout> &getPipelineLayout() { return mPipelineLayout; } vk::DescriptorSetLayoutPointerArray &getDescriptorSetLayouts() { return mDescriptorSetLayouts; } + angle::Result getOrCreateComputePipeline(vk::PipelineCacheAccess *pipelineCache, + const cl::NDRange &ndrange, + const cl::Device &device, + vk::PipelineHelper **pipelineOut, + cl::WorkgroupCount *workgroupCountOut); + private: + static constexpr std::array<size_t, 3> kEmptyWorkgroupSize = {0, 0, 0}; + CLProgramVk *mProgram; CLContextVk *mContext; std::string mName; std::string mAttributes; CLKernelArguments mArgs; - VkDescriptorSet mDescriptorSet{VK_NULL_HANDLE}; + vk::ShaderProgramHelper mShaderProgramHelper; + vk::ComputePipelineCache mComputePipelineCache; vk::AtomicBindingPointer<vk::PipelineLayout> mPipelineLayout; vk::DescriptorSetLayoutPointerArray mDescriptorSetLayouts{}; }; diff --git a/src/libANGLE/renderer/vulkan/CLProgramVk.cpp b/src/libANGLE/renderer/vulkan/CLProgramVk.cpp index 946718ecd4..38fb4ac7b1 100644 --- a/src/libANGLE/renderer/vulkan/CLProgramVk.cpp +++ b/src/libANGLE/renderer/vulkan/CLProgramVk.cpp @@ -171,8 +171,8 @@ spv_result_t ParseReflection(CLProgramVk::SpvReflectionData &reflectionData, } case NonSemanticClspvReflectionPropertyRequiredWorkgroupSize: { - reflectionData - .kernelCompileWGS[reflectionData.spvStrLookup[spvInstr.words[5]]] = { + reflectionData.kernelCompileWorkgroupSize + [reflectionData.spvStrLookup[spvInstr.words[5]]] = { reflectionData.spvIntLookup[spvInstr.words[6]], reflectionData.spvIntLookup[spvInstr.words[7]], reflectionData.spvIntLookup[spvInstr.words[8]]}; diff --git a/src/libANGLE/renderer/vulkan/CLProgramVk.h b/src/libANGLE/renderer/vulkan/CLProgramVk.h index 4544d3749c..abbd0e2606 100644 --- a/src/libANGLE/renderer/vulkan/CLProgramVk.h +++ b/src/libANGLE/renderer/vulkan/CLProgramVk.h @@ -40,7 +40,7 @@ class CLProgramVk : public CLProgramImpl angle::HashMap<uint32_t, CLKernelVk::ArgInfo> kernelArgInfos; angle::HashMap<std::string, uint32_t> kernelFlags; angle::HashMap<std::string, std::string> kernelAttributes; - angle::HashMap<std::string, std::array<uint32_t, 3>> kernelCompileWGS; + angle::HashMap<std::string, std::array<uint32_t, 3>> kernelCompileWorkgroupSize; angle::HashMap<uint32_t, VkPushConstantRange> pushConstants; std::array<uint32_t, 3> specConstantWorkgroupSizeIDs{0, 0, 0}; CLKernelArgsMap kernelArgsMap; @@ -136,17 +136,18 @@ class CLProgramVk : public CLProgramImpl return kargsCopy; } - cl::WorkgroupSize getCompiledWGS(const std::string &kernelName) const + cl::WorkgroupSize getCompiledWorkgroupSize(const std::string &kernelName) const { - cl::WorkgroupSize compiledWGS{0, 0, 0}; - if (reflectionData.kernelCompileWGS.contains(kernelName)) + cl::WorkgroupSize compiledWorkgroupSize{0, 0, 0}; + if (reflectionData.kernelCompileWorkgroupSize.contains(kernelName)) { - for (size_t i = 0; i < compiledWGS.size(); ++i) + for (size_t i = 0; i < compiledWorkgroupSize.size(); ++i) { - compiledWGS[i] = reflectionData.kernelCompileWGS.at(kernelName)[i]; + compiledWorkgroupSize[i] = + reflectionData.kernelCompileWorkgroupSize.at(kernelName)[i]; } } - return compiledWGS; + return compiledWorkgroupSize; } std::string getKernelAttributes(const std::string &kernelName) const diff --git a/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp b/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp index b2d75cc1de..c2822cf203 100644 --- a/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp +++ b/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp @@ -1612,7 +1612,7 @@ angle::Result ProgramExecutableVk::getOrCreateComputePipeline( ASSERT(shaderProgram); return shaderProgram->getOrCreateComputePipeline(context, &mComputePipelines, pipelineCache, getPipelineLayout(), pipelineFlags, source, - pipelineOut); + pipelineOut, nullptr, nullptr); } angle::Result ProgramExecutableVk::createPipelineLayout( diff --git a/src/libANGLE/renderer/vulkan/UtilsVk.cpp b/src/libANGLE/renderer/vulkan/UtilsVk.cpp index 306a40ec28..526b5a0680 100644 --- a/src/libANGLE/renderer/vulkan/UtilsVk.cpp +++ b/src/libANGLE/renderer/vulkan/UtilsVk.cpp @@ -1845,7 +1845,7 @@ angle::Result UtilsVk::setupComputeProgram( ANGLE_TRY(renderer->getPipelineCache(contextVk, &pipelineCache)); ANGLE_TRY(programAndPipelines->program.getOrCreateComputePipeline( contextVk, &programAndPipelines->pipelines, &pipelineCache, pipelineLayout.get(), - contextVk->getComputePipelineFlags(), PipelineSource::Utils, &pipeline)); + contextVk->getComputePipelineFlags(), PipelineSource::Utils, &pipeline, nullptr, nullptr)); commandBufferHelper->retainResource(pipeline); vk::OutsideRenderPassCommandBuffer *commandBuffer = &commandBufferHelper->getCommandBuffer(); diff --git a/src/libANGLE/renderer/vulkan/vk_helpers.cpp b/src/libANGLE/renderer/vulkan/vk_helpers.cpp index cb33abe6ae..08f5e0b0b5 100644 --- a/src/libANGLE/renderer/vulkan/vk_helpers.cpp +++ b/src/libANGLE/renderer/vulkan/vk_helpers.cpp @@ -11698,7 +11698,9 @@ angle::Result ShaderProgramHelper::getOrCreateComputePipeline( const PipelineLayout &pipelineLayout, ComputePipelineFlags pipelineFlags, PipelineSource source, - PipelineHelper **pipelineOut) const + PipelineHelper **pipelineOut, + const char *shaderName, + VkSpecializationInfo *specializationInfo) const { PipelineHelper *computePipeline = &(*computePipelines)[pipelineFlags.bits()]; @@ -11715,8 +11717,8 @@ angle::Result ShaderProgramHelper::getOrCreateComputePipeline( shaderStage.flags = 0; shaderStage.stage = VK_SHADER_STAGE_COMPUTE_BIT; shaderStage.module = mShaders[gl::ShaderType::Compute].get().getHandle(); - shaderStage.pName = "main"; - shaderStage.pSpecializationInfo = nullptr; + shaderStage.pName = shaderName ? shaderName : "main"; + shaderStage.pSpecializationInfo = specializationInfo; createInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; createInfo.flags = 0; diff --git a/src/libANGLE/renderer/vulkan/vk_helpers.h b/src/libANGLE/renderer/vulkan/vk_helpers.h index ebb82ec4b4..7a4c4466a4 100644 --- a/src/libANGLE/renderer/vulkan/vk_helpers.h +++ b/src/libANGLE/renderer/vulkan/vk_helpers.h @@ -3475,7 +3475,9 @@ class ShaderProgramHelper : angle::NonCopyable const PipelineLayout &pipelineLayout, ComputePipelineFlags pipelineFlags, PipelineSource source, - PipelineHelper **pipelineOut) const; + PipelineHelper **pipelineOut, + const char *shaderName, + VkSpecializationInfo *specializationInfo) const; private: ShaderModuleMap mShaders; diff --git a/src/libANGLE/validationCL.cpp b/src/libANGLE/validationCL.cpp index 35dccdcdf7..523f8e26ec 100644 --- a/src/libANGLE/validationCL.cpp +++ b/src/libANGLE/validationCL.cpp @@ -2207,6 +2207,26 @@ cl_int ValidateEnqueueNDRangeKernel(cl_command_queue command_queue, return CL_INVALID_GLOBAL_OFFSET; } + // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not match the required + // work-group size for kernel in the program source. + size_t compileWorkGroupSize[3] = {0, 0, 0}; + if (IsError(krnl.getWorkGroupInfo(const_cast<cl_device_id>(device.getNative()), + KernelWorkGroupInfo::CompileWorkGroupSize, + sizeof(compileWorkGroupSize), compileWorkGroupSize, nullptr))) + { + return CL_INVALID_VALUE; + } + if (local_work_size != nullptr) + { + for (cl_uint i = 0; i < work_dim; ++i) + { + if (compileWorkGroupSize[i] != 0 && local_work_size[i] != compileWorkGroupSize[i]) + { + return CL_INVALID_WORK_GROUP_SIZE; + } + } + } + // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values // specified in global_work_size[0] ... global_work_size[work_dim - 1] are 0. // Returning this error code under these circumstances is deprecated by version 2.1. |