aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAustin Annestrand <a.annestrand@samsung.com>2024-04-03 17:46:38 -0500
committerAngle LUCI CQ <angle-scoped@luci-project-accounts.iam.gserviceaccount.com>2024-04-18 20:36:23 +0000
commitd4abe62268f302e0df2d51cb655408a81361aa38 (patch)
tree906fc611f765676d54e306c58b8dbe747f8a90f5
parent4813295059014a39fb75d6a9dd031debb079c69e (diff)
downloadangle-d4abe62268f302e0df2d51cb655408a81361aa38.tar.gz
CL/VK: Implement enqueue NDRangeKernel & Task
Adding support for: clEnqueueNDRangeKernel clEnqueueTask Bug: angleproject:8631 Change-Id: If57002be3ea00a55215e89ca47ab8fe9a422c6e7 Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/5406614 Reviewed-by: Shahbaz Youssefi <syoussefi@chromium.org> Commit-Queue: Austin Annestrand <a.annestrand@samsung.com> Reviewed-by: Geoff Lang <geofflang@chromium.org>
-rw-r--r--src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp280
-rw-r--r--src/libANGLE/renderer/vulkan/CLCommandQueueVk.h31
-rw-r--r--src/libANGLE/renderer/vulkan/CLKernelVk.cpp90
-rw-r--r--src/libANGLE/renderer/vulkan/CLKernelVk.h15
-rw-r--r--src/libANGLE/renderer/vulkan/CLProgramVk.cpp4
-rw-r--r--src/libANGLE/renderer/vulkan/CLProgramVk.h15
-rw-r--r--src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp2
-rw-r--r--src/libANGLE/renderer/vulkan/UtilsVk.cpp2
-rw-r--r--src/libANGLE/renderer/vulkan/vk_helpers.cpp8
-rw-r--r--src/libANGLE/renderer/vulkan/vk_helpers.h4
-rw-r--r--src/libANGLE/validationCL.cpp20
11 files changed, 433 insertions, 38 deletions
diff --git a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp
index 258acb1c76..33f0de41f4 100644
--- a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp
+++ b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp
@@ -8,12 +8,21 @@
#include "libANGLE/renderer/vulkan/CLCommandQueueVk.h"
#include "libANGLE/renderer/vulkan/CLContextVk.h"
#include "libANGLE/renderer/vulkan/CLDeviceVk.h"
+#include "libANGLE/renderer/vulkan/CLKernelVk.h"
+#include "libANGLE/renderer/vulkan/CLMemoryVk.h"
+#include "libANGLE/renderer/vulkan/CLProgramVk.h"
+#include "libANGLE/renderer/vulkan/cl_types.h"
#include "libANGLE/renderer/vulkan/vk_renderer.h"
+#include "libANGLE/CLBuffer.h"
#include "libANGLE/CLCommandQueue.h"
#include "libANGLE/CLContext.h"
+#include "libANGLE/CLEvent.h"
+#include "libANGLE/CLKernel.h"
#include "libANGLE/cl_utils.h"
+#include "spirv/unified1/NonSemanticClspvReflection.h"
+
namespace rx
{
@@ -21,18 +30,10 @@ CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue)
: CLCommandQueueImpl(commandQueue),
mContext(&commandQueue.getContext().getImpl<CLContextVk>()),
mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()),
- mComputePassCommands(nullptr)
+ mComputePassCommands(nullptr),
+ mCurrentQueueSerialIndex(kInvalidQueueSerialIndex)
{}
-CLCommandQueueVk::~CLCommandQueueVk()
-{
- VkDevice vkDevice = mContext->getDevice();
-
- // Recycle the current command buffers
- mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands);
- mCommandPool.outsideRenderPassPool.destroy(vkDevice);
-}
-
angle::Result CLCommandQueueVk::init()
{
ANGLE_CL_IMPL_TRY_ERROR(
@@ -46,9 +47,36 @@ angle::Result CLCommandQueueVk::init()
&mOutsideRenderPassCommandsAllocator, &mComputePassCommands),
CL_OUT_OF_RESOURCES);
+ // Generate initial QueueSerial for command buffer helper
+ ANGLE_CL_IMPL_TRY_ERROR(
+ mContext->getRenderer()->allocateQueueSerialIndex(&mCurrentQueueSerialIndex),
+ CL_OUT_OF_RESOURCES);
+ mComputePassCommands->setQueueSerial(
+ mCurrentQueueSerialIndex,
+ mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex));
+
+ // Initialize serials to be valid but appear submitted and finished.
+ mLastFlushedQueueSerial = QueueSerial(mCurrentQueueSerialIndex, Serial());
+ mLastSubmittedQueueSerial = mLastFlushedQueueSerial;
+
return angle::Result::Continue;
}
+CLCommandQueueVk::~CLCommandQueueVk()
+{
+ VkDevice vkDevice = mContext->getDevice();
+
+ if (mCurrentQueueSerialIndex != kInvalidQueueSerialIndex)
+ {
+ mContext->getRenderer()->releaseQueueSerialIndex(mCurrentQueueSerialIndex);
+ mCurrentQueueSerialIndex = kInvalidQueueSerialIndex;
+ }
+
+ // Recycle the current command buffers
+ mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands);
+ mCommandPool.outsideRenderPassPool.destroy(vkDevice);
+}
+
angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable)
{
// NOTE: "clSetCommandQueueProperty" has been deprecated as of OpenCL 1.1
@@ -280,16 +308,41 @@ angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel,
const cl::EventPtrs &waitEvents,
CLEventImpl::CreateFunc *eventCreateFunc)
{
- UNIMPLEMENTED();
- ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
+ std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
+
+ ANGLE_TRY(processWaitlist(waitEvents));
+
+ cl::WorkgroupCount workgroupCount;
+ vk::PipelineCacheAccess pipelineCache;
+ vk::PipelineHelper *pipelineHelper = nullptr;
+ CLKernelVk &kernelImpl = kernel.getImpl<CLKernelVk>();
+
+ ANGLE_TRY(processKernelResources(kernelImpl, ndrange));
+
+ // Fetch or create compute pipeline (if we miss in cache)
+ ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache),
+ CL_OUT_OF_RESOURCES);
+ ANGLE_TRY(kernelImpl.getOrCreateComputePipeline(
+ &pipelineCache, ndrange, mCommandQueue.getDevice(), &pipelineHelper, &workgroupCount));
+
+ mComputePassCommands->retainResource(pipelineHelper);
+ mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline());
+ mComputePassCommands->getCommandBuffer().dispatch(workgroupCount[0], workgroupCount[1],
+ workgroupCount[2]);
+
+ ANGLE_TRY(createEvent(eventCreateFunc));
+
+ return angle::Result::Continue;
}
angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel,
const cl::EventPtrs &waitEvents,
CLEventImpl::CreateFunc *eventCreateFunc)
{
- UNIMPLEMENTED();
- ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
+ constexpr size_t globalWorkSize[3] = {1, 0, 0};
+ constexpr size_t localWorkSize[3] = {1, 0, 0};
+ cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize);
+ return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc);
}
angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc,
@@ -348,4 +401,203 @@ angle::Result CLCommandQueueVk::finish()
ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}
+angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk,
+ const cl::NDRange &ndrange)
+{
+ bool needsBarrier = false;
+ UpdateDescriptorSetsBuilder updateDescriptorSetsBuilder;
+ const CLProgramVk::DeviceProgramData *devProgramData =
+ kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
+ ASSERT(devProgramData != nullptr);
+
+ // Allocate descriptor set
+ VkDescriptorSet descriptorSet{VK_NULL_HANDLE};
+ ANGLE_TRY(kernelVk.getProgram()->allocateDescriptorSet(
+ kernelVk.getDescriptorSetLayouts()[DescriptorSetIndex::ShaderResource].get(),
+ &descriptorSet));
+
+ // Push global offset data
+ const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange();
+ if (globalOffsetRange != nullptr)
+ {
+ mComputePassCommands->getCommandBuffer().pushConstants(
+ kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT,
+ globalOffsetRange->offset, globalOffsetRange->size, ndrange.globalWorkOffset.data());
+ }
+
+ // Push global size data
+ const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange();
+ if (globalSizeRange != nullptr)
+ {
+ mComputePassCommands->getCommandBuffer().pushConstants(
+ kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT,
+ globalSizeRange->offset, globalSizeRange->size, ndrange.globalWorkSize.data());
+ }
+
+ // Process each kernel argument/resource
+ for (const auto &arg : kernelVk.getArgs())
+ {
+ switch (arg.type)
+ {
+ case NonSemanticClspvReflectionArgumentUniform:
+ case NonSemanticClspvReflectionArgumentStorageBuffer:
+ {
+ cl::Memory *clMem = cl::Buffer::Cast(*static_cast<const cl_mem *>(arg.handle));
+ CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();
+
+ // Retain this resource until its associated dispatch completes
+ mMemoryCaptures.emplace_back(clMem);
+
+ // Handle possible resource RAW hazard
+ if (arg.type != NonSemanticClspvReflectionArgumentUniform)
+ {
+ if (mDependencyTracker.contains(clMem) ||
+ mDependencyTracker.size() == kMaxDependencyTrackerSize)
+ {
+ needsBarrier = true;
+ mDependencyTracker.clear();
+ }
+ mDependencyTracker.insert(clMem);
+ }
+
+ // Update buffer/descriptor info
+ VkDescriptorBufferInfo &bufferInfo =
+ updateDescriptorSetsBuilder.allocDescriptorBufferInfo();
+ bufferInfo.range = clMem->getSize();
+ bufferInfo.offset = clMem->getOffset();
+ bufferInfo.buffer = vkMem.isSubBuffer()
+ ? vkMem.getParent()->getBuffer().getBuffer().getHandle()
+ : vkMem.getBuffer().getBuffer().getHandle();
+ VkWriteDescriptorSet &writeDescriptorSet =
+ updateDescriptorSetsBuilder.allocWriteDescriptorSet();
+ writeDescriptorSet.descriptorCount = 1;
+ writeDescriptorSet.descriptorType =
+ arg.type == NonSemanticClspvReflectionArgumentUniform
+ ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
+ : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+ writeDescriptorSet.pBufferInfo = &bufferInfo;
+ writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+ writeDescriptorSet.dstSet = descriptorSet;
+ writeDescriptorSet.dstBinding = arg.descriptorBinding;
+ break;
+ }
+ case NonSemanticClspvReflectionArgumentPodPushConstant:
+ {
+ mComputePassCommands->getCommandBuffer().pushConstants(
+ kernelVk.getPipelineLayout().get(), VK_SHADER_STAGE_COMPUTE_BIT,
+ arg.pushConstOffset, arg.pushConstantSize, arg.handle);
+ break;
+ }
+ case NonSemanticClspvReflectionArgumentSampler:
+ case NonSemanticClspvReflectionArgumentPodUniform:
+ case NonSemanticClspvReflectionArgumentStorageImage:
+ case NonSemanticClspvReflectionArgumentSampledImage:
+ case NonSemanticClspvReflectionArgumentPointerUniform:
+ case NonSemanticClspvReflectionArgumentPodStorageBuffer:
+ case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
+ case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
+ case NonSemanticClspvReflectionArgumentPointerPushConstant:
+ default:
+ {
+ UNIMPLEMENTED();
+ break;
+ }
+ }
+ }
+
+ if (needsBarrier)
+ {
+ VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
+ VK_ACCESS_SHADER_WRITE_BIT,
+ VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
+ mComputePassCommands->getCommandBuffer().pipelineBarrier(
+ VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
+ &memoryBarrier, 0, nullptr, 0, nullptr);
+ }
+
+ mContext->getPerfCounters().writeDescriptorSets =
+ updateDescriptorSetsBuilder.flushDescriptorSetUpdates(mContext->getRenderer()->getDevice());
+
+ mComputePassCommands->getCommandBuffer().bindDescriptorSets(
+ kernelVk.getPipelineLayout().get(), VK_PIPELINE_BIND_POINT_COMPUTE,
+ DescriptorSetIndex::Internal, 1, &descriptorSet, 0, nullptr);
+
+ return angle::Result::Continue;
+}
+
+angle::Result CLCommandQueueVk::flushComputePassCommands()
+{
+ UNIMPLEMENTED();
+ ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
+}
+
+angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents)
+{
+ if (!waitEvents.empty())
+ {
+ bool insertedBarrier = false;
+ for (const cl::EventPtr &event : waitEvents)
+ {
+ if (event->getImpl<CLEventVk>().isUserEvent() ||
+ event->getCommandQueue() != &mCommandQueue)
+ {
+ // We cannot use a barrier in these cases, therefore defer the event
+ // handling till submission time
+ // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s)
+ // https://anglebug.com/8670
+ mDependantEvents.push_back(event);
+ }
+ else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier)
+ {
+                // As long as there is at least one dependent command in the same queue,
+                // we just need to insert one execution barrier
+ VkMemoryBarrier memoryBarrier = {
+ VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr, VK_ACCESS_SHADER_WRITE_BIT,
+ VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
+ mComputePassCommands->getCommandBuffer().pipelineBarrier(
+ VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
+ 1, &memoryBarrier, 0, nullptr, 0, nullptr);
+
+ insertedBarrier = true;
+ }
+ }
+ }
+ return angle::Result::Continue;
+}
+
+angle::Result CLCommandQueueVk::submitCommands()
+{
+ UNIMPLEMENTED();
+ ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
+}
+
+angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc)
+{
+ if (createFunc != nullptr)
+ {
+ *createFunc = [this](const cl::Event &event) {
+ auto eventVk = new (std::nothrow) CLEventVk(event);
+ if (eventVk == nullptr)
+ {
+ ERR() << "Failed to create event obj!";
+ ANGLE_CL_SET_ERROR(CL_OUT_OF_HOST_MEMORY);
+ return CLEventImpl::Ptr(nullptr);
+ }
+ eventVk->setQueueSerial(mComputePassCommands->getQueueSerial());
+
+ // Save a reference to this event
+ mAssociatedEvents.push_back(cl::EventPtr{&eventVk->getFrontendObject()});
+
+ return CLEventImpl::Ptr(eventVk);
+ };
+ }
+ return angle::Result::Continue;
+}
+
+angle::Result CLCommandQueueVk::finishInternal()
+{
+ UNIMPLEMENTED();
+ ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
+}
+
} // namespace rx
diff --git a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h
index 2d53190ef9..0238a98545 100644
--- a/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h
+++ b/src/libANGLE/renderer/vulkan/CLCommandQueueVk.h
@@ -12,7 +12,10 @@
#include <vector>
#include "libANGLE/renderer/vulkan/CLContextVk.h"
+#include "libANGLE/renderer/vulkan/CLEventVk.h"
+#include "libANGLE/renderer/vulkan/CLKernelVk.h"
#include "libANGLE/renderer/vulkan/DisplayVk.h"
+#include "libANGLE/renderer/vulkan/ShareGroupVk.h"
#include "libANGLE/renderer/vulkan/cl_types.h"
#include "libANGLE/renderer/vulkan/vk_command_buffer_utils.h"
#include "libANGLE/renderer/vulkan/vk_helpers.h"
@@ -222,16 +225,42 @@ class CLCommandQueueVk : public CLCommandQueueImpl
CLPlatformVk *getPlatform() { return mContext->getPlatform(); }
private:
+ static constexpr size_t kMaxDependencyTrackerSize = 64;
+
vk::ProtectionType getProtectionType() const { return vk::ProtectionType::Unprotected; }
+ // Create-update-bind the kernel's descriptor set, put push-constants in cmd buffer, capture
+ // kernel resources, and handle kernel execution dependencies
+ angle::Result processKernelResources(CLKernelVk &kernelVk, const cl::NDRange &ndrange);
+
+ angle::Result submitCommands();
+ angle::Result finishInternal();
+ angle::Result flushComputePassCommands();
+ angle::Result processWaitlist(const cl::EventPtrs &waitEvents);
+ angle::Result createEvent(CLEventImpl::CreateFunc *createFunc);
+
CLContextVk *mContext;
const CLDeviceVk *mDevice;
vk::SecondaryCommandPools mCommandPool;
vk::OutsideRenderPassCommandBufferHelper *mComputePassCommands;
vk::SecondaryCommandMemoryAllocator mOutsideRenderPassCommandsAllocator;
+ SerialIndex mCurrentQueueSerialIndex;
+ QueueSerial mLastSubmittedQueueSerial;
+ QueueSerial mLastFlushedQueueSerial;
+ std::mutex mCommandQueueMutex;
+
+ // Created event objects associated with this command queue
+ cl::EventPtrs mAssociatedEvents;
+
+ // Dependant event(s) that this queue has to wait on
+ cl::EventPtrs mDependantEvents;
+
+ // Keep track of kernel resources on prior kernel enqueues
+ angle::HashSet<cl::Object *> mDependencyTracker;
- std::vector<std::string> mCommandBufferDiagnostics;
+ // Resource reference capturing during execution
+ cl::MemoryPtrs mMemoryCaptures;
};
} // namespace rx
diff --git a/src/libANGLE/renderer/vulkan/CLKernelVk.cpp b/src/libANGLE/renderer/vulkan/CLKernelVk.cpp
index c0224410a2..fb743afed3 100644
--- a/src/libANGLE/renderer/vulkan/CLKernelVk.cpp
+++ b/src/libANGLE/renderer/vulkan/CLKernelVk.cpp
@@ -28,7 +28,10 @@ CLKernelVk::CLKernelVk(const cl::Kernel &kernel,
mName(name),
mAttributes(attributes),
mArgs(args)
-{}
+{
+ mShaderProgramHelper.setShader(gl::ShaderType::Compute,
+ mKernel.getProgram().getImpl<CLProgramVk>().getShaderModule());
+}
CLKernelVk::~CLKernelVk()
{
@@ -36,6 +39,13 @@ CLKernelVk::~CLKernelVk()
{
dsLayouts.reset();
}
+
+ mPipelineLayout.reset();
+ for (auto &pipelineHelper : mComputePipelineCache)
+ {
+ pipelineHelper.destroy(mContext->getDevice());
+ }
+ mShaderProgramHelper.destroy(mContext->getRenderer());
}
angle::Result CLKernelVk::setArg(cl_uint argIndex, size_t argSize, const void *argValue)
@@ -89,12 +99,12 @@ angle::Result CLKernelVk::createInfo(CLKernelImpl::Info *info) const
workGroup.prefWorkGroupSizeMultiple = 16u;
workGroup.globalWorkSize = {0, 0, 0};
- if (deviceProgramData->reflectionData.kernelCompileWGS.contains(mName))
+ if (deviceProgramData->reflectionData.kernelCompileWorkgroupSize.contains(mName))
{
workGroup.compileWorkGroupSize = {
- deviceProgramData->reflectionData.kernelCompileWGS.at(mName)[0],
- deviceProgramData->reflectionData.kernelCompileWGS.at(mName)[1],
- deviceProgramData->reflectionData.kernelCompileWGS.at(mName)[2]};
+ deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[0],
+ deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[1],
+ deviceProgramData->reflectionData.kernelCompileWorkgroupSize.at(mName)[2]};
}
else
{
@@ -105,4 +115,74 @@ angle::Result CLKernelVk::createInfo(CLKernelImpl::Info *info) const
return angle::Result::Continue;
}
+angle::Result CLKernelVk::getOrCreateComputePipeline(vk::PipelineCacheAccess *pipelineCache,
+ const cl::NDRange &ndrange,
+ const cl::Device &device,
+ vk::PipelineHelper **pipelineOut,
+ cl::WorkgroupCount *workgroupCountOut)
+{
+ uint32_t constantDataOffset = 0;
+ angle::FixedVector<size_t, 3> specConstantData;
+ angle::FixedVector<VkSpecializationMapEntry, 3> mapEntries;
+ const CLProgramVk::DeviceProgramData *devProgramData =
+ getProgram()->getDeviceProgramData(device.getNative());
+ ASSERT(devProgramData != nullptr);
+
+ // Start with Workgroup size (WGS) from kernel attribute (if available)
+ cl::WorkgroupSize workgroupSize = devProgramData->getCompiledWorkgroupSize(getKernelName());
+
+ if (workgroupSize == kEmptyWorkgroupSize)
+ {
+ if (ndrange.nullLocalWorkSize)
+ {
+ // NULL value was passed, in which case the OpenCL implementation will determine
+            // how to break the global work-items into appropriate work-group instances.
+ workgroupSize = device.getImpl<CLDeviceVk>().selectWorkGroupSize(ndrange);
+ }
+ else
+ {
+ // Local work size (LWS) was valid, use that as WGS
+ workgroupSize = ndrange.localWorkSize;
+ }
+
+ // If at least one of the kernels does not use the reqd_work_group_size attribute, the
+ // Vulkan SPIR-V produced by the compiler will contain specialization constants
+ const std::array<uint32_t, 3> &specConstantWorkgroupSizeIDs =
+ devProgramData->reflectionData.specConstantWorkgroupSizeIDs;
+ ASSERT(ndrange.workDimensions <= 3);
+ for (cl_uint i = 0; i < ndrange.workDimensions; ++i)
+ {
+ mapEntries.push_back(
+ VkSpecializationMapEntry{.constantID = specConstantWorkgroupSizeIDs.at(i),
+ .offset = constantDataOffset,
+ .size = sizeof(uint32_t)});
+ constantDataOffset += sizeof(uint32_t);
+ specConstantData.push_back(workgroupSize[i]);
+ }
+ }
+
+ // Calculate the workgroup count
+ // TODO: Add support for non-uniform WGS
+    // https://anglebug.com/8631
+ ASSERT(workgroupSize[0] != 0);
+ ASSERT(workgroupSize[1] != 0);
+ ASSERT(workgroupSize[2] != 0);
+ (*workgroupCountOut)[0] = static_cast<uint32_t>((ndrange.globalWorkSize[0] / workgroupSize[0]));
+ (*workgroupCountOut)[1] = static_cast<uint32_t>((ndrange.globalWorkSize[1] / workgroupSize[1]));
+ (*workgroupCountOut)[2] = static_cast<uint32_t>((ndrange.globalWorkSize[2] / workgroupSize[2]));
+
+ VkSpecializationInfo computeSpecializationInfo{
+ .mapEntryCount = static_cast<uint32_t>(mapEntries.size()),
+ .pMapEntries = mapEntries.data(),
+ .dataSize = specConstantData.size() * sizeof(specConstantData[0]),
+ .pData = specConstantData.data(),
+ };
+
+ // Now get or create (on compute pipeline cache miss) compute pipeline and return it
+ return mShaderProgramHelper.getOrCreateComputePipeline(
+ mContext, &mComputePipelineCache, pipelineCache, getPipelineLayout().get(),
+ vk::ComputePipelineFlags{}, PipelineSource::Draw, pipelineOut, mName.c_str(),
+ &computeSpecializationInfo);
+}
+
} // namespace rx
diff --git a/src/libANGLE/renderer/vulkan/CLKernelVk.h b/src/libANGLE/renderer/vulkan/CLKernelVk.h
index 75964db203..34fb5cff7b 100644
--- a/src/libANGLE/renderer/vulkan/CLKernelVk.h
+++ b/src/libANGLE/renderer/vulkan/CLKernelVk.h
@@ -10,6 +10,7 @@
#include "libANGLE/renderer/vulkan/cl_types.h"
#include "libANGLE/renderer/vulkan/vk_cache_utils.h"
+#include "libANGLE/renderer/vulkan/vk_helpers.h"
#include "libANGLE/renderer/vulkan/vk_utils.h"
#include "libANGLE/renderer/CLKernelImpl.h"
@@ -76,20 +77,28 @@ class CLKernelVk : public CLKernelImpl
angle::Result createInfo(CLKernelImpl::Info *infoOut) const override;
- const CLProgramVk *getProgram() { return mProgram; }
+ CLProgramVk *getProgram() { return mProgram; }
const std::string &getKernelName() { return mName; }
const CLKernelArguments &getArgs() { return mArgs; }
- VkDescriptorSet &getDescriptorSet() { return mDescriptorSet; }
vk::AtomicBindingPointer<vk::PipelineLayout> &getPipelineLayout() { return mPipelineLayout; }
vk::DescriptorSetLayoutPointerArray &getDescriptorSetLayouts() { return mDescriptorSetLayouts; }
+ angle::Result getOrCreateComputePipeline(vk::PipelineCacheAccess *pipelineCache,
+ const cl::NDRange &ndrange,
+ const cl::Device &device,
+ vk::PipelineHelper **pipelineOut,
+ cl::WorkgroupCount *workgroupCountOut);
+
private:
+ static constexpr std::array<size_t, 3> kEmptyWorkgroupSize = {0, 0, 0};
+
CLProgramVk *mProgram;
CLContextVk *mContext;
std::string mName;
std::string mAttributes;
CLKernelArguments mArgs;
- VkDescriptorSet mDescriptorSet{VK_NULL_HANDLE};
+ vk::ShaderProgramHelper mShaderProgramHelper;
+ vk::ComputePipelineCache mComputePipelineCache;
vk::AtomicBindingPointer<vk::PipelineLayout> mPipelineLayout;
vk::DescriptorSetLayoutPointerArray mDescriptorSetLayouts{};
};
diff --git a/src/libANGLE/renderer/vulkan/CLProgramVk.cpp b/src/libANGLE/renderer/vulkan/CLProgramVk.cpp
index 946718ecd4..38fb4ac7b1 100644
--- a/src/libANGLE/renderer/vulkan/CLProgramVk.cpp
+++ b/src/libANGLE/renderer/vulkan/CLProgramVk.cpp
@@ -171,8 +171,8 @@ spv_result_t ParseReflection(CLProgramVk::SpvReflectionData &reflectionData,
}
case NonSemanticClspvReflectionPropertyRequiredWorkgroupSize:
{
- reflectionData
- .kernelCompileWGS[reflectionData.spvStrLookup[spvInstr.words[5]]] = {
+ reflectionData.kernelCompileWorkgroupSize
+ [reflectionData.spvStrLookup[spvInstr.words[5]]] = {
reflectionData.spvIntLookup[spvInstr.words[6]],
reflectionData.spvIntLookup[spvInstr.words[7]],
reflectionData.spvIntLookup[spvInstr.words[8]]};
diff --git a/src/libANGLE/renderer/vulkan/CLProgramVk.h b/src/libANGLE/renderer/vulkan/CLProgramVk.h
index 4544d3749c..abbd0e2606 100644
--- a/src/libANGLE/renderer/vulkan/CLProgramVk.h
+++ b/src/libANGLE/renderer/vulkan/CLProgramVk.h
@@ -40,7 +40,7 @@ class CLProgramVk : public CLProgramImpl
angle::HashMap<uint32_t, CLKernelVk::ArgInfo> kernelArgInfos;
angle::HashMap<std::string, uint32_t> kernelFlags;
angle::HashMap<std::string, std::string> kernelAttributes;
- angle::HashMap<std::string, std::array<uint32_t, 3>> kernelCompileWGS;
+ angle::HashMap<std::string, std::array<uint32_t, 3>> kernelCompileWorkgroupSize;
angle::HashMap<uint32_t, VkPushConstantRange> pushConstants;
std::array<uint32_t, 3> specConstantWorkgroupSizeIDs{0, 0, 0};
CLKernelArgsMap kernelArgsMap;
@@ -136,17 +136,18 @@ class CLProgramVk : public CLProgramImpl
return kargsCopy;
}
- cl::WorkgroupSize getCompiledWGS(const std::string &kernelName) const
+ cl::WorkgroupSize getCompiledWorkgroupSize(const std::string &kernelName) const
{
- cl::WorkgroupSize compiledWGS{0, 0, 0};
- if (reflectionData.kernelCompileWGS.contains(kernelName))
+ cl::WorkgroupSize compiledWorkgroupSize{0, 0, 0};
+ if (reflectionData.kernelCompileWorkgroupSize.contains(kernelName))
{
- for (size_t i = 0; i < compiledWGS.size(); ++i)
+ for (size_t i = 0; i < compiledWorkgroupSize.size(); ++i)
{
- compiledWGS[i] = reflectionData.kernelCompileWGS.at(kernelName)[i];
+ compiledWorkgroupSize[i] =
+ reflectionData.kernelCompileWorkgroupSize.at(kernelName)[i];
}
}
- return compiledWGS;
+ return compiledWorkgroupSize;
}
std::string getKernelAttributes(const std::string &kernelName) const
diff --git a/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp b/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp
index b2d75cc1de..c2822cf203 100644
--- a/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ProgramExecutableVk.cpp
@@ -1612,7 +1612,7 @@ angle::Result ProgramExecutableVk::getOrCreateComputePipeline(
ASSERT(shaderProgram);
return shaderProgram->getOrCreateComputePipeline(context, &mComputePipelines, pipelineCache,
getPipelineLayout(), pipelineFlags, source,
- pipelineOut);
+ pipelineOut, nullptr, nullptr);
}
angle::Result ProgramExecutableVk::createPipelineLayout(
diff --git a/src/libANGLE/renderer/vulkan/UtilsVk.cpp b/src/libANGLE/renderer/vulkan/UtilsVk.cpp
index 306a40ec28..526b5a0680 100644
--- a/src/libANGLE/renderer/vulkan/UtilsVk.cpp
+++ b/src/libANGLE/renderer/vulkan/UtilsVk.cpp
@@ -1845,7 +1845,7 @@ angle::Result UtilsVk::setupComputeProgram(
ANGLE_TRY(renderer->getPipelineCache(contextVk, &pipelineCache));
ANGLE_TRY(programAndPipelines->program.getOrCreateComputePipeline(
contextVk, &programAndPipelines->pipelines, &pipelineCache, pipelineLayout.get(),
- contextVk->getComputePipelineFlags(), PipelineSource::Utils, &pipeline));
+ contextVk->getComputePipelineFlags(), PipelineSource::Utils, &pipeline, nullptr, nullptr));
commandBufferHelper->retainResource(pipeline);
vk::OutsideRenderPassCommandBuffer *commandBuffer = &commandBufferHelper->getCommandBuffer();
diff --git a/src/libANGLE/renderer/vulkan/vk_helpers.cpp b/src/libANGLE/renderer/vulkan/vk_helpers.cpp
index cb33abe6ae..08f5e0b0b5 100644
--- a/src/libANGLE/renderer/vulkan/vk_helpers.cpp
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.cpp
@@ -11698,7 +11698,9 @@ angle::Result ShaderProgramHelper::getOrCreateComputePipeline(
const PipelineLayout &pipelineLayout,
ComputePipelineFlags pipelineFlags,
PipelineSource source,
- PipelineHelper **pipelineOut) const
+ PipelineHelper **pipelineOut,
+ const char *shaderName,
+ VkSpecializationInfo *specializationInfo) const
{
PipelineHelper *computePipeline = &(*computePipelines)[pipelineFlags.bits()];
@@ -11715,8 +11717,8 @@ angle::Result ShaderProgramHelper::getOrCreateComputePipeline(
shaderStage.flags = 0;
shaderStage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
shaderStage.module = mShaders[gl::ShaderType::Compute].get().getHandle();
- shaderStage.pName = "main";
- shaderStage.pSpecializationInfo = nullptr;
+ shaderStage.pName = shaderName ? shaderName : "main";
+ shaderStage.pSpecializationInfo = specializationInfo;
createInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
createInfo.flags = 0;
diff --git a/src/libANGLE/renderer/vulkan/vk_helpers.h b/src/libANGLE/renderer/vulkan/vk_helpers.h
index ebb82ec4b4..7a4c4466a4 100644
--- a/src/libANGLE/renderer/vulkan/vk_helpers.h
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.h
@@ -3475,7 +3475,9 @@ class ShaderProgramHelper : angle::NonCopyable
const PipelineLayout &pipelineLayout,
ComputePipelineFlags pipelineFlags,
PipelineSource source,
- PipelineHelper **pipelineOut) const;
+ PipelineHelper **pipelineOut,
+ const char *shaderName,
+ VkSpecializationInfo *specializationInfo) const;
private:
ShaderModuleMap mShaders;
diff --git a/src/libANGLE/validationCL.cpp b/src/libANGLE/validationCL.cpp
index 35dccdcdf7..523f8e26ec 100644
--- a/src/libANGLE/validationCL.cpp
+++ b/src/libANGLE/validationCL.cpp
@@ -2207,6 +2207,26 @@ cl_int ValidateEnqueueNDRangeKernel(cl_command_queue command_queue,
return CL_INVALID_GLOBAL_OFFSET;
}
+ // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not match the required
+ // work-group size for kernel in the program source.
+ size_t compileWorkGroupSize[3] = {0, 0, 0};
+ if (IsError(krnl.getWorkGroupInfo(const_cast<cl_device_id>(device.getNative()),
+ KernelWorkGroupInfo::CompileWorkGroupSize,
+ sizeof(compileWorkGroupSize), compileWorkGroupSize, nullptr)))
+ {
+ return CL_INVALID_VALUE;
+ }
+ if (local_work_size != nullptr)
+ {
+ for (cl_uint i = 0; i < work_dim; ++i)
+ {
+ if (compileWorkGroupSize[i] != 0 && local_work_size[i] != compileWorkGroupSize[i])
+ {
+ return CL_INVALID_WORK_GROUP_SIZE;
+ }
+ }
+ }
+
// CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values
// specified in global_work_size[0] ... global_work_size[work_dim - 1] are 0.
// Returning this error code under these circumstances is deprecated by version 2.1.