Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h')
 unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 190
 1 file changed, 109 insertions(+), 81 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index abdf742c6..b20f80ba2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -54,8 +54,8 @@ class IndexMapper {
}
}
- array<Index, NumDims> cudaInputDimensions;
- array<Index, NumDims> cudaOutputDimensions;
+ array<Index, NumDims> gpuInputDimensions;
+ array<Index, NumDims> gpuOutputDimensions;
array<Index, NumDims> tmp = dimensions;
array<Index, NumDims> ordering;
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -65,8 +65,8 @@ class IndexMapper {
const Index index = i + offset;
ordering[index] = indices[i];
tmp[indices[i]] = -1;
- cudaInputDimensions[index] = input_dims[indices[i]];
- cudaOutputDimensions[index] = dimensions[indices[i]];
+ gpuInputDimensions[index] = input_dims[indices[i]];
+ gpuOutputDimensions[index] = dimensions[indices[i]];
}
int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -75,8 +75,8 @@ class IndexMapper {
for (int i = 0; i < NumDims; ++i) {
if (tmp[i] >= 0) {
ordering[written] = i;
- cudaInputDimensions[written] = input_dims[i];
- cudaOutputDimensions[written] = dimensions[i];
+ gpuInputDimensions[written] = input_dims[i];
+ gpuOutputDimensions[written] = dimensions[i];
++written;
}
}
@@ -89,37 +89,37 @@ class IndexMapper {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims; ++i) {
if (i > NumKernelDims) {
- m_cudaInputStrides[i] =
- m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
- m_cudaOutputStrides[i] =
- m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
+ m_gpuInputStrides[i] =
+ m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
+ m_gpuOutputStrides[i] =
+ m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
} else {
- m_cudaInputStrides[i] = 1;
- m_cudaOutputStrides[i] = 1;
+ m_gpuInputStrides[i] = 1;
+ m_gpuOutputStrides[i] = 1;
}
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
- if (i + 1 < offset) {
- m_cudaInputStrides[i] =
- m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
- m_cudaOutputStrides[i] =
- m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
+ if (static_cast<size_t>(i + 1) < offset) {
+ m_gpuInputStrides[i] =
+ m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
+ m_gpuOutputStrides[i] =
+ m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
} else {
- m_cudaInputStrides[i] = 1;
- m_cudaOutputStrides[i] = 1;
+ m_gpuInputStrides[i] = 1;
+ m_gpuOutputStrides[i] = 1;
}
}
}
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_cudaInputStrides[d];
+ const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d];
- p -= idx * m_cudaInputStrides[d];
+ p -= idx * m_gpuInputStrides[d];
}
inputIndex += p * m_inputStrides[NumKernelDims];
} else {
@@ -128,22 +128,22 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_cudaInputStrides[d];
+ const Index idx = p / m_gpuInputStrides[d];
inputIndex += idx * m_inputStrides[d];
- p -= idx * m_cudaInputStrides[d];
+ p -= idx * m_gpuInputStrides[d];
}
inputIndex += p * m_inputStrides[limit];
}
return inputIndex;
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
Index outputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_cudaOutputStrides[d];
+ const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
- p -= idx * m_cudaOutputStrides[d];
+ p -= idx * m_gpuOutputStrides[d];
}
outputIndex += p * m_outputStrides[NumKernelDims];
} else {
@@ -152,44 +152,44 @@ class IndexMapper {
limit = NumDims - NumKernelDims - 1;
}
for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_cudaOutputStrides[d];
+ const Index idx = p / m_gpuOutputStrides[d];
outputIndex += idx * m_outputStrides[d];
- p -= idx * m_cudaOutputStrides[d];
+ p -= idx * m_gpuOutputStrides[d];
}
outputIndex += p * m_outputStrides[limit];
}
return outputIndex;
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
@@ -197,7 +197,7 @@ class IndexMapper {
k * m_inputStrides[offset + 2];
}
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
? 0
: NumDims - NumKernelDims;
@@ -209,8 +209,8 @@ class IndexMapper {
static const int NumDims = internal::array_size<InputDims>::value;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_cudaInputStrides;
- array<Index, NumDims> m_cudaOutputStrides;
+ array<Index, NumDims> m_gpuInputStrides;
+ array<Index, NumDims> m_gpuOutputStrides;
};
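The renamed mapGpuInputPlaneToTensorInputOffset / mapGpuOutputPlaneToTensorOutputOffset helpers above decompose a linear GPU "plane" index with the gpu strides and re-accumulate it with the tensor strides. A minimal standalone sketch of that decomposition (hypothetical names, column-major case only, not Eigen's actual code):

// Sketch of the stride-decomposition idea used by the mapGpu*PlaneToTensor*Offset
// helpers: peel off one dimension at a time with the GPU-side strides, then
// re-accumulate the index with the tensor strides.
#include <array>
#include <cstddef>

template <int NumDims>
std::ptrdiff_t planeToOffset(std::ptrdiff_t p,
                             const std::array<std::ptrdiff_t, NumDims>& gpuStrides,
                             const std::array<std::ptrdiff_t, NumDims>& tensorStrides,
                             int numKernelDims) {
  std::ptrdiff_t offset = 0;
  for (int d = NumDims - 1; d > numKernelDims; --d) {   // column-major order
    const std::ptrdiff_t idx = p / gpuStrides[d];
    offset += idx * tensorStrides[d];
    p -= idx * gpuStrides[d];
  }
  return offset + p * tensorStrides[numKernelDims];     // remaining contiguous part
}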
@@ -231,6 +231,8 @@ struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<InputXprType>::NumDimensions;
static const int Layout = traits<InputXprType>::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
+ typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
enum {
Flags = 0
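The PointerType member added to the traits above selects between the input and kernel expressions' pointer types at compile time. A minimal sketch of the same select-a-type-by-flag pattern, written with std::conditional and placeholder types rather than Eigen's internal traits machinery:

// Sketch only: compile-time type selection in the spirit of the PointerType
// typedef above. InputPtr/KernelPtr/promote_input are placeholders, not Eigen names.
#include <type_traits>

struct InputPtr  { };   // stands in for the input expression's pointer type
struct KernelPtr { };   // stands in for the kernel expression's pointer type

template <bool promote_input>
struct ConvTraits {
  // If the input scalar "wins" the promotion, expose its pointer type,
  // otherwise fall back to the kernel expression's pointer type.
  typedef typename std::conditional<promote_input, InputPtr, KernelPtr>::type PointerType;
};

static_assert(std::is_same<ConvTraits<true>::PointerType,  InputPtr>::value,  "input wins");
static_assert(std::is_same<ConvTraits<false>::PointerType, KernelPtr>::value, "kernel wins");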
@@ -300,17 +302,25 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
enum {
- IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
+ IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
+ PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
+ BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -374,12 +384,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
m_inputImpl.evalSubExprsIfNeeded(NULL);
preloadKernel();
return true;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+ EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_local_kernel) {
m_device.deallocate((void*)m_kernel);
@@ -465,7 +475,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
PacketSize));
}
- EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -521,11 +531,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_local_kernel = false;
} else {
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
+ const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
m_kernel = local;
m_local_kernel = true;
@@ -544,14 +554,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
KernelArgType m_kernelArg;
const Scalar* m_kernel;
bool m_local_kernel;
- const Device& m_device;
+ const Device EIGEN_DEVICE_REF m_device;
};
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
template <int StaticKernelSize>
struct GetKernelSize {
@@ -568,13 +578,17 @@ struct GetKernelSize<Dynamic> {
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSize>
-__global__ void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int kernelSize, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
extern __shared__ float s[];
+#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -586,18 +600,18 @@ __global__ void EigenConvolutionKernel1D(
for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
// Load inputs to shared memory
- const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.y * num_x_input;
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
s[i + plane_kernel_offset] = eval.coeff(tensor_index);
}
__syncthreads();
// Compute the convolution
- const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@@ -607,7 +621,7 @@ __global__ void EigenConvolutionKernel1D(
for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
result += s[k + kernel_offset] * kernel[k];
}
- const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
buffer[tensor_index] = result;
}
__syncthreads();
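Each of the convolution kernels in this patch declares its dynamic shared memory through HIP_DYNAMIC_SHARED when compiled with hipcc and falls back to extern __shared__ under nvcc. A standalone sketch of that guarded declaration in a hypothetical kernel (gpu_scale is not part of this file):

// Sketch only: the guarded dynamic shared-memory declaration used in the
// convolution kernels above, shown in a minimal kernel.
__global__ void gpu_scale(const float* in, float* out, int n, float factor) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)        // hipcc: macro expands to the HIP idiom
#else
  extern __shared__ float s[];        // nvcc: classic dynamic shared memory
#endif
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  s[threadIdx.x] = (i < n) ? in[i] * factor : 0.f;   // stage through shared memory
  __syncthreads();
  if (i < n) out[i] = s[threadIdx.x];
}
// Launched with an explicit dynamic shared-memory size, e.g.
//   gpu_scale<<<grid, block, block.x * sizeof(float), stream>>>(in, out, n, 2.f);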
@@ -616,14 +630,18 @@ __global__ void EigenConvolutionKernel1D(
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
indexMapper,
const float* __restrict kernel, const int numPlanes, const int numX,
const int maxX, const int numY, const int maxY, const int kernelSizeX,
const int kernelSizeY, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
extern __shared__ float s[];
+#endif
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -640,7 +658,7 @@ __global__ void EigenConvolutionKernel2D(
for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
- const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = threadIdx.z * num_y_input;
// Load inputs to shared memory
@@ -649,7 +667,7 @@ __global__ void EigenConvolutionKernel2D(
const int input_offset = num_x_input * (j + plane_kernel_offset);
#pragma unroll
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
s[i + input_offset] = eval.coeff(tensor_index);
}
}
@@ -657,7 +675,7 @@ __global__ void EigenConvolutionKernel2D(
__syncthreads();
// Convolution
- const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
#pragma unroll
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -673,7 +691,7 @@ __global__ void EigenConvolutionKernel2D(
result += s[k + input_offset] * kernel[k + kernel_offset];
}
}
- const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
buffer[tensor_index] = result;
}
}
@@ -683,7 +701,7 @@ __global__ void EigenConvolutionKernel2D(
};
template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
indexMapper,
@@ -691,7 +709,11 @@ __global__ void EigenConvolutionKernel3D(
const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
const size_t kernelSizeZ, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
extern __shared__ float s[];
+#endif
// Load inputs to shared memory
const int first_x = blockIdx.x * maxX;
@@ -708,13 +730,13 @@ __global__ void EigenConvolutionKernel3D(
for (int p = 0; p < numPlanes; ++p) {
- const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
}
}
@@ -726,7 +748,7 @@ __global__ void EigenConvolutionKernel3D(
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
- const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -739,7 +761,7 @@ __global__ void EigenConvolutionKernel3D(
}
}
}
- const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
buffer[tensor_index] = result;
}
}
@@ -764,13 +786,19 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
enum {
IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
- EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ TensorEvaluator(const XprType& op, const GpuDevice& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -852,9 +880,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
const int maxSharedMem = m_device.sharedMemPerBlock();
- const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
- const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
- const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
+ const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
+ const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
+ const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
const int warpSize = 32;
switch (NumKernelDims) {
@@ -889,7 +917,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
}
const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
+ gpu_assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
@@ -906,15 +934,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
m_inputImpl.dimensions(), kernel_dims, indices);
switch(kernel_size) {
case 4: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
break;
}
case 7: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
}
}
break;
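LAUNCH_GPU_KERNEL replaces LAUNCH_CUDA_KERNEL so the same call sites compile under both nvcc and hipcc. A sketch of what such a dispatch macro can look like; Eigen's actual macro (defined with the other GPU/HIP compatibility macros) also checks the launch result, which is omitted here:

// Sketch only: a portable launch macro in the spirit of LAUNCH_GPU_KERNEL.
// MY_LAUNCH_GPU_KERNEL is a hypothetical name; the argument order matches the
// call sites above (kernel, grid, block, shared mem, device, kernel args...).
#if defined(EIGEN_HIPCC)
#define MY_LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  hipLaunchKernelGGL(kernel, gridsize, blocksize, sharedmem, (device).stream(), __VA_ARGS__)
#else
#define MY_LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__)
#endif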
@@ -946,7 +974,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
+ gpu_assert(shared_mem <= maxSharedMem);
const int num_x_blocks = ceil(numX, maxX);
const int num_y_blocks = ceil(numY, maxY);
@@ -967,11 +995,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 4: {
switch (kernel_size_y) {
case 7: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
break;
}
}
@@ -980,18 +1008,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
case 7: {
switch (kernel_size_y) {
case 4: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
break;
}
}
break;
}
default: {
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
break;
}
}
@@ -1026,7 +1054,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
- assert(shared_mem <= maxSharedMem);
+ gpu_assert(shared_mem <= maxSharedMem);
//cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
@@ -1037,7 +1065,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
m_inputImpl.dimensions(), kernel_dims, indices);
- LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
break;
}