aboutsummaryrefslogtreecommitdiff
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
diff options
context:
space:
mode:
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h198
1 files changed, 76 insertions, 122 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 37c1d1c3d..1655a813e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -2,7 +2,6 @@
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@@ -17,23 +16,50 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#ifdef __CUDA_ARCH__
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
- gpu_assert(threadIdx.z == 0);
- return blockIdx.x * blockDim.x + threadIdx.x
- + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+ assert(threadIdx.z == 0);
+ return clock64() +
+ blockIdx.x * blockDim.x + threadIdx.x +
+ gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+
+#elif defined _WIN32
+ // Use the current time as a baseline.
+ SYSTEMTIME st;
+ GetSystemTime(&st);
+ int time = st.wSecond + 1000 * st.wMilliseconds;
+ // Mix in a random number to make sure that we get different seeds if
+ // we try to generate seeds faster than the clock resolution.
+ // We need 2 random values since the generator only generate 16 bits at
+ // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx)
+ int rnd1 = ::rand();
+ int rnd2 = ::rand();
+ uint64_t rnd = (rnd1 | rnd2 << 16) ^ time;
+ return rnd;
+
+#elif defined __APPLE__
+ // Same approach as for win32, except that the random number generator
+ // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random).
+ uint64_t rnd = ::random() ^ mach_absolute_time();
+ return rnd;
+
#else
- // Rely on Eigen's random implementation.
- return random<uint64_t>();
+ // Augment the current time with pseudo random number generation
+ // to ensure that we get different seeds if we try to generate seeds
+ // faster than the clock resolution.
+ timespec ts;
+ clock_gettime(CLOCK_REALTIME, &ts);
+ uint64_t rnd = ::random() ^ ts.tv_nsec;
+ return rnd;
#endif
}
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
// TODO: Unify with the implementation in the non blocking thread pool.
uint64_t current = *state;
// Update the internal state
- *state = current * 6364136223846793005ULL + (stream << 1 | 1);
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
// Generate the random output (using the PCG-XSH-RS scheme)
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
@@ -47,42 +73,34 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
- unsigned rnd = PCG_XSH_RS_generator(state, stream);
+T RandomToTypeUniform(uint64_t* state) {
+ unsigned rnd = PCG_XSH_RS_generator(state);
return static_cast<T>(rnd);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
- // Generate 10 random bits for the mantissa, merge with exponent.
- unsigned rnd = PCG_XSH_RS_generator(state, stream);
- const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
- Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
+ Eigen::half result;
+ // Generate 10 random bits for the mantissa
+ unsigned rnd = PCG_XSH_RS_generator(state);
+ result.x = static_cast<uint16_t>(rnd & 0x3ffu);
+ // Set the exponent
+ result.x |= (static_cast<uint16_t>(15) << 10);
// Return the final result
return result - Eigen::half(1.0f);
}
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
-
- // Generate 7 random bits for the mantissa, merge with exponent.
- unsigned rnd = PCG_XSH_RS_generator(state, stream);
- const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
- Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
- // Return the final result
- return result - Eigen::bfloat16(1.0f);
-}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
+float RandomToTypeUniform<float>(uint64_t* state) {
typedef union {
uint32_t raw;
float fp;
} internal;
internal result;
// Generate 23 random bits for the mantissa mantissa
- const unsigned rnd = PCG_XSH_RS_generator(state, stream);
+ const unsigned rnd = PCG_XSH_RS_generator(state);
result.raw = rnd & 0x7fffffu;
// Set the exponent
result.raw |= (static_cast<uint32_t>(127) << 23);
@@ -91,7 +109,7 @@ float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
+double RandomToTypeUniform<double>(uint64_t* state) {
typedef union {
uint64_t raw;
double dp;
@@ -100,9 +118,9 @@ double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
result.raw = 0;
// Generate 52 random bits for the mantissa
// First generate the upper 20 bits
- unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
+ unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
// The generate the lower 32 bits
- unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
+ unsigned rnd2 = PCG_XSH_RS_generator(state);
result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
// Set the exponent
result.raw |= (static_cast<uint64_t>(1023) << 52);
@@ -111,14 +129,14 @@ double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
- return std::complex<float>(RandomToTypeUniform<float>(state, stream),
- RandomToTypeUniform<float>(state, stream));
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeUniform<float>(state),
+ RandomToTypeUniform<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
- return std::complex<double>(RandomToTypeUniform<double>(state, stream),
- RandomToTypeUniform<double>(state, stream));
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeUniform<double>(state),
+ RandomToTypeUniform<double>(state));
}
template <typename T> class UniformRandomGenerator {
@@ -129,42 +147,17 @@ template <typename T> class UniformRandomGenerator {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
- #ifdef EIGEN_USE_SYCL
- // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
- // Therefor, we need two step to initializate the m_state.
- // IN SYCL, the constructor of the functor is s called on the CPU
- // and we get the clock seed here from the CPU. However, This seed is
- //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
- // and only available on the Operator() function (which is called on the GPU).
- // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
- // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
- // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
- // similar to CUDA Therefore, the thread Id injection is not available at this stage.
- //However when the operator() is called the thread ID will be avilable. So inside the opeator,
- // we add the thrreadID, BlockId,... (which is equivalent of i)
- //to the seed and construct the unique m_state per thead similar to cuda.
- m_exec_once =false;
- #endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
const UniformRandomGenerator& other) {
m_state = other.m_state;
- #ifdef EIGEN_USE_SYCL
- m_exec_once =other.m_exec_once;
- #endif
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
- #ifdef EIGEN_USE_SYCL
- if(!m_exec_once) {
- // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
- // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
- m_state += (i * 6364136223846793005ULL);
- m_exec_once =true;
- }
- #endif
- T result = RandomToTypeUniform<T>(&m_state, i);
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeUniform<T>(&local_state);
+ m_state = local_state;
return result;
}
@@ -172,25 +165,16 @@ template <typename T> class UniformRandomGenerator {
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
- #ifdef EIGEN_USE_SYCL
- if(!m_exec_once) {
- // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
- m_state += (i * 6364136223846793005ULL);
- m_exec_once =true;
- }
- #endif
- EIGEN_UNROLL_LOOP
+ uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
- values[j] = RandomToTypeUniform<T>(&m_state, i);
+ values[j] = RandomToTypeUniform<T>(&local_state);
}
+ m_state = local_state;
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
- #ifdef EIGEN_USE_SYCL
- mutable bool m_exec_once;
- #endif
};
template <typename Scalar>
@@ -206,14 +190,14 @@ struct functor_traits<UniformRandomGenerator<Scalar> > {
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
+T RandomToTypeNormal(uint64_t* state) {
// Use the ratio of uniform method to generate numbers following a normal
// distribution. See for example Numerical Recipes chapter 7.3.9 for the
// details.
T u, v, q;
do {
- u = RandomToTypeUniform<T>(state, stream);
- v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
+ u = RandomToTypeUniform<T>(state);
+ v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
const T x = u - T(0.449871);
const T y = numext::abs(v) + T(0.386595);
q = x*x + y * (T(0.196)*y - T(0.25472)*x);
@@ -224,14 +208,14 @@ T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
- return std::complex<float>(RandomToTypeNormal<float>(state, stream),
- RandomToTypeNormal<float>(state, stream));
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+ return std::complex<float>(RandomToTypeNormal<float>(state),
+ RandomToTypeNormal<float>(state));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
- return std::complex<double>(RandomToTypeNormal<double>(state, stream),
- RandomToTypeNormal<double>(state, stream));
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+ return std::complex<double>(RandomToTypeNormal<double>(state),
+ RandomToTypeNormal<double>(state));
}
@@ -242,38 +226,17 @@ template <typename T> class NormalRandomGenerator {
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
- #ifdef EIGEN_USE_SYCL
- // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
- // Therefor, we need two steps to initializate the m_state.
- // IN SYCL, the constructor of the functor is s called on the CPU
- // and we get the clock seed here from the CPU. However, This seed is
- //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
- // and only available on the Operator() function (which is called on the GPU).
- // Therefore, the thread Id injection is not available at this stage. However when the operator()
- //is called the thread ID will be avilable. So inside the opeator,
- // we add the thrreadID, BlockId,... (which is equivalent of i)
- //to the seed and construct the unique m_state per thead similar to cuda.
- m_exec_once =false;
- #endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
const NormalRandomGenerator& other) {
m_state = other.m_state;
-#ifdef EIGEN_USE_SYCL
- m_exec_once=other.m_exec_once;
-#endif
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
- #ifdef EIGEN_USE_SYCL
- if(!m_exec_once) {
- // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
- m_state += (i * 6364136223846793005ULL);
- m_exec_once =true;
- }
- #endif
- T result = RandomToTypeNormal<T>(&m_state, i);
+ uint64_t local_state = m_state + i;
+ T result = RandomToTypeNormal<T>(&local_state);
+ m_state = local_state;
return result;
}
@@ -281,25 +244,16 @@ template <typename T> class NormalRandomGenerator {
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
- #ifdef EIGEN_USE_SYCL
- if(!m_exec_once) {
- // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
- m_state += (i * 6364136223846793005ULL);
- m_exec_once =true;
- }
- #endif
- EIGEN_UNROLL_LOOP
+ uint64_t local_state = m_state + i;
for (int j = 0; j < packetSize; ++j) {
- values[j] = RandomToTypeNormal<T>(&m_state, i);
+ values[j] = RandomToTypeNormal<T>(&local_state);
}
+ m_state = local_state;
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
- #ifdef EIGEN_USE_SYCL
- mutable bool m_exec_once;
- #endif
};