diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 198 |
1 files changed, 76 insertions, 122 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 37c1d1c3d..1655a813e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -2,7 +2,6 @@ // for linear algebra. // // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> -// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -17,23 +16,50 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#if defined(EIGEN_GPU_COMPILE_PHASE) +#ifdef __CUDA_ARCH__ // We don't support 3d kernels since we currently only use 1 and // 2d kernels. - gpu_assert(threadIdx.z == 0); - return blockIdx.x * blockDim.x + threadIdx.x - + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); + assert(threadIdx.z == 0); + return clock64() + + blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); + +#elif defined _WIN32 + // Use the current time as a baseline. + SYSTEMTIME st; + GetSystemTime(&st); + int time = st.wSecond + 1000 * st.wMilliseconds; + // Mix in a random number to make sure that we get different seeds if + // we try to generate seeds faster than the clock resolution. + // We need 2 random values since the generator only generate 16 bits at + // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) + int rnd1 = ::rand(); + int rnd2 = ::rand(); + uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; + return rnd; + +#elif defined __APPLE__ + // Same approach as for win32, except that the random number generator + // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). + uint64_t rnd = ::random() ^ mach_absolute_time(); + return rnd; + #else - // Rely on Eigen's random implementation. - return random<uint64_t>(); + // Augment the current time with pseudo random number generation + // to ensure that we get different seeds if we try to generate seeds + // faster than the clock resolution. + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + uint64_t rnd = ::random() ^ ts.tv_nsec; + return rnd; #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { // TODO: Unify with the implementation in the non blocking thread pool. uint64_t current = *state; // Update the internal state - *state = current * 6364136223846793005ULL + (stream << 1 | 1); + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; // Generate the random output (using the PCG-XSH-RS scheme) return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); } @@ -47,42 +73,34 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state, uint64_t stream) { - unsigned rnd = PCG_XSH_RS_generator(state, stream); +T RandomToTypeUniform(uint64_t* state) { + unsigned rnd = PCG_XSH_RS_generator(state); return static_cast<T>(rnd); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) { - // Generate 10 random bits for the mantissa, merge with exponent. - unsigned rnd = PCG_XSH_RS_generator(state, stream); - const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10); - Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits); +Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) { + Eigen::half result; + // Generate 10 random bits for the mantissa + unsigned rnd = PCG_XSH_RS_generator(state); + result.x = static_cast<uint16_t>(rnd & 0x3ffu); + // Set the exponent + result.x |= (static_cast<uint16_t>(15) << 10); // Return the final result return result - Eigen::half(1.0f); } -template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) { - - // Generate 7 random bits for the mantissa, merge with exponent. - unsigned rnd = PCG_XSH_RS_generator(state, stream); - const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7); - Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits); - // Return the final result - return result - Eigen::bfloat16(1.0f); -} template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) { +float RandomToTypeUniform<float>(uint64_t* state) { typedef union { uint32_t raw; float fp; } internal; internal result; // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state, stream); + const unsigned rnd = PCG_XSH_RS_generator(state); result.raw = rnd & 0x7fffffu; // Set the exponent result.raw |= (static_cast<uint32_t>(127) << 23); @@ -91,7 +109,7 @@ float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) { +double RandomToTypeUniform<double>(uint64_t* state) { typedef union { uint64_t raw; double dp; @@ -100,9 +118,9 @@ double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) { result.raw = 0; // Generate 52 random bits for the mantissa // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; + unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state, stream); + unsigned rnd2 = PCG_XSH_RS_generator(state); result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2; // Set the exponent result.raw |= (static_cast<uint64_t>(1023) << 52); @@ -111,14 +129,14 @@ double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) { - return std::complex<float>(RandomToTypeUniform<float>(state, stream), - RandomToTypeUniform<float>(state, stream)); +std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) { + return std::complex<float>(RandomToTypeUniform<float>(state), + RandomToTypeUniform<float>(state)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) { - return std::complex<double>(RandomToTypeUniform<double>(state, stream), - RandomToTypeUniform<double>(state, stream)); +std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) { + return std::complex<double>(RandomToTypeUniform<double>(state), + RandomToTypeUniform<double>(state)); } template <typename T> class UniformRandomGenerator { @@ -129,42 +147,17 @@ template <typename T> class UniformRandomGenerator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); - #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. - // Therefor, we need two step to initializate the m_state. - // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is - //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. - // and only available on the Operator() function (which is called on the GPU). - // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread - // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds - // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction - // similar to CUDA Therefore, the thread Id injection is not available at this stage. - //However when the operator() is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. - m_exec_once =false; - #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( const UniformRandomGenerator& other) { m_state = other.m_state; - #ifdef EIGEN_USE_SYCL - m_exec_once =other.m_exec_once; - #endif } template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - T result = RandomToTypeUniform<T>(&m_state, i); + uint64_t local_state = m_state + i; + T result = RandomToTypeUniform<T>(&local_state); + m_state = local_state; return result; } @@ -172,25 +165,16 @@ template <typename T> class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits<Packet>::size; EIGEN_ALIGN_MAX T values[packetSize]; - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - EIGEN_UNROLL_LOOP + uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform<T>(&m_state, i); + values[j] = RandomToTypeUniform<T>(&local_state); } + m_state = local_state; return internal::pload<Packet>(values); } private: mutable uint64_t m_state; - #ifdef EIGEN_USE_SYCL - mutable bool m_exec_once; - #endif }; template <typename Scalar> @@ -206,14 +190,14 @@ struct functor_traits<UniformRandomGenerator<Scalar> > { template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state, uint64_t stream) { +T RandomToTypeNormal(uint64_t* state) { // Use the ratio of uniform method to generate numbers following a normal // distribution. See for example Numerical Recipes chapter 7.3.9 for the // details. T u, v, q; do { - u = RandomToTypeUniform<T>(state, stream); - v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5)); + u = RandomToTypeUniform<T>(state); + v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5)); const T x = u - T(0.449871); const T y = numext::abs(v) + T(0.386595); q = x*x + y * (T(0.196)*y - T(0.25472)*x); @@ -224,14 +208,14 @@ T RandomToTypeNormal(uint64_t* state, uint64_t stream) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) { - return std::complex<float>(RandomToTypeNormal<float>(state, stream), - RandomToTypeNormal<float>(state, stream)); +std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) { + return std::complex<float>(RandomToTypeNormal<float>(state), + RandomToTypeNormal<float>(state)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) { - return std::complex<double>(RandomToTypeNormal<double>(state, stream), - RandomToTypeNormal<double>(state, stream)); +std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) { + return std::complex<double>(RandomToTypeNormal<double>(state), + RandomToTypeNormal<double>(state)); } @@ -242,38 +226,17 @@ template <typename T> class NormalRandomGenerator { // Uses the given "seed" if non-zero, otherwise uses a random seed. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); - #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. - // Therefor, we need two steps to initializate the m_state. - // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is - //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. - // and only available on the Operator() function (which is called on the GPU). - // Therefore, the thread Id injection is not available at this stage. However when the operator() - //is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. - m_exec_once =false; - #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( const NormalRandomGenerator& other) { m_state = other.m_state; -#ifdef EIGEN_USE_SYCL - m_exec_once=other.m_exec_once; -#endif } template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - T result = RandomToTypeNormal<T>(&m_state, i); + uint64_t local_state = m_state + i; + T result = RandomToTypeNormal<T>(&local_state); + m_state = local_state; return result; } @@ -281,25 +244,16 @@ template <typename T> class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits<Packet>::size; EIGEN_ALIGN_MAX T values[packetSize]; - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - EIGEN_UNROLL_LOOP + uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal<T>(&m_state, i); + values[j] = RandomToTypeNormal<T>(&local_state); } + m_state = local_state; return internal::pload<Packet>(values); } private: mutable uint64_t m_state; - #ifdef EIGEN_USE_SYCL - mutable bool m_exec_once; - #endif }; |