diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h | 10 |
1 files changed, 6 insertions, 4 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 83c449cf1..195267ce8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,8 +174,11 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); - int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - return numext::mini(max_threads, numext::maxi(1, threads)); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini<double>(threads, GenericNumTraits<int>::highest()); + return numext::mini(max_threads, + numext::maxi<int>(1, static_cast<int>(threads))); } // taskSize assesses parallel task size. @@ -186,14 +189,13 @@ class TensorCostModel { return totalCost(output_size, cost_per_coeff) / kTaskSize; } - private: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( double output_size, const TensorOpCost& cost_per_coeff) { // Cost of memory fetches from L2 cache. 64 is typical cache line size. // 11 is L2 cache latency on Haswell. // We don't know whether data is in L1, L2 or L3. But we are most interested // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not intersting + // is too small for parallelization, larger time is not interesting // either because we are probably using all available threads already). // And for the target time range, L2 seems to be what matters. Data set // fitting into L1 is too small to take noticeable time. Data set fitting |