aboutsummaryrefslogtreecommitdiff
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
diff options
context:
space:
mode:
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h')
-rw-r--r--unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h10
1 files changed, 6 insertions, 4 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 83c449cf1..195267ce8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -174,8 +174,11 @@ class TensorCostModel {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
double cost = totalCost(output_size, cost_per_coeff);
- int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
- return numext::mini(max_threads, numext::maxi(1, threads));
+ double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+ // Make sure we don't invoke undefined behavior when we convert to an int.
+ threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+ return numext::mini(max_threads,
+ numext::maxi<int>(1, static_cast<int>(threads)));
}
// taskSize assesses parallel task size.
@@ -186,14 +189,13 @@ class TensorCostModel {
return totalCost(output_size, cost_per_coeff) / kTaskSize;
}
- private:
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
double output_size, const TensorOpCost& cost_per_coeff) {
// Cost of memory fetches from L2 cache. 64 is typical cache line size.
// 11 is L2 cache latency on Haswell.
// We don't know whether data is in L1, L2 or L3. But we are most interested
// in single-threaded computational time around 100us-10ms (smaller time
- // is too small for parallelization, larger time is not intersting
+ // is too small for parallelization, larger time is not interesting
// either because we are probably using all available threads already).
// And for the target time range, L2 seems to be what matters. Data set
// fitting into L1 is too small to take noticeable time. Data set fitting