TensorCostModel.h
00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
00005 //
00006 // This Source Code Form is subject to the terms of the Mozilla
00007 // Public License v. 2.0. If a copy of the MPL was not distributed
00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
00009 
00010 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
00011 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
00012 
00013 namespace Eigen {
00014 
00023 // Class storing the cost of evaluating a tensor expression in terms of the
00024 // estimated number of operand bytes loads, bytes stored, and compute cycles.
00025 class TensorOpCost {
00026  public:
00027   // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
00028   // model based on minimal reciprocal throughput numbers from Intel or
00029   // Agner Fog's tables would be better than what is there now.
00030   template <typename ArgType>
00031   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
00032     return internal::functor_traits<
00033         internal::scalar_product_op<ArgType, ArgType> >::Cost;
00034   }
00035   template <typename ArgType>
00036   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
00037     return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
00038   }
00039   template <typename ArgType>
00040   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
00041     return internal::functor_traits<
00042         internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
00043   }
00044   template <typename ArgType>
00045   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
00046     return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
00047   }
00048   template <typename SrcType, typename TargetType>
00049   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
00050     return internal::functor_traits<
00051         internal::scalar_cast_op<SrcType, TargetType> >::Cost;
00052   }
00053 
00054   EIGEN_DEVICE_FUNC
00055   TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
00056   EIGEN_DEVICE_FUNC
00057   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
00058       : bytes_loaded_(bytes_loaded),
00059         bytes_stored_(bytes_stored),
00060         compute_cycles_(compute_cycles) {}
00061 
00062   EIGEN_DEVICE_FUNC
00063   TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
00064                bool vectorized, double packet_size)
00065       : bytes_loaded_(bytes_loaded),
00066         bytes_stored_(bytes_stored),
00067         compute_cycles_(vectorized ? compute_cycles / packet_size
00068                                    : compute_cycles) {
00069     eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
00070     eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
00071     eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
00072   }
00073 
00074   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
00075     return bytes_loaded_;
00076   }
00077   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
00078     return bytes_stored_;
00079   }
00080   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
00081     return compute_cycles_;
00082   }
00083   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
00084       double load_cost, double store_cost, double compute_cost) const {
00085     return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
00086            compute_cost * compute_cycles_;
00087   }
00088 
00089   // Drop memory access component. Intended for cases when memory accesses are
00090   // sequential or are completely masked by computations.
00091   EIGEN_DEVICE_FUNC void dropMemoryCost() {
00092     bytes_loaded_ = 0;
00093     bytes_stored_ = 0;
00094   }
00095 
00096   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
00097   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
00098       const TensorOpCost& rhs) const {
00099     double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
00100     double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
00101     double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
00102     return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
00103   }
00104 
00105   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
00106   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
00107       const TensorOpCost& rhs) const {
00108     double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
00109     double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
00110     double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
00111     return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
00112   }
00113 
00114   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
00115       const TensorOpCost& rhs) {
00116     bytes_loaded_ += rhs.bytes_loaded();
00117     bytes_stored_ += rhs.bytes_stored();
00118     compute_cycles_ += rhs.compute_cycles();
00119     return *this;
00120   }
00121 
00122   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
00123     bytes_loaded_ *= rhs;
00124     bytes_stored_ *= rhs;
00125     compute_cycles_ *= rhs;
00126     return *this;
00127   }
00128 
00129   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
00130       TensorOpCost lhs, const TensorOpCost& rhs) {
00131     lhs += rhs;
00132     return lhs;
00133   }
00134   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
00135       TensorOpCost lhs, double rhs) {
00136     lhs *= rhs;
00137     return lhs;
00138   }
00139   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
00140       double lhs, TensorOpCost rhs) {
00141     rhs *= lhs;
00142     return rhs;
00143   }
00144 
00145   friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
00146     return os << "[bytes_loaded = " << tc.bytes_loaded()
00147               << ", bytes_stored = " << tc.bytes_stored()
00148               << ", compute_cycles = " << tc.compute_cycles() << "]";
00149   }
00150 
00151  private:
00152   double bytes_loaded_;
00153   double bytes_stored_;
00154   double compute_cycles_;
00155 };
00156 
00157 // TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads
00158 // in [1:max_threads] instead of just switching multi-threading off for small
00159 // work units.
00160 template <typename Device>
00161 class TensorCostModel {
00162  public:
00163   // Scaling from Eigen compute cost to device cycles.
00164   static const int kDeviceCyclesPerComputeCycle = 1;
00165 
00166  // Costs in device cycles.
00167   static const int kStartupCycles = 100000;
00168   static const int kPerThreadCycles = 100000;
00169   static const int kTaskSize = 40000;
00170 
00171   // Returns the number of threads in [1:max_threads] to use for
00172   // evaluating an expression with the given output size and cost per
00173   // coefficient.
00174   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
00175       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
00176     double cost = totalCost(output_size, cost_per_coeff);
00177     int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
00178     return numext::mini(max_threads, numext::maxi(1, threads));
00179   }
00180 
00181   // taskSize assesses parallel task size.
00182   // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
00183   // granularity needs to be increased to mitigate parallelization overheads.
00184   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
00185       double output_size, const TensorOpCost& cost_per_coeff) {
00186     return totalCost(output_size, cost_per_coeff) / kTaskSize;
00187   }
00188 
00189  private:
00190   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
00191       double output_size, const TensorOpCost& cost_per_coeff) {
00192     // Cost of memory fetches from L2 cache. 64 is typical cache line size.
00193     // 11 is L2 cache latency on Haswell.
00194     // We don't know whether data is in L1, L2 or L3. But we are most interested
00195     // in single-threaded computational time around 100us-10ms (smaller time
00196     // is too small for parallelization, larger time is not intersting
00197     // either because we are probably using all available threads already).
00198     // And for the target time range, L2 seems to be what matters. Data set
00199     // fitting into L1 is too small to take noticeable time. Data set fitting
00200     // only into L3 presumably will take more than 10ms to load and process.
00201     const double kLoadCycles = 1.0 / 64 * 11;
00202     const double kStoreCycles = 1.0 / 64 * 11;
00203     // Scaling from Eigen compute cost to device cycles.
00204     return output_size *
00205         cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
00206                                   kDeviceCyclesPerComputeCycle);
00207   }
00208 };
00209 
00210 }  // namespace Eigen
00211 
00212 #endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
 All Classes Functions Variables Typedefs Enumerator