// Eigen-unsupported 3.3.3 — TensorCostModel.h (Doxygen page-header artifact removed)
00001 // This file is part of Eigen, a lightweight C++ template library 00002 // for linear algebra. 00003 // 00004 // Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com> 00005 // 00006 // This Source Code Form is subject to the terms of the Mozilla 00007 // Public License v. 2.0. If a copy of the MPL was not distributed 00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 00009 00010 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 00011 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 00012 00013 namespace Eigen { 00014 00023 // Class storing the cost of evaluating a tensor expression in terms of the 00024 // estimated number of operand bytes loads, bytes stored, and compute cycles. 00025 class TensorOpCost { 00026 public: 00027 // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple 00028 // model based on minimal reciprocal throughput numbers from Intel or 00029 // Agner Fog's tables would be better than what is there now. 00030 template <typename ArgType> 00031 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { 00032 return internal::functor_traits< 00033 internal::scalar_product_op<ArgType, ArgType> >::Cost; 00034 } 00035 template <typename ArgType> 00036 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { 00037 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost; 00038 } 00039 template <typename ArgType> 00040 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { 00041 return internal::functor_traits< 00042 internal::scalar_quotient_op<ArgType, ArgType> >::Cost; 00043 } 00044 template <typename ArgType> 00045 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { 00046 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost; 00047 } 00048 template <typename SrcType, typename TargetType> 00049 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { 00050 return internal::functor_traits< 00051 internal::scalar_cast_op<SrcType, TargetType> 
>::Cost; 00052 } 00053 00054 EIGEN_DEVICE_FUNC 00055 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} 00056 EIGEN_DEVICE_FUNC 00057 TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) 00058 : bytes_loaded_(bytes_loaded), 00059 bytes_stored_(bytes_stored), 00060 compute_cycles_(compute_cycles) {} 00061 00062 EIGEN_DEVICE_FUNC 00063 TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, 00064 bool vectorized, double packet_size) 00065 : bytes_loaded_(bytes_loaded), 00066 bytes_stored_(bytes_stored), 00067 compute_cycles_(vectorized ? compute_cycles / packet_size 00068 : compute_cycles) { 00069 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); 00070 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); 00071 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); 00072 } 00073 00074 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { 00075 return bytes_loaded_; 00076 } 00077 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { 00078 return bytes_stored_; 00079 } 00080 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { 00081 return compute_cycles_; 00082 } 00083 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( 00084 double load_cost, double store_cost, double compute_cost) const { 00085 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + 00086 compute_cost * compute_cycles_; 00087 } 00088 00089 // Drop memory access component. Intended for cases when memory accesses are 00090 // sequential or are completely masked by computations. 00091 EIGEN_DEVICE_FUNC void dropMemoryCost() { 00092 bytes_loaded_ = 0; 00093 bytes_stored_ = 0; 00094 } 00095 00096 // TODO(rmlarsen): Define min in terms of total cost, not elementwise. 
00097 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( 00098 const TensorOpCost& rhs) const { 00099 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); 00100 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); 00101 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); 00102 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); 00103 } 00104 00105 // TODO(rmlarsen): Define max in terms of total cost, not elementwise. 00106 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( 00107 const TensorOpCost& rhs) const { 00108 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); 00109 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); 00110 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); 00111 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); 00112 } 00113 00114 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( 00115 const TensorOpCost& rhs) { 00116 bytes_loaded_ += rhs.bytes_loaded(); 00117 bytes_stored_ += rhs.bytes_stored(); 00118 compute_cycles_ += rhs.compute_cycles(); 00119 return *this; 00120 } 00121 00122 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { 00123 bytes_loaded_ *= rhs; 00124 bytes_stored_ *= rhs; 00125 compute_cycles_ *= rhs; 00126 return *this; 00127 } 00128 00129 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( 00130 TensorOpCost lhs, const TensorOpCost& rhs) { 00131 lhs += rhs; 00132 return lhs; 00133 } 00134 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( 00135 TensorOpCost lhs, double rhs) { 00136 lhs *= rhs; 00137 return lhs; 00138 } 00139 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( 00140 double lhs, TensorOpCost rhs) { 00141 rhs *= lhs; 00142 return rhs; 00143 } 00144 00145 friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { 00146 return os << 
"[bytes_loaded = " << tc.bytes_loaded() 00147 << ", bytes_stored = " << tc.bytes_stored() 00148 << ", compute_cycles = " << tc.compute_cycles() << "]"; 00149 } 00150 00151 private: 00152 double bytes_loaded_; 00153 double bytes_stored_; 00154 double compute_cycles_; 00155 }; 00156 00157 // TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads 00158 // in [1:max_threads] instead of just switching multi-threading off for small 00159 // work units. 00160 template <typename Device> 00161 class TensorCostModel { 00162 public: 00163 // Scaling from Eigen compute cost to device cycles. 00164 static const int kDeviceCyclesPerComputeCycle = 1; 00165 00166 // Costs in device cycles. 00167 static const int kStartupCycles = 100000; 00168 static const int kPerThreadCycles = 100000; 00169 static const int kTaskSize = 40000; 00170 00171 // Returns the number of threads in [1:max_threads] to use for 00172 // evaluating an expression with the given output size and cost per 00173 // coefficient. 00174 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( 00175 double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { 00176 double cost = totalCost(output_size, cost_per_coeff); 00177 int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; 00178 return numext::mini(max_threads, numext::maxi(1, threads)); 00179 } 00180 00181 // taskSize assesses parallel task size. 00182 // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task 00183 // granularity needs to be increased to mitigate parallelization overheads. 00184 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( 00185 double output_size, const TensorOpCost& cost_per_coeff) { 00186 return totalCost(output_size, cost_per_coeff) / kTaskSize; 00187 } 00188 00189 private: 00190 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( 00191 double output_size, const TensorOpCost& cost_per_coeff) { 00192 // Cost of memory fetches from L2 cache. 
64 is typical cache line size. 00193 // 11 is L2 cache latency on Haswell. 00194 // We don't know whether data is in L1, L2 or L3. But we are most interested 00195 // in single-threaded computational time around 100us-10ms (smaller time 00196 // is too small for parallelization, larger time is not intersting 00197 // either because we are probably using all available threads already). 00198 // And for the target time range, L2 seems to be what matters. Data set 00199 // fitting into L1 is too small to take noticeable time. Data set fitting 00200 // only into L3 presumably will take more than 10ms to load and process. 00201 const double kLoadCycles = 1.0 / 64 * 11; 00202 const double kStoreCycles = 1.0 / 64 * 11; 00203 // Scaling from Eigen compute cost to device cycles. 00204 return output_size * 00205 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, 00206 kDeviceCyclesPerComputeCycle); 00207 } 00208 }; 00209 00210 } // namespace Eigen 00211 00212 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H