Eigen-unsupported 3.3.3
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H

// evaluator for thread pool device
#ifdef EIGEN_USE_THREADS

namespace Eigen {

#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
namespace internal {

template<typename LhsScalar, typename LhsMapper, typename Index>
struct packLhsArg {
  LhsScalar* blockA;
  const LhsMapper& lhs;
  const Index m_start;
  const Index k_start;
  const Index mc;
  const Index kc;
};

template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
struct packRhsAndKernelArg {
  const MaxSizeVector<LhsScalar*>* blockAs;
  RhsScalar* blockB;
  const RhsMapper& rhs;
  OutputMapper& output;
  const Index m;
  const Index k;
  const Index n;
  const Index mc;
  const Index kc;
  const Index nc;
  const Index num_threads;
  const Index num_blockAs;
  const Index max_m;
  const Index k_block_idx;
  const Index m_block_idx;
  const Index n_block_idx;
  const Index m_blocks;
  const Index n_blocks;
  MaxSizeVector<Notification*>* kernel_notifications;
  const MaxSizeVector<Notification*>* lhs_notifications;
  const bool need_to_pack;
};

}  // end namespace internal
#endif  // EIGEN_USE_SIMPLE_THREAD_POOL

template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {

  typedef ThreadPoolDevice Device;

  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
  typedef TensorContractionEvaluatorBase<Self> Base;

  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::Index Index;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;

  enum {
    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
  };

  // Most of the code assumes that both input tensors are ColMajor. If the
  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
  // if we want to compute A * B = C, where A is LHS and B is RHS, the code
  // will pretend B is LHS and A is RHS.
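  // For example, with RowMajor inputs computing C = A * B, reinterpreting the
  // row-major buffers as column-major views yields A^T and B^T, and the
  // identity C^T = B^T * A^T means the same column-major kernel can be reused
  // with the roles of the operands swapped, as done by the typedefs below.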
  typedef typename internal::conditional<
      static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
  typedef typename internal::conditional<
      static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;

  static const int LDims =
      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
  static const int RDims =
      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
  static const int ContractDims = internal::array_size<Indices>::value;

  typedef array<Index, LDims> left_dim_mapper_t;
  typedef array<Index, RDims> right_dim_mapper_t;

  typedef array<Index, ContractDims> contract_t;
  typedef array<Index, LDims - ContractDims> left_nocontract_t;
  typedef array<Index, RDims - ContractDims> right_nocontract_t;

  static const int NumDims = LDims + RDims - 2 * ContractDims;

  typedef DSizes<Index, NumDims> Dimensions;

  // typedefs needed in evalTo
  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;

  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;

  TensorEvaluator(const XprType& op, const Device& device) :
      Base(op, device) {}

#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
            bool rhs_inner_dim_reordered, int Alignment>
  void evalProduct(Scalar* buffer) const {
    typedef
        typename internal::remove_const<typename EvalLeftArgType::Scalar>::type
            LhsScalar;
    typedef
        typename internal::remove_const<typename EvalRightArgType::Scalar>::type
            RhsScalar;
    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
    typedef internal::TensorContractionInputMapper<
        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
        contract_t, internal::packet_traits<LhsScalar>::size,
        lhs_inner_dim_contiguous, false, Unaligned>
        LhsMapper;
    typedef internal::TensorContractionInputMapper<
        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
        contract_t, internal::packet_traits<RhsScalar>::size,
        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
        RhsMapper;
    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
    typedef internal::gemm_pack_lhs<LhsScalar, Index,
                                    typename LhsMapper::SubMapper, Traits::mr,
                                    Traits::LhsProgress, ColMajor>
        LhsPacker;
    typedef internal::gemm_pack_rhs<
        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
        RhsPacker;
    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
                                  Traits::mr, Traits::nr, false, false>
        GebpKernel;

    const Index m = this->m_i_size;
    const Index n = this->m_j_size;
    const Index k = this->m_k_size;
    if (m == 0 || n == 0 || k == 0) return;

    // Compute a set of algorithm parameters:
    // - kernel block sizes (bm, bn, bk)
    // - task grain sizes (number of kernels executed per task: gm, gn)
    // - number of threads
    // - sharding by row/column
    // - parallel packing or first lhs then rhs
    // and some derived parameters:
    // - number of tasks (nm, nn, nk)
    // - number of kernels (nm0, nn0)
    // Unfortunately, all these parameters are tightly interdependent.
    // So in some cases we first compute approximate values, then compute other
    // values based on these approximations and then refine the approximations.

    // There are lots of heuristics here. There is some reasoning behind them,
    // but ultimately they are just tuned on contraction benchmarks for
    // different input configurations, thread counts and instruction sets.
    // So feel free to question any of them.

    // Compute whether we want to shard by row or by column.
    // This is a first approximation; it will be refined later. Since we don't
    // know the number of threads yet, we use 2, because what we are most
    // interested in at this point is whether it makes sense to use
    // parallelization at all or not.
    bool shard_by_col = shardByCol(m, n, 2);

    // First approximation of kernel blocking sizes.
    // Again, we don't know the number of threads yet, so we use 2.
    Index bm, bn, bk;
    if (shard_by_col) {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByCol>
          blocking(k, m, n, 2);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    } else {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByRow>
          blocking(k, m, n, 2);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    }

    // Compute the optimal number of threads.
    // Note: we use bk instead of k here because we are interested in the amount
    // of _parallelizable_ computations, and computations are not parallelizable
    // across the k dimension.
    const TensorOpCost cost =
        contractionCost(m, n, bm, bn, bk, shard_by_col, false);
    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        static_cast<double>(n) * m, cost, this->m_device.numThreads());

    // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
    // model is not tuned. Remove this when the cost model is tuned.
    if (n == 1) num_threads = 1;

    if (num_threads == 1) {
      // The single-threaded algorithm should be faster in this case.
      if (n == 1)
        this->template evalGemv<lhs_inner_dim_contiguous,
                                rhs_inner_dim_contiguous,
                                rhs_inner_dim_reordered, Alignment>(buffer);
      else
        this->template evalGemm<lhs_inner_dim_contiguous,
                                rhs_inner_dim_contiguous,
                                rhs_inner_dim_reordered, Alignment>(buffer);
      return;
    }

    // Now that we know the number of threads, recalculate sharding and blocking.
    shard_by_col = shardByCol(m, n, num_threads);
    if (shard_by_col) {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByCol>
          blocking(k, m, n, num_threads);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    } else {
      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                          internal::ShardByRow>
          blocking(k, m, n, num_threads);
      bm = blocking.mc();
      bn = blocking.nc();
      bk = blocking.kc();
    }

    // Number of kernels for each dimension.
    Index nm0 = divup(m, bm);
    Index nn0 = divup(n, bn);
    Index nk = divup(k, bk);

    // Calculate task grain size (number of kernels executed per task).
    // This task size coarsening serves two purposes:
    // 1. It reduces per-task overheads, including synchronization overheads.
    // 2. It allows better cache use (reuse of the same packed rhs in several
    //    consecutive kernels).
    Index gm = 1;
    Index gn = 1;
    // If we are sharding by column, then we prefer to reduce rows first.
    if (shard_by_col) {
      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
    } else {
      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
    }
    // Number of tasks in each dimension.
    Index nm = divup(nm0, gm);
    Index nn = divup(nn0, gn);

    // Last but not least, decide whether we want to issue both lhs and rhs
    // packing in parallel, or issue lhs packing first and then issue rhs
    // packing when lhs packing completes (for !shard_by_col lhs and rhs are
    // swapped). Parallel packing allows more parallelism (for both packing and
    // kernels), while sequential packing provides better locality (once
    // a thread finishes rhs packing it proceeds to kernels with that rhs).
    // First, we are interested in parallel packing if there are few tasks.
    bool parallel_pack = num_threads >= nm * nn;
    // Also do parallel packing if all data fits into L2$.
    if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
        l2CacheSize() * num_threads)
      parallel_pack = true;
    // But don't do it if we will use each rhs only once. Locality seems to be
    // more important in this case.
    if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;

    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
                  this->m_i_strides, this->m_left_contracting_strides,
                  this->m_k_strides);

    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
                  this->m_j_strides, this->m_right_contracting_strides,
                  this->m_k_strides);

    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
            OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
                          shard_by_col, parallel_pack)
        .run();
  }

  // Context coordinates a single parallel gemm operation.
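  // Roughly, the flow is: run() signals the switch to slice 0, which enqueues
  // lhs/rhs packing for that slice; each packing task signals the kernels that
  // depend on it; each kernel signals the corresponding kernel of the next
  // k slice and the switch to slice k + 2; once the final slice drains,
  // done_ is notified and run() returns.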
  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
            typename LhsMapper, typename RhsMapper, typename OutputMapper>
  class Context {
   public:
    Context(const Device& device, int num_threads, LhsMapper& lhs,
            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
            Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
            Index gn, Index nm0, Index nn0, bool shard_by_col,
            bool parallel_pack)
        : device_(device),
          lhs_(lhs),
          rhs_(rhs),
          buffer_(buffer),
          output_(buffer, tm),
          num_threads_(num_threads),
          shard_by_col_(shard_by_col),
          parallel_pack_(parallel_pack),
          m_(tm),
          n_(tn),
          k_(tk),
          bm_(bm),
          bn_(bn),
          bk_(bk),
          nm_(nm),
          nn_(nn),
          nk_(nk),
          gm_(gm),
          gn_(gn),
          nm0_(nm0),
          nn0_(nn0)
    {
      for (Index x = 0; x < P; x++) {
        // The normal number of notifications for a k slice switch is
        // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive
        // only nm_ + nn_ notifications, because they will not receive
        // notifications from preceding kernels.
        state_switch_[x] =
            x == 0
                ? 1
                : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
                      (x == P - 1 ? nm_ * nn_ : 0);
        state_packing_ready_[x] =
            parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
        state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
        for (Index m = 0; m < nm_; m++) {
          state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
          // Kernels generally receive 3 notifications (previous kernel + 2
          // packing), but the first slice won't get notifications from
          // previous kernels.
          for (Index n = 0; n < nn_; n++)
            state_kernel_[x][m][n].store(
                (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
                std::memory_order_relaxed);
        }
      }

      // Allocate memory for packed rhs/lhs matrices.
      size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
      size_t lhs_size =
          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
      size_t rhs_size =
          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
      char* mem = static_cast<char*>(packed_mem_);
      for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
        packed_lhs_[x].resize(nm0_);
        for (Index m = 0; m < nm0_; m++) {
          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
          mem += lhs_size;
        }
        packed_rhs_[x].resize(nn0_);
        for (Index n = 0; n < nn0_; n++) {
          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
          mem += rhs_size;
        }
      }
    }

    ~Context() {
      for (Index x = 0; x < P; x++) {
        for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
        delete[] state_kernel_[x];
      }
      internal::aligned_free(packed_mem_);
    }

    void run() {
      // Kick off packing of the first slice.
      signal_switch(0, 1);
      // Wait for overall completion.
      // TODO(dvyukov): this wait can lead to deadlock.
      // If nthreads contractions are concurrently submitted from worker
      // threads, this wait will block all worker threads and the system will
      // deadlock.
      done_.Wait();
    }

   private:
    Notification done_;
    const Device& device_;
    LhsMapper& lhs_;
    RhsMapper& rhs_;
    Scalar* const buffer_;
    OutputMapper output_;
    const int num_threads_;
    const bool shard_by_col_;
    const bool parallel_pack_;
    // Matrix sizes.
    const Index m_;
    const Index n_;
    const Index k_;
    // Block sizes.
    const Index bm_;
    const Index bn_;
    const Index bk_;
    // Number of tasks.
    const Index nm_;
    const Index nn_;
    const Index nk_;
    // Task grain sizes (number of kernels executed per task).
    const Index gm_;
    const Index gn_;
    // Number of blocks (this is different from nm_/nn_ because of task size
    // coarsening).
    const Index nm0_;
    const Index nn0_;

    // Parallelization strategy.
    //
    // Blocks related to the same k block can run in parallel because they write
    // to different output blocks. So we parallelize within k slices; this
    // gives us a parallelism level of m x n. Before we can start any kernels
    // related to the k-th slice, we need to issue m lhs packing tasks and n rhs
    // packing tasks.
    //
    // However, there is a bottleneck when we are finishing kernels for the k-th
    // slice (at the very end there is only 1 runnable kernel). To mitigate this
    // bottleneck we allow kernels from the k-th and (k+1)-th slices to run in
    // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
    // output block, so they must not run in parallel.
    //
    // This gives us the following dependency graph.
    // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
    // packing tasks.
    // Kernel (m, n, k) can start when:
    //  - kernel (m, n, k-1) has finished
    //  - lhs packing (m, k) has finished
    //  - rhs packing (n, k) has finished
    // Lhs/rhs packing can start when:
    //  - all k-1 packing has finished (artificially imposed to limit the
    //    amount of parallel packing)
    //
    // On top of that we limit runnable tasks to two consecutive k slices.
    // This is done to limit the amount of memory we need for packed lhs/rhs
    // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
    //
    // state_switch_ tracks when we are ready to switch to the next k slice.
    // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
    // These variables roll over 3 consecutive k slices: the first two we are
    // actively executing, plus one to track completion of kernels in the
    // second slice.
    static const Index P = 3;
    void* packed_mem_;
    std::vector<LhsScalar*> packed_lhs_[P - 1];
    std::vector<RhsScalar*> packed_rhs_[P - 1];
    std::atomic<uint8_t>** state_kernel_[P];
    // state_switch_ is frequently modified by worker threads, while other
    // fields are read-only after the constructor. Let's move it to a separate
    // cache line to reduce cache-coherency traffic.
    char pad_[128];
    std::atomic<Index> state_packing_ready_[P];
    std::atomic<Index> state_switch_[P];

    void pack_lhs(Index m, Index k) {
      const Index mend = m * gm_ + gm(m);
      for (Index m1 = m * gm_; m1 < mend; m1++)
        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));

      if (!parallel_pack_ && shard_by_col_) {
        signal_packing(k);
      } else {
        signal_switch(k + 1);
        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
      }
    }

    void pack_rhs(Index n, Index k) {
      const Index nend = n * gn_ + gn(n);
      for (Index n1 = n * gn_; n1 < nend; n1++) {
        if (k == 0) {
          // Zero the output memory in parallel.
          // On a 10000x2x10000 mm, zeroing can easily take half of the time.
          // Zero a (bn x m) row. Safe to do here because all kernels that will
          // write to this memory depend on completion of this task.
          // Note: don't call device_.memset() here. device_.memset() blocks on
          // a thread pool worker thread, which can lead to underutilization
          // and deadlocks.
          memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
        }
        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
      }

      if (parallel_pack_ || shard_by_col_) {
        signal_switch(k + 1);
        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
      } else {
        signal_packing(k);
      }
    }

    void kernel(Index m, Index n, Index k) {
      // Note: the order of iteration matters here. Iteration over m is
      // innermost because we want to reuse the same packed rhs in consecutive
      // tasks (rhs fits into L2$ while lhs only fits into L3$).
      const Index nend = n * gn_ + gn(n);
      const Index mend = m * gm_ + gm(m);
      if (shard_by_col_) {
        for (Index n1 = n * gn_; n1 < nend; n1++) {
          for (Index m1 = m * gm_; m1 < mend; m1++)
            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
                         packed_lhs_[k % (P - 1)][m1],
                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
                         Scalar(1), -1, -1, 0, 0);
        }
      } else {
        for (Index m1 = m * gm_; m1 < mend; m1++)
          for (Index n1 = n * gn_; n1 < nend; n1++) {
            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
                         packed_lhs_[k % (P - 1)][m1],
                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
                         Scalar(1), -1, -1, 0, 0);
          }
      }
      signal_kernel(m, n, k + 1, false);
      signal_switch(k + 2);
    }

    void signal_packing(Index k) {
      eigen_assert(!parallel_pack_);
      Index s = state_packing_ready_[k % P].fetch_sub(1);
      eigen_assert(s > 0);
      if (s != 1) return;
      state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
      enqueue_packing(k, shard_by_col_);
    }

    void signal_kernel(Index m, Index n, Index k, bool sync) {
      std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
      Index s = state->load();
      eigen_assert(s > 0);
      if (s != 1 && state->fetch_sub(1) != 1) return;
      state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
      if (sync)
        kernel(m, n, k);
      else
        device_.enqueueNoNotification([=]() { kernel(m, n, k); });
    }

    void signal_switch(Index k, Index v = 1) {
      Index s = state_switch_[k % P].fetch_sub(v);
      eigen_assert(s >= v);
      if (s != v) return;

      // Ready to switch to the next k slice.
      // Reset the counter for the next iteration.
      state_switch_[k % P] =
          (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
          nm_ * nn_;
      if (k < nk_) {
        // Issue lhs/rhs packing. Their completion will in turn kick off
        // kernels.
        if (parallel_pack_) {
          enqueue_packing(k, !shard_by_col_);
          enqueue_packing(k, shard_by_col_);
        } else if (shard_by_col_) {
          enqueue_packing(k, false);
        } else {
          enqueue_packing(k, true);
        }

        // Termination handling.
        // Because kernel completion signals the k + 2 switch, we need to
        // finish nk + 2 slices without issuing any tasks on the nk + 1 slice.
        // So here we pretend that all nk + 1 packing tasks just finish
        // instantly, so that the nk + 2 switch only waits for completion of
        // the nk kernels.
      } else if (k == nk_) {
        signal_switch(k + 1,
                      parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
      } else {
        done_.Notify();
      }
    }

    // Enqueue all rhs/lhs packing for the k-th slice.
    void enqueue_packing(Index k, bool rhs) {
      enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
    }

    void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
      if (end - start == 1) {
        if (rhs)
          pack_rhs(start, k);
        else
          pack_lhs(start, k);
      } else {
        Index mid = (start + end) / 2;
        device_.enqueueNoNotification(
            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
        device_.enqueueNoNotification(
            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
      }
    }

    // Block sizes, accounting for a potentially incomplete last block.
    Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
    Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
    Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
    // Task grain sizes, accounting for a potentially incomplete last task.
    Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
    Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }

    Context(const Context&) = delete;
    void operator=(const Context&) = delete;
  };

  // Decide whether we want to shard an m x n contraction by columns or by rows.
  static bool shardByCol(Index m, Index n, Index num_threads) {
    // Note: we are comparing both n and m against Traits::nr; it is not
    // a mistake. We are trying to figure out how both n and m will fit into
    // the main sharding dimension.

    // Sharding by column is the default
    // ... unless there is enough data for vectorization over rows
    if (m / num_threads >= Traits::nr &&
        // and not enough data for vectorization over columns
        (n / num_threads < Traits::nr ||
         // ... or barely enough data for vectorization over columns,
         // but it is not evenly divisible across threads
         (n / num_threads < 4 * Traits::nr &&
          (n % (num_threads * Traits::nr)) != 0 &&
          // ... and it is evenly divisible across threads for rows
          ((m % (num_threads * Traits::nr)) == 0 ||
           // ... or it is not evenly divisible for both dimensions but
           // there is much more data over rows so that corner effects are
           // mitigated.
           (m / n >= 6)))))
      return false;
    // Also shard by rows if the matrix is strongly elongated over rows
    // (m much larger than n).
    if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
    return true;
  }

  Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
                 int num_threads, bool shard_by_col) const {
    Index gm = 1;
    Index gm1 = 1;
    Index nm0 = divup(m, bm);
    Index nm1 = nm0;
    for (;;) {
      // Find the next candidate for the m grain size. It needs to result in a
      // different number of blocks. E.g. if we have 10 kernels, we want to try
      // 5 and 10, but not 6, 7, 8 and 9.
      while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
      if (gm1 > nm0) break;
      // Check the candidate.
      int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
                           shard_by_col);
      if (res < 0) break;
      nm1 = divup(nm0, gm1);
      if (res == 0) continue;
      // Commit the new grain size.
      gm = gm1;
    }
    return gm;
  }

  Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
                 int num_threads, bool shard_by_col) const {
    Index gn = 1;
    Index gn1 = 1;
    Index nn0 = divup(n, bn);
    Index nn1 = nn0;
    for (;;) {
      while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
      if (gn1 > nn0) break;
      int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
                           shard_by_col);
      if (res < 0) break;
      nn1 = divup(nn0, gn1);
      if (res == 0) continue;
      gn = gn1;
    }
    return gn;
  }

  // checkGrain checks whether grain (gm, gn) is suitable and is better than
  // (oldgm, oldgn).
  int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
                 Index gn, Index oldgm, Index oldgn, int num_threads,
                 bool shard_by_col) const {
    const TensorOpCost cost =
        contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
    double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
        static_cast<double>(bm) * gm * bn * gn, cost);
    // If the task is too small, then we accept it regardless of anything
    // else. Otherwise synchronization overheads will dominate.
    if (taskSize < 1) return 1;
    // If it is too large, then we reject it and all larger tasks.
    if (taskSize > 2) return -1;
    // Now we are in a presumably good task size range.
    // The main deciding factor here is parallelism. Consider that we have 12
    // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
    // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
    // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
    // us parallelism of 1 (we can load all cores).
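    // In terms of the formula below: with 6 tasks, parallelism is
    // 6 / (divup(6, 4) * 4) = 6 / 8 = 0.75, while with 4 tasks it is
    // 4 / (divup(4, 4) * 4) = 4 / 4 = 1.0.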
    Index nm0 = divup(m, bm);
    Index nn0 = divup(n, bn);
    Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
    double new_parallelism = static_cast<double>(new_tasks) /
                             (divup<int>(new_tasks, num_threads) * num_threads);
    Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
    double old_parallelism = static_cast<double>(old_tasks) /
                             (divup<int>(old_tasks, num_threads) * num_threads);
    if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
    return 0;
  }

#else  // EIGEN_USE_SIMPLE_THREAD_POOL

  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
  void evalProduct(Scalar* buffer) const {
    if (this->m_j_size == 1) {
      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
      return;
    }

    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
  }

  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
  void evalGemm(Scalar* buffer) const {
    // columns in left side, rows in right side
    const Index k = this->m_k_size;

    // rows in left side
    const Index m = this->m_i_size;

    // columns in right side
    const Index n = this->m_j_size;

    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));

    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;

    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
                                                   LeftEvaluator, left_nocontract_t,
                                                   contract_t, lhs_packet_size,
                                                   lhs_inner_dim_contiguous,
                                                   false, Unaligned> LhsMapper;

    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
                                                   RightEvaluator, right_nocontract_t,
                                                   contract_t, rhs_packet_size,
                                                   rhs_inner_dim_contiguous,
                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;

    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;

    // TODO: packing could be faster sometimes if we supported row major tensor mappers
    typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
                                    Traits::LhsProgress, ColMajor> LhsPacker;
    typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;

    // TODO: replace false, false with conjugate values?
    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
                                  Traits::mr, Traits::nr, false, false> GebpKernel;

    typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
    typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;

    // initialize data mappers
    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
                  this->m_left_contracting_strides, this->m_k_strides);

    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
                  this->m_right_contracting_strides, this->m_k_strides);

    OutputMapper output(buffer, m);

    // compute block sizes (which depend on number of threads)
    const Index num_threads = this->m_device.numThreads();
    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
    Index mc = blocking.mc();
    Index nc = blocking.nc();
    Index kc = blocking.kc();
    eigen_assert(mc <= m);
    eigen_assert(nc <= n);
    eigen_assert(kc <= k);

#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
    const Index k_blocks = CEIL_DIV(k, kc);
    const Index n_blocks = CEIL_DIV(n, nc);
    const Index m_blocks = CEIL_DIV(m, mc);
    const Index sizeA = mc * kc;
    const Index sizeB = kc * nc;

    /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
       cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
       cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
       cout << "num threads: " << num_threads << endl;
    */

    // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
    // aren't 16 byte aligned segfaults will happen due to SIMD instructions
    // note: You can get away with allocating just a single blockA and offsets and meet the
    // alignment requirements with the assumption that
    // (Traits::mr * sizeof(ResScalar)) % 16 == 0
    const Index numBlockAs = numext::mini(num_threads, m_blocks);
    MaxSizeVector<LhsScalar *> blockAs(num_threads);
    for (int i = 0; i < num_threads; i++) {
      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
    }

    // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
    // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
    // Other options: (1) reuse memory when a thread finishes. con: tricky
    //                (2) allocate block B memory in each thread. con: overhead
    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
    for (int i = 0; i < n_blocks; i++) {
      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
    }

    // lhs_notifications starts with all null Notifications
    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);

    // this should really be numBlockAs * n_blocks;
    const Index num_kernel_notifications = num_threads * n_blocks;
    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
                                                      nullptr);

    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
      const Index k_start = k_block_idx * kc;
      // make sure we don't overshoot the right edge of the left matrix
      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;

      for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
        const Index num_blocks = numext::mini(m_blocks - m_block_idx, numBlockAs);

        for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx + num_blocks; mt_block_idx++) {
          const Index m_start = mt_block_idx * mc;
          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
          eigen_assert(actual_mc > 0);

          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;

          for (int i = 0; i < n_blocks; ++i) {
            Index notification_id = (blockAId * n_blocks + i);
            // Wait for any current kernels using this slot to complete
            // before using it.
            if (kernel_notifications[notification_id]) {
              wait_until_ready(kernel_notifications[notification_id]);
              delete kernel_notifications[notification_id];
            }
            kernel_notifications[notification_id] = new Notification();
          }
          const packLArg arg = {
            blockAs[blockAId],  // blockA
            lhs,                // lhs
            m_start,            // m
            k_start,            // k
            actual_mc,          // mc
            actual_kc,          // kc
          };

          // Delete any existing notification since we may be
          // replacing it. The algorithm should ensure that there are
          // no existing waiters on this notification.
          delete lhs_notifications[blockAId];
          lhs_notifications[blockAId] =
              this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
        }

        // now start kernels.
        const Index m_base_start = m_block_idx * mc;
        const bool need_to_pack = m_block_idx == 0;

        for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
          const Index n_start = n_block_idx * nc;
          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;

          // first make sure the previous kernels are all done before overwriting rhs. Also wait if
          // we're going to start a new k. In both cases need_to_pack is true.
          if (need_to_pack) {
            for (Index i = num_blocks; i < num_threads; ++i) {
              Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
              Index future_id = (blockAId * n_blocks + n_block_idx);
              wait_until_ready(kernel_notifications[future_id]);
            }
          }

          packRKArg arg = {
            &blockAs,               // blockA
            blockBs[n_block_idx],   // blockB
            rhs,                    // rhs
            output,                 // output
            m_base_start,           // m
            k_start,                // k
            n_start,                // n
            mc,                     // mc
            actual_kc,              // kc
            actual_nc,              // nc
            num_threads,
            numBlockAs,
            m,
            k_block_idx,
            m_block_idx,
            n_block_idx,            // n_block_idx
            m_blocks,               // m_blocks
            n_blocks,               // n_blocks
            &kernel_notifications,  // kernel notifications
            &lhs_notifications,     // lhs notifications
            need_to_pack,           // need_to_pack
          };

          // We asynchronously kick off this function, which ends up
          // notifying the appropriate kernel_notifications objects,
          // which this thread waits on before exiting.
          this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
        }
      }
    }

    // Make sure all the kernels are done.
    for (size_t i = 0; i < kernel_notifications.size(); ++i) {
      wait_until_ready(kernel_notifications[i]);
      delete kernel_notifications[i];
    }

    // No need to wait for lhs notifications since they should have
    // already been waited on. Just clean them up.
    for (size_t i = 0; i < lhs_notifications.size(); ++i) {
      delete lhs_notifications[i];
    }

    // deallocate all of the memory for both A and B's
    for (size_t i = 0; i < blockAs.size(); i++) {
      this->m_device.deallocate(blockAs[i]);
    }
    for (size_t i = 0; i < blockBs.size(); i++) {
      this->m_device.deallocate(blockBs[i]);
    }

#undef CEIL_DIV
  }

  /*
   * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
   * the LHS block, check that all of the kernels that worked on the same
   * mt_block_idx in the previous m_block are done.
   */
  template <typename packLArg, typename LhsPacker>
  static void packLhs(const packLArg arg) {
    // perform actual packing
    LhsPacker pack_lhs;
    pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
  }

  /*
   * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
   * all kernels in the previous block are done.
   * Then for each LHS future, we wait on the future and then call GEBP
   * on the area packed by the future (which starts at
   * blockA + future_idx * mt * kc) on the LHS and with the full packed
   * RHS block.
   * The output of this GEBP is written to output(m + i * mt, n).
   */
  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
  static void packRhsAndKernel(packRKArg arg) {
    if (arg.need_to_pack) {
      RhsPacker pack_rhs;
      pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
    }

    GebpKernel gebp;
    for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
      const Index m_base_start = arg.m + arg.mc * mt_block_idx;
      if (m_base_start < arg.max_m) {
        Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
        wait_until_ready((*arg.lhs_notifications)[blockAId]);
        const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
        gebp(arg.output.getSubMapper(m_base_start, arg.n),
             (*arg.blockAs)[blockAId], arg.blockB,
             actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);

        // Notify that the kernel is done.
        const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
        (*arg.kernel_notifications)[set_idx]->Notify();
      }
    }
  }
#endif  // EIGEN_USE_SIMPLE_THREAD_POOL

  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                               bool shard_by_col, bool prepacked) const {
    const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
                                          PacketType<RhsScalar, Device>::size);
    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
    const double kd = static_cast<double>(bk);
    // Peak VFMA bandwidth is 0.5. However, if there is not enough data for
    // vectorization, bandwidth drops. The 4.0 and 2.0 bandwidth values were
    // determined experimentally.
    double computeBandwidth = bk == 1 ? 4.0 :
        (shard_by_col ? bn : bm) < Traits::nr ||
        (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
#ifndef EIGEN_VECTORIZE_FMA
    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on the latest Intel processors.
    // However, for MULPS/ADDPS we have a dependent sequence of 2 such
    // instructions, so the overall bandwidth is 1.0.
    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
#endif
    // Computations.
    TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
    // Output stores.
    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
    if (prepacked) {
      // Packing and kernels are executed in different tasks. When we calculate
      // task grain size we look only at the kernel cost, assuming that the
      // kernel is more expensive than packing.
      return cost;
    }
    // Lhs/rhs loads + computations.
    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
    // Lhs packing memory cost does not contribute considerably to overall
    // execution time because lhs is prefetched early and accessed sequentially.
    if (shard_by_col)
      lhsCost.dropMemoryCost();
    else
      rhsCost.dropMemoryCost();
    return cost + lhsCost + rhsCost;
  }
};

}  // end namespace Eigen

#endif  // EIGEN_USE_THREADS
#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
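For reference, a threaded contraction that exercises the evaluator in this header is typically set up along the following lines. This is a minimal sketch using the public Tensor API; the tensor shapes and the thread count are arbitrary illustrations.

// Build with EIGEN_USE_THREADS defined so that the ThreadPoolDevice
// evaluator defined above is selected for the contraction.
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // A rank-2 contraction over one index pair, i.e. a matrix product C = A * B.
  Eigen::Tensor<float, 2> A(64, 32), B(32, 48), C(64, 48);
  A.setRandom();
  B.setRandom();

  // Contract dimension 1 of A with dimension 0 of B.
  Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };

  // Evaluate on a thread pool; assigning through the device routes the
  // expression to TensorEvaluator<..., ThreadPoolDevice>.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);
  C.device(device) = A.contract(B, dims);
  return 0;
}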