Eigen-unsupported 3.3.3 (TensorExecutor.h)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

namespace internal {

// Default strategy: the expression is evaluated with a single cpu thread.
template<typename Expression, typename Device, bool Vectorizable>
class TensorExecutor
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const Device& device = Device())
  {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};


template<typename Expression>
class TensorExecutor<Expression, DefaultDevice, true>
{
 public:
  typedef typename Expression::Index Index;
  EIGEN_DEVICE_FUNC
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
  {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
      // unroll the loop at the expense of inlining.
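      // The work is split into three phases: a 4x-unrolled packet loop, a
      // packet-at-a-time loop over the remaining full packets, and a scalar
      // loop for the leftover coefficients.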
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};



// Multicore strategy: the index space is partitioned and each partition is executed on a single core
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    return size;
  }
};

template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    Index i = first;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index last_chunk_offset = last - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop. But don't insist
      // on unrolling, because if the function is expensive the compiler should not
      // unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4*PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
  {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign)
    {
      const Index size = array_prod(evaluator.dimensions());
#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
                         [&evaluator](Index first, Index last) {
                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
                         });
#else
      size_t num_threads = device.numThreads();
      if (num_threads > 1) {
        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
            size, evaluator.costPerCoeff(Vectorizable), num_threads);
      }
      if (num_threads == 1) {
        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
      } else {
        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
        const Index numblocks = size / blocksize;

        Barrier barrier(numblocks);
        for (int i = 0; i < numblocks; ++i) {
          device.enqueue_with_barrier(
              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
              &evaluator, i * blocksize, (i + 1) * blocksize);
        }
        if (numblocks * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(
              &evaluator, numblocks * blocksize, size);
        }
        barrier.Wait();
      }
#endif  // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
    }
    evaluator.cleanup();
  }
};
#endif  // EIGEN_USE_THREADS


// GPU: the evaluation of the expression is offloaded to a GPU.
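// The run() method is only declared here; its definition and the CUDA kernel
// it launches are compiled only when __CUDACC__ is defined, so this header can
// also be included from translation units that are not compiled with nvcc.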
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};


#if defined(__CUDACC__)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;

    // Use the vector path
    for (Index i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (Index i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, Index size) {

  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
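    // Launch enough blocks to cover the whole index space, capped at the
    // device's occupancy limit (max_blocks), and never fewer than one.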
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif  // __CUDACC__
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
 public:
  static inline void run(const Expression &expr, const SyclDevice &device) {
    // call TensorSYCL module
    TensorSycl::run(expr, device);
  }
};

#endif

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
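Usage note: TensorExecutor is not called directly by user code; it is invoked internally when a tensor expression is assigned. The following is a minimal sketch of how the device specializations above get selected; the tensor shapes and the 4-thread pool size are illustrative assumptions, not part of this file.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
  a.setRandom();
  b.setRandom();

  // Plain assignment evaluates on the DefaultDevice: it goes through
  // TensorExecutor<..., DefaultDevice, Vectorizable>::run above.
  c = a + b * 0.5f;

  // Assigning through .device(...) selects the ThreadPoolDevice
  // specialization, which partitions the index space across the pool.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice thread_pool_device(&pool, 4);
  c.device(thread_pool_device) = a + b * 0.5f;

  return 0;
}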