Eigen-unsupported 3.3.3
TensorReduction.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

namespace Eigen {

namespace internal {
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ >
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
 : traits<XprType>
{
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;

  template <class T> struct MakePointer {
    // Intermediate typedef to workaround MSVC issue.
    typedef MakePointer_<T> MakePointerT;
    typedef typename MakePointerT::Type Type;
  };
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};


template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};

template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};


template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;

};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;

};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif


template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};

template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};

template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};


#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                                                        typename Self::Index numValuesToReduce, Op& reducer,
                                                        typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
        self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;
  static const int PacketSize =
      unpacket_traits<typename Self::PacketReturnType>::size;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost =
        self.m_impl.costPerCoeff(Vectorizable) +
        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                     PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output =
          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize =
        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                  self, i * blocksize, blocksize, reducer,
                                  &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
          reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};

#endif


// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);


#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);

#endif

template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

} // end namespace internal


template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};


// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating if an input dimension is reduced or not.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        }
      } else {
        m_outputStrides.back() = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) || (RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    else if (RunningOnSycl) {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
      if (!data) {
        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
        m_result = data;
      }
      Op reducer(m_reducer);
      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
      return (m_result != NULL);
    }

    // Attempt to use an optimized reduction.
    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
          preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
      m_result = NULL;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
          (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
          TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  const Device& device() const { return m_device; }
  const Dims& xprDims() const { return m_xpr_dims; }


  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

  template <typename S, typename O, typename D> friend struct internal::InnerReducer;

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }

  // Bitmap indicating if an input dimension is reduced or not.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
  array<Index, NumPreservedStrides> m_preservedStrides;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
  static const bool RunningOnGPU = false;
#else
  static const bool RunningOnGPU = false;
  static const bool RunningOnSycl = false;
#endif
  typename MakePointer_<CoeffReturnType>::Type m_result;

  const Device& m_device;
  const Dims& m_xpr_dims;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
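The reduction machinery above is normally reached through the Tensor expression API rather than by instantiating TensorReductionOp directly. The standalone snippet below is a minimal usage sketch, not part of this header: the file name, tensor shapes, and variable names are illustrative only. It shows how a partial and a full reduction each build a TensorReductionOp that is then evaluated by the TensorEvaluator specialization defined in this file.

// example_tensor_reduction.cpp (illustrative usage sketch, not part of TensorReduction.h)
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // A rank-3 tensor of shape 2 x 3 x 4.
  Eigen::Tensor<float, 3> t(2, 3, 4);
  t.setRandom();

  // Partial reduction: sum over dimension 1, giving a rank-2 result of shape 2 x 4.
  // t.sum(dims) builds a TensorReductionOp with a SumReducer; assigning it to a
  // Tensor evaluates it through the TensorEvaluator defined in this header.
  Eigen::array<int, 1> dims{{1}};
  Eigen::Tensor<float, 2> partial = t.sum(dims);

  // Full reduction: every dimension is reduced, so the result is a rank-0 tensor.
  // This exercises the RunningFullReduction / FullReducer path above.
  Eigen::Tensor<float, 0> full = t.maximum();

  // A custom or explicit reducer functor can also be supplied, e.g.:
  //   t.reduce(dims, Eigen::internal::SumReducer<float>());

  std::cout << "partial(0, 0) = " << partial(0, 0)
            << ", max = " << full() << std::endl;
  return 0;
}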