TensorReduction.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

namespace Eigen {

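// TensorReductionOp expresses a partial or full reduction of a tensor
// expression over a set of dimensions. It is normally obtained through the
// TensorBase helpers (sum(), maximum(), mean(), ...) rather than constructed
// directly. The internal namespace below contains the machinery used by its
// evaluator.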
namespace internal {
  template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
  struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
 : traits<XprType>
{
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::Scalar Scalar;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
  static const int Layout = XprTraits::Layout;

  template <class T> struct MakePointer {
    // Intermediate typedef to work around an MSVC issue.
    typedef MakePointer_<T> MakePointerT;
    typedef typename MakePointerT::Type Type;
  };
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
{
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};

template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
{
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};


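// DimInitializer splits the input dimensions into the preserved (output)
// dimensions and the reduced dimensions, according to the `reduced` bitmap.
// The Sizes<> specialization handles full reductions, where the output is a
// scalar and every input dimension is reduced.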
template <typename OutputDims> struct DimInitializer {
  template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims,
                  const array<bool, internal::array_size<InputDims>::value>& reduced,
                  OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};

template <> struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
  static void run(const InputDims& input_dims, const array<bool, Rank>&,
                  Sizes<>*, array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};


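// Compile-time checks on the set of reduced dimensions:
//  - are_inner_most_dims: true when the reduced dimensions are exactly the
//    innermost ones (a contiguous run starting at the first dimension for
//    ColMajor, or ending at the last dimension for RowMajor).
//  - preserve_inner_most_dims: true when the innermost dimension is known
//    not to be reduced, which allows packet loads along that dimension.
// Both default to false when the indices are not statically known.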
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};

#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
#endif


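// GenericDimReducer recursively walks the reduced dimensions, from the
// outermost reduced dimension (DimIndex) down to 0, and feeds every input
// coefficient that contributes to a given output coefficient into the
// reducer. The DimIndex == -1 specialization covers the degenerate case
// where a single coefficient is reduced.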
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};

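// InnerMostDimReducer reduces `numValuesToReduce` consecutive coefficients
// starting at `firstIndex`. The vectorized specialization accumulates full
// packets first and then folds the scalar tail, combining both accumulators
// with finalizeBoth().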
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};

template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
    for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
    }
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalizeBoth(accum, p);
  }
};

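// InnerMostDimPreserver is the packet counterpart of GenericDimReducer: it is
// used when the innermost (preserved) dimension allows loading whole packets,
// and accumulates packet-wise across the reduced dimensions. The unspecialized
// and DimIndex == -1 versions should never be invoked.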
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};

template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};

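// FullReducer collapses the whole expression into a single value. The default
// implementation simply reduces all coefficients sequentially; device-specific
// specializations (defined below for the thread pool, and in the
// device-specific reduction headers for GPU/SYCL) advertise themselves through
// HasOptimizedImplementation.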
// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducer {
  static const bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};


#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
struct FullReducerShard {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                  typename Self::Index numValuesToReduce, Op& reducer,
                  typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
        self, firstIndex, numValuesToReduce, reducer);
  }
};

// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static const bool HasOptimizedImplementation = !Op::IsStateful;
  static const int PacketSize =
      unpacket_traits<typename Self::PacketReturnType>::size;

  // launch one reducer per thread and accumulate the result.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost =
        self.m_impl.costPerCoeff(Vectorizable) +
        TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
                     PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
        num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output =
          InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize =
        std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
                                  self, i * blocksize, blocksize, reducer,
                                  &shards[i]);
    }
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
          self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
          reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};

#endif


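// InnerReducer and OuterReducer handle partial reductions over, respectively,
// the innermost and the outermost dimensions of the input. The generic
// versions below only assert: they are selected when no device provides an
// optimized implementation, as advertised by HasOptimizedImplementation.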
// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static const bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};


#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);


#ifdef EIGEN_HAS_CUDA_FP16
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);

#endif

template <int NPT, typename S, typename R, typename I>
__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I>
__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

}  // end namespace internal


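// The expression class below is what the TensorBase reduction helpers return.
// A typical use, sketched (assumes a 3-D float tensor):
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<int, 1> dims{{1}};
//   Eigen::Tensor<float, 2> sums = t.sum(dims);   // reduce dimension 1
//   Eigen::Tensor<float, 0> total = t.sum();      // full reduction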
template <typename Op, typename Dims, typename XprType,  template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
  public:
    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    { }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    { }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const XprType& expression() const { return m_expr; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Dims& dims() const { return m_dims; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Op& reducer() const { return m_reducer; }

  protected:
    typename XprType::Nested m_expr;
    const Dims m_dims;
    const Op m_reducer;
};


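// The evaluator below does the actual work: it builds a bitmap of reduced
// dimensions, precomputes the input/output strides, and then either offloads
// the reduction to one of the optimized reducers above or computes each
// output coefficient on the fly in coeff()/packet().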
// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static const int NumInputDims = internal::array_size<InputDimensions>::value;
  static const int NumReducedDims = internal::array_size<Dims>::value;
  static const int NumOutputDims = NumInputDims - NumReducedDims;
  typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
  static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static const bool RunningFullReduction = (NumOutputDims==0);

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
  {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating if an input dimension is reduced or not.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        }
      } else {
        m_outputStrides.back() = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i-1] * input_dims[i-1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

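  // Try to offload the reduction to an optimized full/inner/outer reducer.
  // Returns false when the reduction has already been written into the
  // caller-supplied buffer, so the caller can skip its own evaluation loop;
  // otherwise returns true and the coefficients are produced through
  // coeff()/packet(), either on the fly or from the internally allocated
  // m_result buffer.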
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
    m_impl.evalSubExprsIfNeeded(NULL);

    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) || (RunningFullReduction &&
        internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
        ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
         !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
        data = m_result;
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    else if (RunningOnSycl) {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
      if (!data) {
        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
        m_result = data;
      }
      Op reducer(m_reducer);
      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
      return (m_result != NULL);
    }

    // Attempt to use an optimized reduction.
    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
          preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
            m_result = data;
          }
          else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }
    }
    return true;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate(m_result);
      m_result = NULL;
    }
  }

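  // When the result has not been precomputed by one of the optimized
  // reducers, each output coefficient is computed on demand by folding all
  // contributing input coefficients, starting at firstInput(index).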
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
                                                             num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }

  // TODO(bsteiner): provide a more efficient implementation.
  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce =
        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // TBD: extend this to the n innermost dimensions that we preserve.
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }

  // Must be called after evalSubExprsIfNeeded().
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    } else {
      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
          TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    }
  }

  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  const Device& device() const { return m_device; }
  const Dims& xprDims() const { return m_xpr_dims; }


  private:
  template <int, typename, typename> friend struct internal::GenericDimReducer;
  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
#endif
  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
#endif

  template <typename S, typename O, typename D> friend struct internal::InnerReducer;

  // Returns the Index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
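  // The output index is decomposed into per-dimension coordinates using the
  // output strides, and each coordinate is then rescaled by the corresponding
  // preserved input stride. For the remaining innermost coordinate the
  // multiplication is skipped when the innermost dimension is preserved,
  // since its input stride is 1.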
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    // TBD: optimize the case where we preserve the innermost dimensions.
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }

  // Bitmap indicating if an input dimension is reduced or not.
  array<bool, NumInputDims> m_reduced;
  // Dimensions of the output of the operation.
  Dimensions m_dimensions;
  // Precomputed strides for the output tensor.
  array<Index, NumOutputDims> m_outputStrides;
  // Subset of strides of the input tensor for the non-reduced dimensions.
  // Indexed by output dimensions.
  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
  array<Index, NumPreservedStrides> m_preservedStrides;

  // Subset of strides of the input tensor for the reduced dimensions.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedStrides;
  // Size of the input dimensions that are reduced.
  // Indexed by reduced dimensions.
  array<Index, NumReducedDims> m_reducedDims;

  // Evaluator for the input expression.
  TensorEvaluator<ArgType, Device> m_impl;

  // Operation to apply for computing the reduction.
  Op m_reducer;

  // For full reductions
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
  static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
  static const bool RunningOnGPU = false;
#else
  static const bool RunningOnGPU = false;
  static const bool RunningOnSycl = false;
#endif
  typename MakePointer_<CoeffReturnType>::Type m_result;

  const Device& m_device;
  const Dims& m_xpr_dims;
};

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H