TensorScan.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
            typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};

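// Usage sketch (illustrative, not part of the original header): a TensorScanOp
// is normally built through the TensorBase helpers cumsum() and cumprod(),
// which pick the matching reducer for Op. Assuming the Tensor module is
// available via #include <unsupported/Eigen/CXX11/Tensor>:
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 2> running_sum  = t.cumsum(0);                       // inclusive scan along axis 0
//   Eigen::Tensor<float, 2> running_prod = t.cumprod(1, /*exclusive=*/true);  // exclusive scan along axis 1
//
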
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher;

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;

  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                        const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
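    // Worked example (illustrative): for a ColMajor tensor of dimensions
    // (5, 6, 7) scanned along axis 1, m_stride becomes 5, i.e. consecutive
    // coefficients along the scan axis are 5 entries apart in memory.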
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      for (int i = NumDims - 1; i > op.axis(); --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

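  // Descriptive note (added for clarity): the scan is evaluated eagerly. The
  // entire result is materialized into either the caller-provided buffer or an
  // internally allocated one, and coeff()/packet() below simply read it back.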
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_output != NULL) {
      m_device.deallocate(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device& m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  CoeffReturnType* m_output;
};


// CPU implementation of scan
// TODO(ibab) This single-threaded implementation should be parallelized,
// at least by running multiple scans at the same time.
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType *data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a
    // scan per remaining entry. The iteration is split into two nested
    // loops to avoid an integer division by keeping track of each idx1 and idx2.
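    // Concretely (illustrative comment, not from the original source): with
    // S = size() and D = stride(), idx1 steps over consecutive blocks of S*D
    // coefficients, idx2 enumerates the D scan lines within a block, and
    // element i of a given scan line lives at offset + i*D.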
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
        // Calculate the starting offset for the scan
        Index offset = idx1 + idx2;

        // Compute the scan along the axis, starting at the calculated offset
        typename Self::CoeffReturnType accum = self.accumulator().initialize();
        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
          Index curr = offset + idx3 * self.stride();

          if (self.exclusive()) {
            data[curr] = self.accumulator().finalize(accum);
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
          } else {
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
            data[curr] = self.accumulator().finalize(accum);
          }
        }
      }
    }
  }
};
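
// For intuition (illustrative, not part of the original header): an inclusive
// scan with a sum reducer over the 1-D input {1, 2, 3, 4} writes {1, 3, 6, 10},
// while the exclusive variant writes the accumulator before reducing each
// element and therefore produces {0, 1, 3, 6}.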

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
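  // Descriptive note (added): val is the global thread index and enumerates
  // scan lines, one per CUDA thread; the division/modulo above reconstruct the
  // same idx1/idx2 decomposition used in the CPU launcher to locate the first
  // coefficient of that line.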

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();

}

template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
     Index total_size = internal::array_prod(self.dimensions());
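     // Added comment for clarity: one CUDA thread is launched per scan line
     // (there are total_size / self.size() lines), in blocks of 64 threads.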
     Index num_blocks = (total_size / self.size() + 63) / 64;
     Index block_size = 64;
     LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && __CUDACC__

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H