Eigen-unsupported 3.3.3 (TensorScan.h)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
    : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
};

template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
  typedef const TensorScanOp<Op, XprType>& type;
};

template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
              typename eval<TensorScanOp<Op, XprType> >::type>
{
  typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal

/** \class TensorScan
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor scan class.
  */
template <typename Op, typename XprType>
class TensorScanOp
    : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
      const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  bool exclusive() const { return m_exclusive; }

protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};

template <typename Self, typename Reducer, typename Device>
struct ScanLauncher;
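// --- Usage sketch (added for illustration; not part of the original file) ---
// TensorScanOp is rarely constructed directly: TensorBase::cumsum() and
// TensorBase::cumprod() build one, passing an internal sum/product reducer
// as Op. A minimal sketch, assuming <unsupported/Eigen/CXX11/Tensor> is
// included; kept in `#if 0` so it cannot affect this header:
#if 0
void tensor_scan_usage_sketch() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();
  // Inclusive running sum along axis 0: out(i, j) accumulates t(0..i, j).
  Eigen::Tensor<float, 2> running_sum = t.cumsum(0);
  // Exclusive scan: each output excludes its own input coefficient, so the
  // first entry along the axis is the reducer's identity (1 for products).
  Eigen::Tensor<float, 2> running_prod = t.cumprod(1, /*exclusive=*/true);
}
#endif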
// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {

  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;

  enum {
    IsAligned = false,
    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
    BlockAccess = false,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,
    RawAccess = true
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                        const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_output(NULL) {

    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      for (int i = NumDims - 1; i > op.axis(); --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
    return m_impl.dimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
    return m_stride;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
    return m_size;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
    return m_accumulator;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
    return m_exclusive;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
    return m_impl;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
    return m_device;
  }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
    launcher(*this, m_output);
    return true;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
  {
    return m_output;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    return m_output[index];
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    if (m_output != NULL) {
      m_device.deallocate(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device& m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  CoeffReturnType* m_output;
};
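// --- Worked example of the stride computation (added for illustration) ---
// For a ColMajor tensor of dimensions (4, 5, 6) scanned along axis 1,
// m_stride is the product of the dimensions before the axis (here 4) and
// m_size is dims[1] == 5: consecutive coefficients along the scan axis sit
// 4 entries apart in memory, and 4 * 6 == 24 independent scans of length 5
// are performed. The same logic as a standalone sketch (hypothetical helper,
// kept in `#if 0`):
#if 0
#include <array>
#include <cstddef>
std::ptrdiff_t scan_axis_stride_colmajor(const std::array<std::ptrdiff_t, 3>& dims, int axis) {
  std::ptrdiff_t stride = 1;
  for (int i = 0; i < axis; ++i) stride *= dims[i];  // dimensions before the axis
  return stride;  // dims = {4, 5, 6}, axis = 1  ->  4
}
#endif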
// CPU implementation of scan
// TODO(ibab) This single-threaded implementation should be parallelized,
// at least by running multiple scans at the same time.
template <typename Self, typename Reducer, typename Device>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType *data) {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a
    // scan per remaining entry. The iteration is split into two nested
    // loops to avoid an integer division by keeping track of each idx1 and idx2.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
        // Calculate the starting offset for the scan
        Index offset = idx1 + idx2;

        // Compute the scan along the axis, starting at the calculated offset
        typename Self::CoeffReturnType accum = self.accumulator().initialize();
        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
          Index curr = offset + idx3 * self.stride();

          if (self.exclusive()) {
            data[curr] = self.accumulator().finalize(accum);
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
          } else {
            self.accumulator().reduce(self.inner().coeff(curr), &accum);
            data[curr] = self.accumulator().finalize(accum);
          }
        }
      }
    }
  }
};
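// --- Trace of the offset enumeration above (added for illustration) ---
// For a ColMajor tensor of dimensions (2, 3) scanned along axis 1:
// stride() == 2, size() == 3, total_size == 6. The outer loop takes idx1 in
// {0} (steps of stride * size == 6), the inner loop takes idx2 in {0, 1},
// giving scan start offsets 0 and 1; each scan then visits offsets
// {o, o + 2, o + 4}. Every coefficient is written exactly once, with no
// per-element division or modulo.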
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}

template <typename Self, typename Reducer>
struct ScanLauncher<Self, Reducer, GpuDevice> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;
    LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif // EIGEN_USE_GPU && __CUDACC__

} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
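// --- GPU usage sketch (added for illustration; not part of the original file) ---
// With EIGEN_USE_GPU defined and the translation unit compiled by nvcc, the
// GpuDevice specialization above launches ScanKernel with 64 threads per
// block, one thread per scan line. A minimal sketch, assuming device buffers
// d_in and d_out of 70 * 30 floats already exist; kept in `#if 0`:
#if 0
Eigen::CudaStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, 70, 30);
Eigen::TensorMap<Eigen::Tensor<float, 2> > out(d_out, 70, 30);
out.device(gpu_device) = in.cumsum(0);  // evaluated through the launcher above
#endif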