TensorConvolution.h
00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
00005 //
00006 // This Source Code Form is subject to the terms of the Mozilla
00007 // Public License v. 2.0. If a copy of the MPL was not distributed
00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
00009 
00010 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
00011 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
00012 
00013 namespace Eigen {
00014 
00022 namespace internal {
00023 
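// IndexMapper splits an input tensor into the "kernel" dimensions (the ones
// being convolved over) and the remaining "plane" dimensions. For both the
// input and the output tensor it precomputes, in a permuted ordering that puts
// the kernel dimensions first (ColMajor) or last (RowMajor), the strides
// needed to turn a linear plane index and a kernel-local coordinate back into
// a flat tensor offset. The GPU kernels further down rely on this mapping to
// address their shared-memory tiles.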
00024 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
00025 class IndexMapper {
00026  public:
00027   IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
00028               const array<Index, NumKernelDims>& indices) {
00029 
00030     array<Index, NumDims> dimensions = input_dims;
00031     for (int i = 0; i < NumKernelDims; ++i) {
00032       const Index index = indices[i];
00033       const Index input_dim = input_dims[index];
00034       const Index kernel_dim = kernel_dims[i];
00035       const Index result_dim = input_dim - kernel_dim + 1;
00036       dimensions[index] = result_dim;
00037     }
00038 
00039     array<Index, NumDims> inputStrides;
00040     array<Index, NumDims> outputStrides;
00041     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00042       inputStrides[0] = 1;
00043       outputStrides[0] = 1;
00044       for (int i = 1; i < NumDims; ++i) {
00045         inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
00046         outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
00047       }
00048     } else {
00049       inputStrides[NumDims - 1] = 1;
00050       outputStrides[NumDims - 1] = 1;
00051       for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
00052         inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
00053         outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
00054       }
00055     }
00056 
00057     array<Index, NumDims> cudaInputDimensions;
00058     array<Index, NumDims> cudaOutputDimensions;
00059     array<Index, NumDims> tmp = dimensions;
00060     array<Index, NumDims> ordering;
00061     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00062                               ? 0
00063                               : NumDims - NumKernelDims;
00064     for (int i = 0; i < NumKernelDims; ++i) {
00065       const Index index = i + offset;
00066       ordering[index] = indices[i];
00067       tmp[indices[i]] = -1;
00068       cudaInputDimensions[index] = input_dims[indices[i]];
00069       cudaOutputDimensions[index] = dimensions[indices[i]];
00070     }
00071 
00072     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00073                       ? NumKernelDims
00074                       : 0;
00075     for (int i = 0; i < NumDims; ++i) {
00076       if (tmp[i] >= 0) {
00077         ordering[written] = i;
00078         cudaInputDimensions[written] = input_dims[i];
00079         cudaOutputDimensions[written] = dimensions[i];
00080         ++written;
00081       }
00082     }
00083 
00084     for (int i = 0; i < NumDims; ++i) {
00085       m_inputStrides[i] = inputStrides[ordering[i]];
00086       m_outputStrides[i] = outputStrides[ordering[i]];
00087     }
00088 
00089     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00090       for (int i = 0; i < NumDims; ++i) {
00091         if (i > NumKernelDims) {
00092           m_cudaInputStrides[i] =
00093               m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
00094           m_cudaOutputStrides[i] =
00095               m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
00096         } else {
00097           m_cudaInputStrides[i] = 1;
00098           m_cudaOutputStrides[i] = 1;
00099         }
00100       }
00101     } else {
00102       for (int i = NumDims - 1; i >= 0; --i) {
00103         if (i + 1 < offset) {
00104           m_cudaInputStrides[i] =
00105               m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
00106           m_cudaOutputStrides[i] =
00107               m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
00108         } else {
00109           m_cudaInputStrides[i] = 1;
00110           m_cudaOutputStrides[i] = 1;
00111         }
00112       }
00113     }
00114   }
00115 
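  // The mapCuda*PlaneToTensor*Offset functions take p, a linear index over the
  // non-convolved ("plane") dimensions, peel off one coordinate at a time with
  // the cuda strides and re-encode each coordinate using the corresponding
  // tensor stride. The mapCuda*KernelToTensor*Offset overloads do the same for
  // the 1, 2 or 3 convolved dimensions handled by a block.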
00116   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
00117     Index inputIndex = 0;
00118     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00119       for (int d = NumDims - 1; d > NumKernelDims; --d) {
00120         const Index idx = p / m_cudaInputStrides[d];
00121         inputIndex += idx * m_inputStrides[d];
00122         p -= idx * m_cudaInputStrides[d];
00123       }
00124       inputIndex += p * m_inputStrides[NumKernelDims];
00125     } else {
00126       std::ptrdiff_t limit = 0;
00127       if (NumKernelDims < NumDims) {
00128         limit = NumDims - NumKernelDims - 1;
00129       }
00130       for (int d = 0; d < limit; ++d) {
00131         const Index idx = p / m_cudaInputStrides[d];
00132         inputIndex += idx * m_inputStrides[d];
00133         p -= idx * m_cudaInputStrides[d];
00134       }
00135       inputIndex += p * m_inputStrides[limit];
00136     }
00137     return inputIndex;
00138   }
00139 
00140   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
00141     Index outputIndex = 0;
00142     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00143       for (int d = NumDims - 1; d > NumKernelDims; --d) {
00144         const Index idx = p / m_cudaOutputStrides[d];
00145         outputIndex += idx * m_outputStrides[d];
00146         p -= idx * m_cudaOutputStrides[d];
00147       }
00148       outputIndex += p * m_outputStrides[NumKernelDims];
00149     } else {
00150       std::ptrdiff_t limit = 0;
00151       if (NumKernelDims < NumDims) {
00152         limit = NumDims - NumKernelDims - 1;
00153       }
00154       for (int d = 0; d < limit; ++d) {
00155         const Index idx = p / m_cudaOutputStrides[d];
00156         outputIndex += idx * m_outputStrides[d];
00157         p -= idx * m_cudaOutputStrides[d];
00158       }
00159       outputIndex += p * m_outputStrides[limit];
00160     }
00161     return outputIndex;
00162   }
00163 
00164   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
00165     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00166                               ? 0
00167                               : NumDims - NumKernelDims;
00168     return i * m_inputStrides[offset];
00169   }
00170 
00171   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
00172     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00173                               ? 0
00174                               : NumDims - NumKernelDims;
00175     return i * m_outputStrides[offset];
00176   }
00177 
00178   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
00179     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00180                               ? 0
00181                               : NumDims - NumKernelDims;
00182     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
00183   }
00184 
00185   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
00186     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00187                               ? 0
00188                               : NumDims - NumKernelDims;
00189     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
00190   }
00191 
00192   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
00193     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00194                               ? 0
00195                               : NumDims - NumKernelDims;
00196     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
00197            k * m_inputStrides[offset + 2];
00198   }
00199 
00200   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
00201     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
00202                               ? 0
00203                               : NumDims - NumKernelDims;
00204     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
00205            k * m_outputStrides[offset + 2];
00206   }
00207 
00208  private:
00209   static const int NumDims = internal::array_size<InputDims>::value;
00210   array<Index, NumDims> m_inputStrides;
00211   array<Index, NumDims> m_outputStrides;
00212   array<Index, NumDims> m_cudaInputStrides;
00213   array<Index, NumDims> m_cudaOutputStrides;
00214 };
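// Worked example (added annotation, derived from the construction above): for
// a ColMajor input of dimensions {5, 7, 11} convolved along dimension 1 with a
// kernel of size 3, the output dimensions become {5, 5, 11}. The convolved
// dimension is ordered first, so ordering = {1, 0, 2},
// m_inputStrides = {5, 1, 35} and m_outputStrides = {5, 1, 25}. A plane index
// p = i0 + 5 * i2 then maps to the input offset i0 + 35 * i2 (and analogously
// to i0 + 25 * i2 for the output), while a kernel coordinate i maps to i * 5.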
00215 
00216 
00217 
00218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
00219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
00220 {
00221   // Type promotion to handle the case where the types of the lhs and the rhs are different.
00222   typedef typename promote_storage_type<typename InputXprType::Scalar,
00223                                         typename KernelXprType::Scalar>::ret Scalar;
00224   typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
00225                                         typename traits<KernelXprType>::StorageKind>::ret StorageKind;
00226   typedef typename promote_index_type<typename traits<InputXprType>::Index,
00227                                       typename traits<KernelXprType>::Index>::type Index;
00228   typedef typename InputXprType::Nested LhsNested;
00229   typedef typename KernelXprType::Nested RhsNested;
00230   typedef typename remove_reference<LhsNested>::type _LhsNested;
00231   typedef typename remove_reference<RhsNested>::type _RhsNested;
00232   static const int NumDimensions = traits<InputXprType>::NumDimensions;
00233   static const int Layout = traits<InputXprType>::Layout;
00234 
00235   enum {
00236     Flags = 0
00237   };
00238 };
00239 
00240 template<typename Dimensions, typename InputXprType, typename KernelXprType>
00241 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
00242 {
00243   typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
00244 };
00245 
00246 template<typename Dimensions, typename InputXprType, typename KernelXprType>
00247 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
00248 {
00249   typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
00250 };
00251 
00252 }  // end namespace internal
00253 
00254 
00255 
00256 template<typename Indices, typename InputXprType, typename KernelXprType>
00257 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
00258 {
00259   public:
00260   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
00261   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
00262   typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
00263                                                   typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
00264   typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
00265   typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
00266   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
00267 
00268   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
00269       : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
00270 
00271     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
00272     const Indices& indices() const { return m_indices; }
00273 
00275     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
00276     const typename internal::remove_all<typename InputXprType::Nested>::type&
00277     inputExpression() const { return m_input_xpr; }
00278 
00279     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
00280     const typename internal::remove_all<typename KernelXprType::Nested>::type&
00281     kernelExpression() const { return m_kernel_xpr; }
00282 
00283   protected:
00284     typename InputXprType::Nested m_input_xpr;
00285     typename KernelXprType::Nested m_kernel_xpr;
00286     const Indices m_indices;
00287 };
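// Usage sketch (illustrative annotation, assuming the public Tensor API from
// <unsupported/Eigen/CXX11/Tensor>; expressions of this type are normally
// created through TensorBase::convolve rather than constructed directly):
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   input.setRandom();
//   kernel.setRandom();
//
//   // Convolve over dimensions 1 and 2. Each convolved dimension shrinks to
//   // input_dim - kernel_dim + 1, so the result is 20 x 28 x 48.
//   Eigen::array<ptrdiff_t, 2> dims({1, 2});
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);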
00288 
00289 
00290 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
00291 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
00292 {
00293   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
00294 
00295   static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
00296   static const int NumKernelDims = internal::array_size<Indices>::value;
00297   typedef typename XprType::Index Index;
00298   typedef DSizes<Index, NumDims> Dimensions;
00299 
00300   typedef typename XprType::Scalar Scalar;
00301   typedef typename XprType::CoeffReturnType CoeffReturnType;
00302   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
00303   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
00304 
00305   enum {
00306     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
00307     PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
00308     Layout = TensorEvaluator<InputArgType, Device>::Layout,
00309     CoordAccess = false,  // to be implemented
00310     RawAccess = false
00311   };
00312 
00313   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
00314       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
00315   {
00316     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
00317 
00318     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
00319     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
00320 
00321     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00322       m_inputStride[0] = 1;
00323       for (int i = 1; i < NumDims; ++i) {
00324         m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
00325       }
00326     } else {
00327       m_inputStride[NumDims - 1] = 1;
00328       for (int i = NumDims - 2; i >= 0; --i) {
00329         m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
00330       }
00331     }
00332 
00333     m_dimensions = m_inputImpl.dimensions();
00334     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00335       for (int i = 0; i < NumKernelDims; ++i) {
00336         const Index index = op.indices()[i];
00337         const Index input_dim = input_dims[index];
00338         const Index kernel_dim = kernel_dims[i];
00339         const Index result_dim = input_dim - kernel_dim + 1;
00340         m_dimensions[index] = result_dim;
00341         if (i > 0) {
00342           m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
00343         } else {
00344           m_kernelStride[0] = 1;
00345         }
00346         m_indexStride[i] = m_inputStride[index];
00347       }
00348 
00349       m_outputStride[0] = 1;
00350       for (int i = 1; i < NumDims; ++i) {
00351         m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
00352       }
00353     } else {
00354       for (int i = NumKernelDims - 1; i >= 0; --i) {
00355         const Index index = op.indices()[i];
00356         const Index input_dim = input_dims[index];
00357         const Index kernel_dim = kernel_dims[i];
00358         const Index result_dim = input_dim - kernel_dim + 1;
00359         m_dimensions[index] = result_dim;
00360         if (i < NumKernelDims - 1) {
00361           m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
00362         } else {
00363           m_kernelStride[NumKernelDims - 1] = 1;
00364         }
00365         m_indexStride[i] = m_inputStride[index];
00366       }
00367 
00368       m_outputStride[NumDims - 1] = 1;
00369       for (int i = NumDims - 2; i >= 0; --i) {
00370         m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
00371       }
00372     }
00373   }
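  // After construction, m_inputStride and m_outputStride hold the
  // per-dimension strides of the full input and output tensors, while for each
  // convolved dimension i, m_indexStride[i] is the input stride of that
  // dimension and m_kernelStride[i] is the stride of the matching dimension
  // inside the flattened kernel.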
00374 
00375   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
00376 
00377   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
00378     m_inputImpl.evalSubExprsIfNeeded(NULL);
00379     preloadKernel();
00380     return true;
00381   }
00382   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
00383     m_inputImpl.cleanup();
00384     if (m_local_kernel) {
00385       m_device.deallocate((void*)m_kernel);
00386       m_local_kernel = false;
00387     }
00388     m_kernel = NULL;
00389   }
00390 
00391   void evalTo(typename XprType::Scalar* buffer) {
00392     evalSubExprsIfNeeded(NULL);
00393     for (int i = 0; i < dimensions().TotalSize(); ++i) {
00394       buffer[i] += coeff(i);
00395     }
00396     cleanup();
00397   }
00398 
00399   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
00400   {
00401     CoeffReturnType result = CoeffReturnType(0);
00402     convolve(firstInput(index), 0, NumKernelDims-1, result);
00403     return result;
00404   }
00405 
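  // In the packet() overload below, when the first and last coefficients of
  // the packet map to input offsets exactly PacketSize - 1 apart (i.e. the
  // packet covers contiguous input data), the vectorized convolvePacket() path
  // is taken; otherwise each coefficient is computed with the scalar
  // convolve() and the packet is reassembled from an aligned local buffer.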
00406   template<int LoadMode>
00407   EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
00408   {
00409     Index indices[2] = {index, index+PacketSize-1};
00410     Index startInputs[2] = {0, 0};
00411     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00412       for (int i = NumDims - 1; i > 0; --i) {
00413         const Index idx0 = indices[0] / m_outputStride[i];
00414         const Index idx1 = indices[1] / m_outputStride[i];
00415         startInputs[0] += idx0 * m_inputStride[i];
00416         startInputs[1] += idx1 * m_inputStride[i];
00417         indices[0] -= idx0 * m_outputStride[i];
00418         indices[1] -= idx1 * m_outputStride[i];
00419       }
00420     } else {
00421       for (int i = 0; i < NumDims - 1; ++i) {
00422         const Index idx0 = indices[0] / m_outputStride[i];
00423         const Index idx1 = indices[1] / m_outputStride[i];
00424         startInputs[0] += idx0 * m_inputStride[i];
00425         startInputs[1] += idx1 * m_inputStride[i];
00426         indices[0] -= idx0 * m_outputStride[i];
00427         indices[1] -= idx1 * m_outputStride[i];
00428       }
00429     }
00430     startInputs[0] += indices[0];
00431     startInputs[1] += indices[1];
00432 
00433     if (startInputs[1]-startInputs[0] == PacketSize-1) {
00434       PacketReturnType result = internal::pset1<PacketReturnType>(0);
00435       convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
00436       return result;
00437     } else {
00438       EIGEN_ALIGN_MAX Scalar data[PacketSize];
00439       data[0] = Scalar(0);
00440       convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
00441       for (int i = 1; i < PacketSize-1; ++i) {
00442         data[i] = Scalar(0);
00443         convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
00444       }
00445       data[PacketSize-1] = Scalar(0);
00446       convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
00447       return internal::pload<PacketReturnType>(data);
00448     }
00449   }
00450 
00451   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
00452   costPerCoeff(bool vectorized) const {
00453     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
00454     // We ignore the use of fused multiply-add.
00455     const double convolve_compute_cost =
00456         TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
00457     const double firstIndex_compute_cost =
00458         NumDims *
00459         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
00460          TensorOpCost::DivCost<Index>());
00461     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
00462            kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
00463                           m_kernelImpl.costPerCoeff(vectorized) +
00464                           TensorOpCost(0, 0, convolve_compute_cost, vectorized,
00465                                        PacketSize));
00466   }
00467 
00468   EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
00469 
00470  private:
00471   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
00472     Index startInput = 0;
00473     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
00474       for (int i = NumDims - 1; i > 0; --i) {
00475         const Index idx = index / m_outputStride[i];
00476         startInput += idx * m_inputStride[i];
00477         index -= idx * m_outputStride[i];
00478       }
00479     } else {
00480       for (int i = 0; i < NumDims - 1; ++i) {
00481         const Index idx = index / m_outputStride[i];
00482         startInput += idx * m_inputStride[i];
00483         index -= idx * m_outputStride[i];
00484       }
00485     }
00486     startInput += index;
00487     return startInput;
00488   }
00489 
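  // convolve() walks the kernel dimensions recursively: at each level it steps
  // through kernel dimension DimIndex, advancing the input offset by
  // m_indexStride[DimIndex] and the kernel offset by m_kernelStride[DimIndex].
  // At DimIndex == 0 it accumulates the input * kernel products, so for a
  // single convolved dimension of size K this reduces to
  //   accum += sum_{j = 0..K-1} input[firstIndex + j * stride] * kernel[j].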
00490   EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
00491     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
00492       const Index input = firstIndex + j * m_indexStride[DimIndex];
00493       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
00494       if (DimIndex > 0) {
00495         convolve(input, kernel, DimIndex-1, accum);
00496       } else {
00497         accum += m_inputImpl.coeff(input) * m_kernel[kernel];
00498       }
00499     }
00500   }
00501 
00502   template <typename Packet>
00503   EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
00504     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
00505       const Index input = firstIndex + j * m_indexStride[DimIndex];
00506       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
00507       if (DimIndex > 0) {
00508         convolvePacket(input, kernel, DimIndex-1, accum);
00509       } else {
00510         accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
00511       }
00512     }
00513   }
00514 
00515   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
00516     // Don't make a local copy of the kernel unless we have to (i.e. it's an
00517     // expression that needs to be evaluated)
00518     const Scalar* in_place = m_kernelImpl.data();
00519     if (in_place) {
00520       m_kernel = in_place;
00521       m_local_kernel = false;
00522     } else {
00523       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
00524       Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
00525       typedef TensorEvalToOp<const KernelArgType> EvalTo;
00526       EvalTo evalToTmp(local, m_kernelArg);
00527       const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
00528       internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
00529 
00530       m_kernel = local;
00531       m_local_kernel = true;
00532     }
00533   }
00534 
00535   array<Index, NumDims> m_inputStride;
00536   array<Index, NumDims> m_outputStride;
00537 
00538   array<Index, NumKernelDims> m_indexStride;
00539   array<Index, NumKernelDims> m_kernelStride;
00540   TensorEvaluator<InputArgType, Device> m_inputImpl;
00541   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
00542   Dimensions m_dimensions;
00543 
00544   KernelArgType m_kernelArg;
00545   const Scalar* m_kernel;
00546   bool m_local_kernel;
00547   const Device& m_device;
00548 };
00549 
00550 
00551 
00552 
00553 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
00554 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
00555 
00556 template <int StaticKernelSize>
00557 struct GetKernelSize {
00558   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
00559     return StaticKernelSize;
00560   }
00561 };
00562 template <>
00563 struct GetKernelSize<Dynamic> {
00564   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
00565     return kernelSize;
00566   }
00567 };
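// GetKernelSize lets the kernels below be instantiated with a compile-time
// kernel size (e.g. 4 or 7) so the inner loops can be fully unrolled, while
// the GetKernelSize<Dynamic> specialization simply forwards the runtime value.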
00568 
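// One-dimensional kernel: each block loads, for each of its blockDim.y planes,
// a strip of num_x_input = last_x - first_x + kernelSize input values into
// shared memory, then its threads cooperatively compute the corresponding
// num_x_output = last_x - first_x + 1 outputs of that plane.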
00569 template <typename InputEvaluator, typename Index, typename InputDims,
00570           int StaticKernelSize>
00571 __global__ void EigenConvolutionKernel1D(
00572     InputEvaluator eval,
00573     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
00574         indexMapper,
00575     const float* __restrict kernel, const int numPlanes, const int numX,
00576     const int maxX, const int kernelSize, float* buffer) {
00577   extern __shared__ float s[];
00578 
00579   const int first_x = blockIdx.x * maxX;
00580   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
00581   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
00582   const int num_x_output = last_x - first_x + 1;
00583 
00584   const int first_plane = blockIdx.y * blockDim.y;
00585   const int plane_stride = blockDim.y * gridDim.y;
00586 
00587   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
00588     // Load inputs to shared memory
00589     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
00590     const int plane_kernel_offset = threadIdx.y * num_x_input;
00591     #pragma unroll
00592     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
00593       const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
00594       s[i + plane_kernel_offset] = eval.coeff(tensor_index);
00595     }
00596 
00597     __syncthreads();
00598 
00599     // Compute the convolution
00600     const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
00601 
00602     #pragma unroll
00603     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
00604       const int kernel_offset = plane_kernel_offset + i;
00605       float result = 0.0f;
00606       #pragma unroll
00607       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
00608         result += s[k + kernel_offset] * kernel[k];
00609       }
00610       const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
00611       buffer[tensor_index] = result;
00612     }
00613     __syncthreads();
00614   }
00615 }
00616 
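// Two-dimensional kernel: each of the block's blockDim.z planes gets its own
// num_x_input x num_y_input tile in shared memory, where each extent is the
// output extent plus the kernel size minus one, and the block then computes
// the matching num_x_output x num_y_output block of results.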
00617 template <typename InputEvaluator, typename Index, typename InputDims,
00618           int StaticKernelSizeX, int StaticKernelSizeY>
00619 __global__ void EigenConvolutionKernel2D(
00620     InputEvaluator eval,
00621     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
00622         indexMapper,
00623     const float* __restrict kernel, const int numPlanes, const int numX,
00624     const int maxX, const int numY, const int maxY, const int kernelSizeX,
00625     const int kernelSizeY, float* buffer) {
00626   extern __shared__ float s[];
00627 
00628   const int first_x = blockIdx.x * maxX;
00629   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
00630   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
00631   const int num_x_output = last_x - first_x + 1;
00632 
00633   const int first_y = blockIdx.y * maxY;
00634   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
00635   const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
00636   const int num_y_output = last_y - first_y + 1;
00637 
00638   const int first_plane = blockIdx.z * blockDim.z;
00639   const int plane_stride = blockDim.z * gridDim.z;
00640 
00641   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
00642 
00643     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
00644     const int plane_kernel_offset = threadIdx.z * num_y_input;
00645 
00646     // Load inputs to shared memory
00647     #pragma unroll
00648     for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
00649       const int input_offset = num_x_input * (j + plane_kernel_offset);
00650       #pragma unroll
00651       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
00652         const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
00653         s[i + input_offset] = eval.coeff(tensor_index);
00654       }
00655     }
00656 
00657     __syncthreads();
00658 
00659     // Convolution
00660     const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
00661 
00662     #pragma unroll
00663     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
00664       #pragma unroll
00665       for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
00666         float result = 0.0f;
00667         #pragma unroll
00668         for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
00669           const int kernel_offset = kernelSizeX * l;
00670           const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
00671           #pragma unroll
00672           for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
00673             result += s[k + input_offset] * kernel[k + kernel_offset];
00674           }
00675         }
00676         const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
00677         buffer[tensor_index] = result;
00678       }
00679     }
00680 
00681     __syncthreads();
00682   }
00683 }
00684 
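// Three-dimensional kernel: a single shared-memory tile covers the x, y and z
// extents (each padded by kernelSize - 1), and since all three thread
// dimensions are used for the spatial tile, the planes are processed
// sequentially by the outer loop over p.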
00685 template <typename InputEvaluator, typename Index, typename InputDims>
00686 __global__ void EigenConvolutionKernel3D(
00687     InputEvaluator eval,
00688     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
00689         indexMapper,
00690     const float* __restrict kernel, const size_t numPlanes, const size_t numX,
00691     const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
00692     const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
00693     const size_t kernelSizeZ, float* buffer) {
00694   extern __shared__ float s[];
00695 
00696   // Load inputs to shared memory
00697   const int first_x = blockIdx.x * maxX;
00698   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
00699   const int num_x_input = last_x - first_x + kernelSizeX;
00700 
00701   const int first_y = blockIdx.y * maxY;
00702   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
00703   const int num_y_input = last_y - first_y + kernelSizeY;
00704 
00705   const int first_z = blockIdx.z * maxZ;
00706   const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
00707   const int num_z_input = last_z - first_z + kernelSizeZ;
00708 
00709   for (int p = 0; p < numPlanes; ++p) {
00710 
00711     const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
00712     const int plane_kernel_offset = 0;
00713 
00714     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
00715       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
00716         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
00717           const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
00718           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
00719         }
00720       }
00721     }
00722 
00723     __syncthreads();
00724 
00725     // Convolution
00726     const int num_z_output = last_z - first_z + 1;
00727     const int num_y_output = last_y - first_y + 1;
00728     const int num_x_output = last_x - first_x + 1;
00729     const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
00730 
00731     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
00732       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
00733         for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
00734           float result = 0.0f;
00735           for (int n = 0; n < kernelSizeZ; ++n) {
00736             for (int m = 0; m < kernelSizeY; ++m) {
00737               for (int l = 0; l < kernelSizeX; ++l) {
00738                 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
00739               }
00740             }
00741           }
00742           const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
00743           buffer[tensor_index] = result;
00744         }
00745       }
00746     }
00747     __syncthreads();
00748   }
00749 }
00750 
00751 
00752 
00753 template<typename Indices, typename InputArgType, typename KernelArgType>
00754 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
00755 {
00756   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
00757 
00758   static const int NumDims =  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
00759   static const int NumKernelDims = internal::array_size<Indices>::value;
00760   typedef typename XprType::Index Index;
00761   typedef DSizes<Index, NumDims> Dimensions;
00762   typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
00763 
00764   enum {
00765     IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
00766     PacketAccess = false,
00767     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
00768     CoordAccess = false,  // to be implemented
00769     RawAccess = false
00770   };
00771 
00772   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
00773       : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
00774   {
00775     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
00776 
00777     const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
00778     const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
00779 
00780     m_dimensions = m_inputImpl.dimensions();
00781     for (int i = 0; i < NumKernelDims; ++i) {
00782       const Index index = op.indices()[i];
00783       const Index input_dim = input_dims[index];
00784       const Index kernel_dim = kernel_dims[i];
00785       const Index result_dim = input_dim - kernel_dim + 1;
00786       m_dimensions[index] = result_dim;
00787     }
00788   }
00789 
00790   typedef typename XprType::CoeffReturnType CoeffReturnType;
00791   typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
00792   typedef typename InputArgType::Scalar Scalar;
00793   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
00794 
00795   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
00796 
00797   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
00798     preloadKernel();
00799     m_inputImpl.evalSubExprsIfNeeded(NULL);
00800     if (data) {
00801       executeEval(data);
00802       return false;
00803     } else {
00804       m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
00805       executeEval(m_buf);
00806       return true;
00807     }
00808   }
00809 
00810   EIGEN_STRONG_INLINE void cleanup() {
00811     m_inputImpl.cleanup();
00812     if (m_buf) {
00813       m_device.deallocate(m_buf);
00814       m_buf = NULL;
00815     }
00816     if (m_local_kernel) {
00817       m_device.deallocate((void*)m_kernel);
00818       m_local_kernel = false;
00819     }
00820     m_kernel = NULL;
00821   }
00822 
00823   EIGEN_STRONG_INLINE void preloadKernel() {
00824     // Don't make a local copy of the kernel unless we have to (i.e. it's an
00825     // expression that needs to be evaluated)
00826     const Scalar* in_place = m_kernelImpl.data();
00827     if (in_place) {
00828       m_kernel = in_place;
00829       m_local_kernel = false;
00830     } else {
00831       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
00832       Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
00833       typedef TensorEvalToOp<const KernelArgType> EvalTo;
00834       EvalTo evalToTmp(local, m_kernelArg);
00835       const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
00836       internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
00837 
00838       m_kernel = local;
00839       m_local_kernel = true;
00840     }
00841   }
00842 
00843   static unsigned int ceil(unsigned int num, unsigned int denom) {
00844     const unsigned int rounded_toward_zero = num / denom;
00845     if (num > rounded_toward_zero * denom) {
00846       return rounded_toward_zero + 1;
00847     }
00848     return rounded_toward_zero;
00849   }
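  // Ceiling division, e.g. ceil(10, 3) == 4; used below to size the CUDA grid.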
00850 
00851   void executeEval(Scalar* data) const {
00852     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
00853 
00854     const int maxSharedMem = m_device.sharedMemPerBlock();
00855     const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
00856     const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
00857     const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
00858     const int warpSize = 32;
00859 
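    // Launch configuration: for each supported number of convolution
    // dimensions, pick tile extents so that one tile (output extent plus the
    // kernel halo) fits in shared memory, snap the innermost extent to the
    // warp size where that helps, derive the block shape from the tile and,
    // in the 1D and 2D cases, cap the number of plane blocks using the
    // multiprocessor count.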
00860     switch (NumKernelDims) {
00861       case 1: {
00862         const int kernel_size = m_kernelImpl.dimensions().TotalSize();
00863 
00864         const int numX = dimensions()[m_indices[0]];
00865         const int numP = dimensions().TotalSize() / numX;
00866         int maxX;
00867         dim3 block_size;
00868 
00869         const int single_stride_dim =
00870             static_cast<int>(Layout) == static_cast<int>(ColMajor)
00871                 ? 0
00872                 : m_inputImpl.dimensions().rank() - 1;
00873         if (m_indices[0] == single_stride_dim) {
00874           // Maximize the reuse of the values loaded into shared memory
00875           const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
00876           maxX = numext::mini<int>(inner_dim, numX);
00877           const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
00878           block_size.x = numext::mini(maxThreadsPerBlock, maxX);
00879           block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
00880         }
00881         else {
00882           // Read as much as possible along the innermost dimension, that is the plane
00883           const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
00884           const int maxP = numext::mini<int>(inner_dim, numP);
00885           maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
00886 
00887           block_size.x = numext::mini(warpSize, maxX);
00888           block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
00889         }
00890 
00891         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
00892         assert(shared_mem <= maxSharedMem);
00893 
00894         const int num_x_blocks = ceil(numX, maxX);
00895         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
00896         const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
00897 
00898         dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
00899 
00900 
00901         //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
00902 
00903         const array<Index, 1> indices(m_indices[0]);
00904         const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
00905         internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
00906             m_inputImpl.dimensions(), kernel_dims, indices);
00907         switch(kernel_size) {
00908           case 4: {
00909             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
00910             break;
00911           }
00912           case 7: {
00913             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
00914             break;
00915           }
00916           default: {
00917             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
00918           }
00919         }
00920         break;
00921       }
00922 
00923       case 2: {
00924         const int idxX =
00925             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
00926         const int idxY =
00927             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
00928         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
00929         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
00930 
00931         const int numX = dimensions()[m_indices[idxX]];
00932         const int numY = dimensions()[m_indices[idxY]];
00933         const int numP = dimensions().TotalSize() / (numX*numY);
00934 
00935         const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
00936 
00937         // Snap maxX to warp size
00938         int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
00939         const int maxX = numext::mini<int>(inner_dim, numX);
00940         const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
00941         const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
00942 
00943         dim3 block_size;
00944         block_size.x = numext::mini(1024, maxX);
00945         block_size.y = numext::mini<int>(1024/block_size.x, maxY);
00946         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
00947 
00948         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
00949         assert(shared_mem <= maxSharedMem);
00950 
00951         const int num_x_blocks = ceil(numX, maxX);
00952         const int num_y_blocks = ceil(numY, maxY);
00953         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
00954         const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
00955 
00956         dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
00957 
00958 
00959         //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
00960 
00961         const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
00962         const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
00963                                           m_kernelImpl.dimensions()[idxY]);
00964         internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
00965             m_inputImpl.dimensions(), kernel_dims, indices);
00966         switch (kernel_size_x) {
00967           case 4: {
00968             switch (kernel_size_y) {
00969               case 7: {
00970                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
00971                 break;
00972               }
00973               default: {
00974                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
00975                 break;
00976               }
00977             }
00978             break;
00979           }
00980           case 7: {
00981             switch (kernel_size_y) {
00982               case 4: {
00983                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
00984                 break;
00985               }
00986               default: {
00987                 LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
00988                 break;
00989               }
00990             }
00991             break;
00992           }
00993           default: {
00994             LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
00995             break;
00996           }
00997         }
00998         break;
00999       }
01000 
01001       case 3: {
01002         const int idxX =
01003             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
01004         const int idxY =
01005             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
01006         const int idxZ =
01007             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
01008 
01009         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
01010         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
01011         const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
01012 
01013         const int numX = dimensions()[m_indices[idxX]];
01014         const int numY = dimensions()[m_indices[idxY]];
01015         const int numZ = dimensions()[m_indices[idxZ]];
01016         const int numP = dimensions().TotalSize() / (numX*numY*numZ);
01017 
01018         const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
01019         const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
01020         const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
01021 
01022         dim3 block_size;
01023         block_size.x = numext::mini(32, maxX);
01024         block_size.y = numext::mini(32, maxY);
01025         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
01026         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
01027 
01028         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
01029         assert(shared_mem <= maxSharedMem);
01030 
01031         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
01032         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
01033                                       m_indices[idxZ]);
01034         const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
01035                                           m_kernelImpl.dimensions()[idxY],
01036                                           m_kernelImpl.dimensions()[idxZ]);
01037         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
01038             m_inputImpl.dimensions(), kernel_dims, indices);
01039 
01040         LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
01041         break;
01042       }
01043 
01044       default: {
01045         EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
01046       }
01047     }
01048   }
01049 
01050   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
01051   {
01052     eigen_assert(m_buf);
01053     eigen_assert(index < m_dimensions.TotalSize());
01054     return m_buf[index];
01055   }
01056 
01057   template<int LoadMode>
01058   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
01059   {
01060     eigen_assert(m_buf);
01061     eigen_assert(index < m_dimensions.TotalSize());
01062     return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
01063   }
01064 
01065   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
01066   costPerCoeff(bool vectorized) const {
01067     // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
01068     // model.
01069     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
01070     // We ignore the use of fused multiply-add.
01071     const double convolve_compute_cost =
01072         TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
01073     const double firstIndex_compute_cost =
01074         NumDims *
01075         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
01076          TensorOpCost::DivCost<Index>());
01077     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
01078            kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
01079                           m_kernelImpl.costPerCoeff(vectorized) +
01080                           TensorOpCost(0, 0, convolve_compute_cost, vectorized,
01081                                        PacketSize));
01082   }
01083 
01084  private:
01085   // No assignment (copies are needed by the kernels)
01086   TensorEvaluator& operator = (const TensorEvaluator&);
01087 
01088   TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
01089   TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
01090   KernelArgType m_kernelArg;
01091   Indices m_indices;
01092   Dimensions m_dimensions;
01093   Scalar* m_buf;
01094   const Scalar* m_kernel;
01095   bool m_local_kernel;
01096 
01097   const GpuDevice& m_device;
01098 };
01099 #endif
01100 
01101 
01102 } // end namespace Eigen
01103 
01104 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H