Eigen-unsupported 3.3.3
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

namespace Eigen {

namespace internal {

template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    array<Index, NumDims> cudaInputDimensions;
    array<Index, NumDims> cudaOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      cudaInputDimensions[index] = input_dims[indices[i]];
      cudaOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        cudaInputDimensions[written] = input_dims[i];
        cudaOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (i + 1 < offset) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    }
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_cudaInputStrides;
  array<Index, NumDims> m_cudaOutputStrides;
};


template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  static const int NumDimensions = traits<InputXprType>::NumDimensions;
  static const int Layout = traits<InputXprType>::Layout;

  enum {
    Flags = 0
  };
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal


template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Indices& indices() const { return m_indices; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename InputXprType::Nested>::type&
    inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename KernelXprType::Nested>::type&
    kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};


template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
    Layout = TensorEvaluator<InputArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims-1, result);
    return result;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1]-startInputs[0] == PacketSize-1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
      for (int i = 1; i < PacketSize-1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
      }
      data[PacketSize-1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }

  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex-1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }

  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex-1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device& m_device;
};


// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};

template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
      buffer[tensor_index] = result;
    }
    __syncthreads();
  }
};

template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
};

template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
  extern __shared__ float s[];

  // Load inputs to shared memory
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
};


template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }

  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }
        else {
          // Read as much as possible alongside the inner most dimension, that is the plane
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));


        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch(kernel_size) {
          case 4: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));


        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};
#endif


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
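
For reference, TensorConvolutionOp is normally created through TensorBase::convolve() rather than instantiated directly; the evaluator above performs a "valid" convolution, shrinking each convolved dimension to input_dim - kernel_dim + 1. A minimal usage sketch, assuming the unsupported Tensor module headers are available (tensor and kernel names below are illustrative only):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // Rank-2 input and a rank-1 kernel applied along dimension 0.
  Eigen::Tensor<float, 2> input(20, 30);
  Eigen::Tensor<float, 1> kernel(3);
  input.setRandom();
  kernel.setValues({0.25f, 0.5f, 0.25f});

  // Convolve over dimension 0 only; that dimension shrinks to
  // 20 - 3 + 1 = 18, the other dimension is untouched.
  const Eigen::array<ptrdiff_t, 1> dims = {0};
  Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);

  std::cout << output.dimension(0) << " x " << output.dimension(1) << "\n";  // 18 x 30
  return 0;
}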