Eigen  3.3.3
PacketMath.h
00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
00005 //
00006 // This Source Code Form is subject to the terms of the Mozilla
00007 // Public License v. 2.0. If a copy of the MPL was not distributed
00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
00009 
00010 #ifndef EIGEN_PACKET_MATH_CUDA_H
00011 #define EIGEN_PACKET_MATH_CUDA_H
00012 
00013 namespace Eigen {
00014 
00015 namespace internal {
00016 
00017 // Make sure this is only available when targeting a GPU: we don't want to
00018 // introduce conflicts between these packet_traits definitions and the ones
00019 // we'll use on the host side (SSE, AVX, ...)
00020 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
00021 template<> struct is_arithmetic<float4>  { enum { value = true }; };
00022 template<> struct is_arithmetic<double2> { enum { value = true }; };
00023 
00024 template<> struct packet_traits<float> : default_packet_traits
00025 {
00026   typedef float4 type;
00027   typedef float4 half;
00028   enum {
00029     Vectorizable = 1,
00030     AlignedOnScalar = 1,
00031     size=4,
00032     HasHalfPacket = 0,
00033 
00034     HasDiv  = 1,
00035     HasSin  = 0,
00036     HasCos  = 0,
00037     HasLog  = 1,
00038     HasExp  = 1,
00039     HasSqrt = 1,
00040     HasRsqrt = 1,
00041     HasLGamma = 1,
00042     HasDiGamma = 1,
00043     HasZeta = 1,
00044     HasPolygamma = 1,
00045     HasErf = 1,
00046     HasErfc = 1,
00047     HasIGamma = 1,
00048     HasIGammac = 1,
00049     HasBetaInc = 1,
00050 
00051     HasBlend = 0,
00052   };
00053 };
00054 
00055 template<> struct packet_traits<double> : default_packet_traits
00056 {
00057   typedef double2 type;
00058   typedef double2 half;
00059   enum {
00060     Vectorizable = 1,
00061     AlignedOnScalar = 1,
00062     size=2,
00063     HasHalfPacket = 0,
00064 
00065     HasDiv  = 1,
00066     HasLog  = 1,
00067     HasExp  = 1,
00068     HasSqrt = 1,
00069     HasRsqrt = 1,
00070     HasLGamma = 1,
00071     HasDiGamma = 1,
00072     HasZeta = 1,
00073     HasPolygamma = 1,
00074     HasErf = 1,
00075     HasErfc = 1,
00076     HasIGamma = 1,
00077     HasIGammac = 1,
00078     HasBetaInc = 1,
00079 
00080     HasBlend = 0,
00081   };
00082 };
00083 
00084 
00085 template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
00086 template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
00087 
00088 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
00089   return make_float4(from, from, from, from);
00090 }
00091 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
00092   return make_double2(from, from);
00093 }
00094 
00095 
00096 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
00097   return make_float4(a, a+1, a+2, a+3);
00098 }
00099 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
00100   return make_double2(a, a+1);
00101 }
00102 
00103 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
00104   return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
00105 }
00106 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
00107   return make_double2(a.x+b.x, a.y+b.y);
00108 }
00109 
00110 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
00111   return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
00112 }
00113 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
00114   return make_double2(a.x-b.x, a.y-b.y);
00115 }
00116 
00117 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
00118   return make_float4(-a.x, -a.y, -a.z, -a.w);
00119 }
00120 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
00121   return make_double2(-a.x, -a.y);
00122 }
00123 
00124 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
00125 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
00126 
00127 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
00128   return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
00129 }
00130 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
00131   return make_double2(a.x*b.x, a.y*b.y);
00132 }
00133 
00134 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
00135   return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
00136 }
00137 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
00138   return make_double2(a.x/b.x, a.y/b.y);
00139 }
00140 
00141 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
00142   return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
00143 }
00144 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
00145   return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
00146 }
00147 
00148 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
00149   return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
00150 }
00151 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
00152   return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
00153 }
00154 
00155 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
00156   return *reinterpret_cast<const float4*>(from);
00157 }
00158 
00159 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
00160   return *reinterpret_cast<const double2*>(from);
00161 }
00162 
00163 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
00164   return make_float4(from[0], from[1], from[2], from[3]);
00165 }
00166 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
00167   return make_double2(from[0], from[1]);
00168 }
00169 
00170 template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
00171   return make_float4(from[0], from[0], from[1], from[1]);
00172 }
00173 template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
00174   return make_double2(from[0], from[0]);
00175 }
00176 
00177 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
00178   *reinterpret_cast<float4*>(to) = from;
00179 }
00180 
00181 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
00182   *reinterpret_cast<double2*>(to) = from;
00183 }
00184 
00185 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
00186   to[0] = from.x;
00187   to[1] = from.y;
00188   to[2] = from.z;
00189   to[3] = from.w;
00190 }
00191 
00192 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
00193   to[0] = from.x;
00194   to[1] = from.y;
00195 }
00196 
00197 template<>
00198 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
00199 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
00200   return __ldg((const float4*)from);
00201 #else
00202   return make_float4(from[0], from[1], from[2], from[3]);
00203 #endif
00204 }
00205 template<>
00206 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
00207 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
00208   return __ldg((const double2*)from);
00209 #else
00210   return make_double2(from[0], from[1]);
00211 #endif
00212 }
00213 
00214 template<>
00215 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
00216 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
00217   return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
00218 #else
00219   return make_float4(from[0], from[1], from[2], from[3]);
00220 #endif
00221 }
00222 template<>
00223 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
00224 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
00225   return make_double2(__ldg(from+0), __ldg(from+1));
00226 #else
00227   return make_double2(from[0], from[1]);
00228 #endif
00229 }
00230 
00231 template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
00232   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
00233 }
00234 
00235 template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
00236   return make_double2(from[0*stride], from[1*stride]);
00237 }
00238 
00239 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
00240   to[stride*0] = from.x;
00241   to[stride*1] = from.y;
00242   to[stride*2] = from.z;
00243   to[stride*3] = from.w;
00244 }
00245 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
00246   to[stride*0] = from.x;
00247   to[stride*1] = from.y;
00248 }
00249 
00250 template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
00251   return a.x;
00252 }
00253 template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
00254   return a.x;
00255 }
00256 
00257 template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
00258   return a.x + a.y + a.z + a.w;
00259 }
00260 template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
00261   return a.x + a.y;
00262 }
00263 
00264 template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
00265   return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
00266 }
00267 template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
00268   return fmax(a.x, a.y);
00269 }
00270 
00271 template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
00272   return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
00273 }
00274 template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
00275   return fmin(a.x, a.y);
00276 }
00277 
00278 template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
00279   return a.x * a.y * a.z * a.w;
00280 }
00281 template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
00282   return a.x * a.y;
00283 }
00284 
00285 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
00286   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
00287 }
00288 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
00289   return make_double2(fabs(a.x), fabs(a.y));
00290 }
00291 
00292 EIGEN_DEVICE_FUNC inline void
00293 ptranspose(PacketBlock<float4,4>& kernel) {
00294   double tmp = kernel.packet[0].y;
00295   kernel.packet[0].y = kernel.packet[1].x;
00296   kernel.packet[1].x = tmp;
00297 
00298   tmp = kernel.packet[0].z;
00299   kernel.packet[0].z = kernel.packet[2].x;
00300   kernel.packet[2].x = tmp;
00301 
00302   tmp = kernel.packet[0].w;
00303   kernel.packet[0].w = kernel.packet[3].x;
00304   kernel.packet[3].x = tmp;
00305 
00306   tmp = kernel.packet[1].z;
00307   kernel.packet[1].z = kernel.packet[2].y;
00308   kernel.packet[2].y = tmp;
00309 
00310   tmp = kernel.packet[1].w;
00311   kernel.packet[1].w = kernel.packet[3].y;
00312   kernel.packet[3].y = tmp;
00313 
00314   tmp = kernel.packet[2].w;
00315   kernel.packet[2].w = kernel.packet[3].z;
00316   kernel.packet[3].z = tmp;
00317 }
00318 
00319 EIGEN_DEVICE_FUNC inline void
00320 ptranspose(PacketBlock<double2,2>& kernel) {
00321   double tmp = kernel.packet[0].y;
00322   kernel.packet[0].y = kernel.packet[1].x;
00323   kernel.packet[1].x = tmp;
00324 }
00325 
00326 #endif
00327 
00328 } // end namespace internal
00329 
00330 } // end namespace Eigen
00331 
00332 
00333 #endif // EIGEN_PACKET_MATH_CUDA_H
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends