Eigen 3.3.3
PacketMath.h
00001 // This file is part of Eigen, a lightweight C++ template library
00002 // for linear algebra.
00003 //
00004 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
00005 //
00006 // This Source Code Form is subject to the terms of the Mozilla
00007 // Public License v. 2.0. If a copy of the MPL was not distributed
00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
00009 
00010 #ifndef EIGEN_PACKET_MATH_SSE_H
00011 #define EIGEN_PACKET_MATH_SSE_H
00012 
00013 namespace Eigen {
00014 
00015 namespace internal {
00016 
00017 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
00018 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
00019 #endif
00020 
00021 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
00022 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
00023 #endif
00024 
00025 #ifdef __FMA__
00026 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
00027 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
00028 #endif
00029 #endif
00030 
00031 #if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)
00032 // With GCC's default ABI version, __m128 and __m256 are the same type, and therefore we cannot
00033 // have overloads for both types without a linker error.
00034 // One solution is to increase the ABI version using -fabi-version=4 (or greater).
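// For instance, with the old mangling, two overloads such as
//   Packet4f padd(const Packet4f&, const Packet4f&);   // Packet4f == __m128
//   Packet8f padd(const Packet8f&, const Packet8f&);   // Packet8f == __m256 (AVX)
// would produce the same mangled symbol.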
00035 // Otherwise, we work around this inconvenience by wrapping the 128-bit types in the following
00036 // helper structure:
00037 template<typename T>
00038 struct eigen_packet_wrapper
00039 {
00040   EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
00041   EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
00042   EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
00043   EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
00044   EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
00045     m_val = v;
00046     return *this;
00047   }
00048   
00049   T m_val;
00050 };
00051 typedef eigen_packet_wrapper<__m128>  Packet4f;
00052 typedef eigen_packet_wrapper<__m128i> Packet4i;
00053 typedef eigen_packet_wrapper<__m128d> Packet2d;
00054 #else
00055 typedef __m128  Packet4f;
00056 typedef __m128i Packet4i;
00057 typedef __m128d Packet2d;
00058 #endif
00059 
00060 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
00061 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
00062 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
00063 
00064 #define vec4f_swizzle1(v,p,q,r,s) \
00065   (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
00066 
00067 #define vec4i_swizzle1(v,p,q,r,s) \
00068   (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
00069 
00070 #define vec2d_swizzle1(v,p,q) \
00071   (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
00072   
00073 #define vec4f_swizzle2(a,b,p,q,r,s) \
00074   (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
00075 
00076 #define vec4i_swizzle2(a,b,p,q,r,s) \
00077   (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
00078 
00079 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
00080   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
00081 
00082 #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
00083   const Packet2d p2d_##NAME = pset1<Packet2d>(X)
00084 
00085 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
00086   const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
00087 
00088 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
00089   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
00090 
00091 
00092 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
00093 // to leverage AVX instructions.
00094 #ifndef EIGEN_VECTORIZE_AVX
00095 template<> struct packet_traits<float>  : default_packet_traits
00096 {
00097   typedef Packet4f type;
00098   typedef Packet4f half;
00099   enum {
00100     Vectorizable = 1,
00101     AlignedOnScalar = 1,
00102     size=4,
00103     HasHalfPacket = 0,
00104 
00105     HasDiv  = 1,
00106     HasSin  = EIGEN_FAST_MATH,
00107     HasCos  = EIGEN_FAST_MATH,
00108     HasLog  = 1,
00109     HasExp  = 1,
00110     HasSqrt = 1,
00111     HasRsqrt = 1,
00112     HasTanh  = EIGEN_FAST_MATH,
00113     HasBlend = 1
00114 
00115 #ifdef EIGEN_VECTORIZE_SSE4_1
00116     ,
00117     HasRound = 1,
00118     HasFloor = 1,
00119     HasCeil = 1
00120 #endif
00121   };
00122 };
00123 template<> struct packet_traits<double> : default_packet_traits
00124 {
00125   typedef Packet2d type;
00126   typedef Packet2d half;
00127   enum {
00128     Vectorizable = 1,
00129     AlignedOnScalar = 1,
00130     size=2,
00131     HasHalfPacket = 0,
00132 
00133     HasDiv  = 1,
00134     HasExp  = 1,
00135     HasSqrt = 1,
00136     HasRsqrt = 1,
00137     HasBlend = 1
00138 
00139 #ifdef EIGEN_VECTORIZE_SSE4_1
00140     ,
00141     HasRound = 1,
00142     HasFloor = 1,
00143     HasCeil = 1
00144 #endif
00145   };
00146 };
00147 #endif
00148 template<> struct packet_traits<int>    : default_packet_traits
00149 {
00150   typedef Packet4i type;
00151   typedef Packet4i half;
00152   enum {
00153     Vectorizable = 1,
00154     AlignedOnScalar = 1,
00155     size=4,
00156 
00157     HasBlend = 1
00158   };
00159 };
00160 
00161 template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
00162 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
00163 template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
00164 
00165 #ifndef EIGEN_VECTORIZE_AVX
00166 template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
00167 template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
00168 #endif
00169 
00170 #if EIGEN_COMP_MSVC==1500
00171 // Workaround MSVC 9 internal compiler error.
00172 // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32-bit+SSE mode.
00173 // TODO: check whether a better fix exists, such as adding a pset0() function (it crashed on pset1(0)).
00174 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
00175 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
00176 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
00177 #else
00178 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps1(from); }
00179 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
00180 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
00181 #endif
00182 
00183 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
00184 // However, using intrinsics for pset1 makes gcc generate poor code in some cases (see bug 203).
00185 // Using inline assembly is not an option either, because then gcc fails to properly reorder the instructions.
00186 // Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
00187 // Also note that with AVX, we want it to generate a vbroadcastss.
00188 #if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
00189 template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
00190   return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
00191 }
00192 #endif
00193   
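// plset returns the packet {a, a+1, ..., a+size-1}, e.g. plset<Packet4f>(a) == {a, a+1, a+2, a+3}.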
00194 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
00195 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
00196 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
00197 
00198 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
00199 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
00200 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
00201 
00202 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
00203 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
00204 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
00205 
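// pnegate: for floats and doubles, flip the sign bit of each lane by XORing with a mask of
// sign bits; for integers, subtract from zero.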
00206 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
00207 {
00208   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
00209   return _mm_xor_ps(a,mask);
00210 }
00211 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
00212 {
00213   const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
00214   return _mm_xor_pd(a,mask);
00215 }
00216 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
00217 {
00218   return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
00219 }
00220 
00221 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
00222 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
00223 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
00224 
00225 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
00226 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
00227 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
00228 {
00229 #ifdef EIGEN_VECTORIZE_SSE4_1
00230   return _mm_mullo_epi32(a,b);
00231 #else
00232   // this version is slightly faster than 4 scalar products
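  // The trick: _mm_mul_epu32 multiplies the two even-indexed 32-bit lanes and returns their
  // 64-bit products. Doing it once on (a,b) and once with the odd lanes swizzled into the even
  // positions yields all four products; the remaining swizzles gather the low 32 bits of each
  // product and restore the original lane order.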
00233   return vec4i_swizzle1(
00234             vec4i_swizzle2(
00235               _mm_mul_epu32(a,b),
00236               _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
00237                             vec4i_swizzle1(b,1,0,3,2)),
00238               0,2,0,2),
00239             0,2,1,3);
00240 #endif
00241 }
00242 
00243 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
00244 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
00245 
00246 // For some obscure reason, this has to be overloaded for packets of integers.
00247 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
00248 #ifdef __FMA__
00249 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
00250 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
00251 #endif
00252 
00253 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
00254 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
00255 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
00256 {
00257 #ifdef EIGEN_VECTORIZE_SSE4_1
00258   return _mm_min_epi32(a,b);
00259 #else
00260   // after some benchmarking, this version *is* faster than a scalar implementation
00261   Packet4i mask = _mm_cmplt_epi32(a,b);
00262   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00263 #endif
00264 }
00265 
00266 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
00267 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
00268 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
00269 {
00270 #ifdef EIGEN_VECTORIZE_SSE4_1
00271   return _mm_max_epi32(a,b);
00272 #else
00273   // after some benchmarking, this version *is* faster than a scalar implementation
00274   Packet4i mask = _mm_cmpgt_epi32(a,b);
00275   return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
00276 #endif
00277 }
00278 
00279 #ifdef EIGEN_VECTORIZE_SSE4_1
00280 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
00281 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
00282 
00283 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
00284 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
00285 
00286 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
00287 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
00288 #endif
00289 
00290 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
00291 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
00292 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
00293 
00294 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
00295 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
00296 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
00297 
00298 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
00299 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
00300 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
00301 
00302 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
00303 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
00304 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
00305 
00306 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
00307 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
00308 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
00309 
00310 #if EIGEN_COMP_MSVC
00311   template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
00312     EIGEN_DEBUG_UNALIGNED_LOAD
00313     #if (EIGEN_COMP_MSVC==1600)
00314     // NOTE: Some versions of MSVC10 generate bad code when using _mm_loadu_ps
00315     // (i.e., they do not generate an unaligned load!)
00316     __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
00317     res = _mm_loadh_pi(res, (const __m64*)(from+2));
00318     return res;
00319     #else
00320     return _mm_loadu_ps(from);
00321     #endif
00322   }
00323 #else
00324 // NOTE: with the code below, MSVC's compiler crashes!
00325 
00326 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
00327 {
00328   EIGEN_DEBUG_UNALIGNED_LOAD
00329   return _mm_loadu_ps(from);
00330 }
00331 #endif
00332 
00333 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
00334 {
00335   EIGEN_DEBUG_UNALIGNED_LOAD
00336   return _mm_loadu_pd(from);
00337 }
00338 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
00339 {
00340   EIGEN_DEBUG_UNALIGNED_LOAD
00341   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
00342 }
00343 
00344 
00345 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
00346 {
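  // Load from[0] and from[1] with a single 64-bit load and duplicate each of them,
  // producing {from[0], from[0], from[1], from[1]}.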
00347   return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
00348 }
00349 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
00350 { return pset1<Packet2d>(from[0]); }
00351 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
00352 {
00353   Packet4i tmp;
00354   tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
00355   return vec4i_swizzle1(tmp, 0, 0, 1, 1);
00356 }
00357 
00358 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
00359 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
00360 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
00361 
00362 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
00363 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
00364 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
00365 
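// pgather/pscatter read/write packet-size elements separated by 'stride' scalars in memory.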
00366 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
00367 {
00368  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
00369 }
00370 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
00371 {
00372  return _mm_set_pd(from[1*stride], from[0*stride]);
00373 }
00374 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
00375 {
00376  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
00377  }
00378 
00379 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
00380 {
00381   to[stride*0] = _mm_cvtss_f32(from);
00382   to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
00383   to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
00384   to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
00385 }
00386 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
00387 {
00388   to[stride*0] = _mm_cvtsd_f64(from);
00389   to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
00390 }
00391 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
00392 {
00393   to[stride*0] = _mm_cvtsi128_si32(from);
00394   to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
00395   to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
00396   to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
00397 }
00398 
00399 // some compilers might be tempted to perform multiple moves instead of using a vector path.
00400 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
00401 {
00402   Packet4f pa = _mm_set_ss(a);
00403   pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
00404 }
00405 // some compilers might be tempted to perform multiple moves instead of using a vector path.
00406 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
00407 {
00408   Packet2d pa = _mm_set_sd(a);
00409   pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
00410 }
00411 
00412 #ifndef EIGEN_VECTORIZE_AVX
00413 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00414 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00415 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
00416 #endif
00417 
00418 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
00419 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
00420 // Direct access to the struct members fixed bug #62.
00421 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
00422 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
00423 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
00424 #elif EIGEN_COMP_MSVC_STRICT
00425 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
00426 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
00427 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
00428 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
00429 #else
00430 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
00431 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
00432 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
00433 #endif
00434 
00435 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
00436 { return _mm_shuffle_ps(a,a,0x1B); }
00437 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
00438 { return _mm_shuffle_pd(a,a,0x1); }
00439 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
00440 { return _mm_shuffle_epi32(a,0x1B); }
00441 
00442 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
00443 {
00444   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
00445   return _mm_and_ps(a,mask);
00446 }
00447 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
00448 {
00449   const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
00450   return _mm_and_pd(a,mask);
00451 }
00452 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
00453 {
00454   #ifdef EIGEN_VECTORIZE_SSSE3
00455   return _mm_abs_epi32(a);
00456   #else
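  // aux is 0 for non-negative lanes and -1 (all ones) for negative lanes, so
  // (a ^ aux) - aux computes the absolute value via two's-complement negation.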
00457   Packet4i aux = _mm_srai_epi32(a,31);
00458   return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
00459   #endif
00460 }
00461 
00462 // with AVX, the default implementations based on pload1 are faster
00463 #ifndef __AVX__
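// pbroadcast4 loads a[0..3] and broadcasts each of them into a0, a1, a2, a3 respectively.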
00464 template<> EIGEN_STRONG_INLINE void
00465 pbroadcast4<Packet4f>(const float *a,
00466                       Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
00467 {
00468   a3 = pload<Packet4f>(a);
00469   a0 = vec4f_swizzle1(a3, 0,0,0,0);
00470   a1 = vec4f_swizzle1(a3, 1,1,1,1);
00471   a2 = vec4f_swizzle1(a3, 2,2,2,2);
00472   a3 = vec4f_swizzle1(a3, 3,3,3,3);
00473 }
00474 template<> EIGEN_STRONG_INLINE void
00475 pbroadcast4<Packet2d>(const double *a,
00476                       Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
00477 {
00478 #ifdef EIGEN_VECTORIZE_SSE3
00479   a0 = _mm_loaddup_pd(a+0);
00480   a1 = _mm_loaddup_pd(a+1);
00481   a2 = _mm_loaddup_pd(a+2);
00482   a3 = _mm_loaddup_pd(a+3);
00483 #else
00484   a1 = pload<Packet2d>(a);
00485   a0 = vec2d_swizzle1(a1, 0,0);
00486   a1 = vec2d_swizzle1(a1, 1,1);
00487   a3 = pload<Packet2d>(a+2);
00488   a2 = vec2d_swizzle1(a3, 0,0);
00489   a3 = vec2d_swizzle1(a3, 1,1);
00490 #endif
00491 }
00492 #endif
00493 
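// punpackp broadcasts each element of vecs[0] into its own packet:
// vecs[k] becomes {vecs[0][k], vecs[0][k], vecs[0][k], vecs[0][k]} for k = 0..3.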
00494 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
00495 {
00496   vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
00497   vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
00498   vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
00499   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
00500 }
00501 
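// preduxp returns a packet whose i-th entry is the horizontal sum of the elements of vecs[i].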
00502 #ifdef EIGEN_VECTORIZE_SSE3
00503 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00504 {
00505   return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
00506 }
00507 
00508 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
00509 {
00510   return _mm_hadd_pd(vecs[0], vecs[1]);
00511 }
00512 
00513 #else
00514 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
00515 {
00516   Packet4f tmp0, tmp1, tmp2;
00517   tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
00518   tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
00519   tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
00520   tmp0 = _mm_add_ps(tmp0, tmp1);
00521   tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
00522   tmp1 = _mm_add_ps(tmp1, tmp2);
00523   tmp2 = _mm_movehl_ps(tmp1, tmp0);
00524   tmp0 = _mm_movelh_ps(tmp0, tmp1);
00525   return _mm_add_ps(tmp0, tmp2);
00526 }
00527 
00528 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
00529 {
00530   return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
00531 }
00532 #endif  // SSE3
00533 
00534 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
00535 {
00536   // Disable the SSE3 _mm_hadd_ps path, which is extremely slow on all existing Intel architectures
00537   // (from Nehalem to Haswell).
00538 // #ifdef EIGEN_VECTORIZE_SSE3
00539 //   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
00540 //   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
00541 // #else
00542   Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
00543   return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00544 // #endif
00545 }
00546 
00547 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
00548 {
00549   // Disable the SSE3 _mm_hadd_pd path, which is extremely slow on all existing Intel architectures
00550   // (from Nehalem to Haswell).
00551 // #ifdef EIGEN_VECTORIZE_SSE3
00552 //   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
00553 // #else
00554   return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
00555 // #endif
00556 }
00557 
00558 #ifdef EIGEN_VECTORIZE_SSSE3
00559 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00560 {
00561   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
00562 }
00563 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00564 {
00565   Packet4i tmp0 = _mm_hadd_epi32(a,a);
00566   return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
00567 }
00568 #else
00569 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
00570 {
00571   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
00572   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
00573 }
00574 
00575 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
00576 {
00577   Packet4i tmp0, tmp1, tmp2;
00578   tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
00579   tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
00580   tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
00581   tmp0 = _mm_add_epi32(tmp0, tmp1);
00582   tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
00583   tmp1 = _mm_add_epi32(tmp1, tmp2);
00584   tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
00585   tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
00586   return _mm_add_epi32(tmp0, tmp2);
00587 }
00588 #endif
00589 // Other reduction functions:
00590 
00591 // mul
00592 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
00593 {
00594   Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
00595   return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00596 }
00597 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
00598 {
00599   return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
00600 }
00601 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
00602 {
00603   // after some experiments, it seems this is the fastest way to implement it
00604   // for GCC (e.g., reusing pmul is very slow!)
00605   // TODO try to call _mm_mul_epu32 directly
00606   EIGEN_ALIGN16 int aux[4];
00607   pstore(aux, a);
00608   return (aux[0] * aux[1]) * (aux[2] * aux[3]);
00609 }
00610 
00611 // min
00612 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
00613 {
00614   Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
00615   return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00616 }
00617 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
00618 {
00619   return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
00620 }
00621 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
00622 {
00623 #ifdef EIGEN_VECTORIZE_SSE4_1
00624   Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
00625   return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
00626 #else
00627   // after some experiments, it seems this is the fastest way to implement it
00628   // for GCC (e.g., it does not like using std::min after the pstore!)
00629   EIGEN_ALIGN16 int aux[4];
00630   pstore(aux, a);
00631   int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
00632   int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
00633   return aux0<aux2 ? aux0 : aux2;
00634 #endif // EIGEN_VECTORIZE_SSE4_1
00635 }
00636 
00637 // max
00638 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
00639 {
00640   Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
00641   return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
00642 }
00643 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
00644 {
00645   return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
00646 }
00647 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
00648 {
00649 #ifdef EIGEN_VECTORIZE_SSE4_1
00650   Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
00651   return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
00652 #else
00653   // after some experiments, it seems this is the fastest way to implement it
00654   // for GCC (e.g., it does not like using std::max after the pstore!)
00655   EIGEN_ALIGN16 int aux[4];
00656   pstore(aux, a);
00657   int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
00658   int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
00659   return aux0>aux2 ? aux0 : aux2;
00660 #endif // EIGEN_VECTORIZE_SSE4_1
00661 }
00662 
00663 #if EIGEN_COMP_GNUC
00664 // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
00665 // {
00666 //   Packet4f res = b;
00667 //   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
00668 //   return res;
00669 // }
00670 // EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
00671 // {
00672 //   Packet4i res = a;
00673 //   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
00674 //   return res;
00675 // }
00676 #endif
00677 
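// palign_impl<Offset>::run(first, second) shifts the concatenation [first, second] left by
// Offset elements and stores the result in first, i.e. first[i] becomes first[i+Offset]
// for i+Offset < size, and second[i+Offset-size] otherwise.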
00678 #ifdef EIGEN_VECTORIZE_SSSE3
00679 // SSSE3 versions
00680 template<int Offset>
00681 struct palign_impl<Offset,Packet4f>
00682 {
00683   static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
00684   {
00685     if (Offset!=0)
00686       first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
00687   }
00688 };
00689 
00690 template<int Offset>
00691 struct palign_impl<Offset,Packet4i>
00692 {
00693   static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
00694   {
00695     if (Offset!=0)
00696       first = _mm_alignr_epi8(second,first, Offset*4);
00697   }
00698 };
00699 
00700 template<int Offset>
00701 struct palign_impl<Offset,Packet2d>
00702 {
00703   static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
00704   {
00705     if (Offset==1)
00706       first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
00707   }
00708 };
00709 #else
00710 // SSE2 versions
00711 template<int Offset>
00712 struct palign_impl<Offset,Packet4f>
00713 {
00714   static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
00715   {
00716     if (Offset==1)
00717     {
00718       first = _mm_move_ss(first,second);
00719       first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
00720     }
00721     else if (Offset==2)
00722     {
00723       first = _mm_movehl_ps(first,first);
00724       first = _mm_movelh_ps(first,second);
00725     }
00726     else if (Offset==3)
00727     {
00728       first = _mm_move_ss(first,second);
00729       first = _mm_shuffle_ps(first,second,0x93);
00730     }
00731   }
00732 };
00733 
00734 template<int Offset>
00735 struct palign_impl<Offset,Packet4i>
00736 {
00737   static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
00738   {
00739     if (Offset==1)
00740     {
00741       first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00742       first = _mm_shuffle_epi32(first,0x39);
00743     }
00744     else if (Offset==2)
00745     {
00746       first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
00747       first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00748     }
00749     else if (Offset==3)
00750     {
00751       first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
00752       first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
00753     }
00754   }
00755 };
00756 
00757 template<int Offset>
00758 struct palign_impl<Offset,Packet2d>
00759 {
00760   static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
00761   {
00762     if (Offset==1)
00763     {
00764       first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
00765       first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
00766     }
00767   }
00768 };
00769 #endif
00770 
00771 EIGEN_DEVICE_FUNC inline void
00772 ptranspose(PacketBlock<Packet4f,4>& kernel) {
00773   _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
00774 }
00775 
00776 EIGEN_DEVICE_FUNC inline void
00777 ptranspose(PacketBlock<Packet2d,2>& kernel) {
00778   __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
00779   kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
00780   kernel.packet[1] = tmp;
00781 }
00782 
00783 EIGEN_DEVICE_FUNC inline void
00784 ptranspose(PacketBlock<Packet4i,4>& kernel) {
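  // 4x4 integer transpose: the 32-bit unpacks interleave rows pairwise, then the 64-bit
  // unpacks combine the halves so that packet[i] ends up holding column i of the original block.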
00785   __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
00786   __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
00787   __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
00788   __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
00789 
00790   kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
00791   kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
00792   kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
00793   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
00794 }
00795 
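// pblend selects, for each lane, thenPacket where ifPacket.select is non-zero and
// elsePacket where it is zero (false_mask below is set for the zero lanes).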
00796 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
00797   const __m128i zero = _mm_setzero_si128();
00798   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
00799   __m128i false_mask = _mm_cmpeq_epi32(select, zero);
00800 #ifdef EIGEN_VECTORIZE_SSE4_1
00801   return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
00802 #else
00803   return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
00804 #endif
00805 }
00806 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
00807   const __m128 zero = _mm_setzero_ps();
00808   const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
00809   __m128 false_mask = _mm_cmpeq_ps(select, zero);
00810 #ifdef EIGEN_VECTORIZE_SSE4_1
00811   return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
00812 #else
00813   return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
00814 #endif
00815 }
00816 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
00817   const __m128d zero = _mm_setzero_pd();
00818   const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
00819   __m128d false_mask = _mm_cmpeq_pd(select, zero);
00820 #ifdef EIGEN_VECTORIZE_SSE4_1
00821   return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
00822 #else
00823   return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
00824 #endif
00825 }
00826 
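// pinsertfirst/pinsertlast return a copy of 'a' with its first/last lane replaced by 'b'.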
00827 template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
00828 {
00829 #ifdef EIGEN_VECTORIZE_SSE4_1
00830   return _mm_blend_ps(a,pset1<Packet4f>(b),1);
00831 #else
00832   return _mm_move_ss(a, _mm_load_ss(&b));
00833 #endif
00834 }
00835 
00836 template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
00837 {
00838 #ifdef EIGEN_VECTORIZE_SSE4_1
00839   return _mm_blend_pd(a,pset1<Packet2d>(b),1);
00840 #else
00841   return _mm_move_sd(a, _mm_load_sd(&b));
00842 #endif
00843 }
00844 
00845 template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
00846 {
00847 #ifdef EIGEN_VECTORIZE_SSE4_1
00848   return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
00849 #else
00850   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
00851   return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
00852 #endif
00853 }
00854 
00855 template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
00856 {
00857 #ifdef EIGEN_VECTORIZE_SSE4_1
00858   return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
00859 #else
00860   const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
00861   return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
00862 #endif
00863 }
00864 
00865 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
00866 #ifdef __FMA__
00867 template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
00868   return ::fmaf(a,b,c);
00869 }
00870 template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
00871   return ::fma(a,b,c);
00872 }
00873 #endif
00874 
00875 } // end namespace internal
00876 
00877 } // end namespace Eigen
00878 
00879 #endif // EIGEN_PACKET_MATH_SSE_H