Eigen 3.3.3
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif

typedef __m256  Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;

template<> struct is_arithmetic<__m256>  { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };

#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
  const Packet8f p8f_##NAME = pset1<Packet8f>(X)

#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
  const Packet4d p4d_##NAME = pset1<Packet4d>(X)

#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \
  const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X))

#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \
  const Packet8i p8i_##NAME = pset1<Packet8i>(X)

// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet8f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,
    HasHalfPacket = 1,

    HasDiv   = 1,
    HasSin   = EIGEN_FAST_MATH,
    HasCos   = 0,
    HasLog   = 1,
    HasExp   = 1,
    HasSqrt  = 1,
    HasRsqrt = 1,
    HasTanh  = EIGEN_FAST_MATH,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil  = 1
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet4d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket = 1,

    HasDiv   = 1,
    HasExp   = 1,
    HasSqrt  = 1,
    HasRsqrt = 1,
    HasBlend = 1,
    HasRound = 1,
    HasFloor = 1,
    HasCeil  = 1
  };
};
#endif

template<> struct scalar_div_cost<float,true>  { enum { value = 14 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
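// Usage sketch (illustrative only, not part of Eigen): the wrappers defined
// in this file are the building blocks Eigen's expression evaluators emit.
// Assuming an AVX build, adding two float arrays eight elements at a time
// would look like:
//
//   void add8(const float* x, const float* y, float* out) {
//     Packet8f a = ploadu<Packet8f>(x);  // unaligned load of 8 floats
//     Packet8f b = ploadu<Packet8f>(y);
//     pstoreu(out, padd(a, b));          // out[i] = x[i] + y[i], i = 0..7
//   }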
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
   use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet8i type;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8
  };
};
*/

template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };

template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }

template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }

template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float&  a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }

template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}

template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{
  eigen_assert(false && "packet integer division is not supported by AVX");
  return pset1<Packet8i>(0);
}
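// When compiled with FMA support (e.g. -mfma), pmadd(a,b,c) below maps to a
// fused multiply-add: a*b+c computed in one instruction with a single
// rounding step, which is both faster and slightly more accurate than a
// separate pmul followed by padd.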
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
  // and gcc stupidly generates a vfmadd132ps instruction,
  // so let's force it to generate the vfmadd231ps instruction, since the most
  // common use case is to accumulate the result of the product.
  Packet8f res = c;
  __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_ps(a,b,c);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) )
  // see above
  Packet4d res = c;
  __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b));
  return res;
#else
  return _mm256_fmadd_pd(a,b,c);
#endif
}
#endif

template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }

template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
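// The bitwise helpers below operate on the raw bit patterns of the float and
// double lanes rather than on their numeric values. Eigen uses them for sign
// and mask manipulation; for instance, pabs further down clears the sign bit
// of each float lane by AND-ing it with 0x7FFFFFFF.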
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }

template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int*    from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }

template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float*  from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }

// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
  // TODO try to find a way to avoid the need of a temporary register
  // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
  // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
  // return _mm256_unpacklo_ps(tmp,tmp);

  // _mm256_insertf128_ps is very slow on Haswell, thus:
  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
  // mimic an "inplace" permutation of the lower 128bits using a blend
  tmp = _mm256_blend_ps(tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
  // then we can perform a consistent permutation on the global register to get everything in shape:
  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
  return _mm256_permute_pd(tmp, 3<<2);
}

// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
{
  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
}

template<> EIGEN_STRONG_INLINE void pstore<float> (float*  to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>   (int*    to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }

template<> EIGEN_STRONG_INLINE void pstoreu<float> (float*  to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>   (int*    to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
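// Note on the load/store contract above: pload/pstore require 32-byte aligned
// addresses (see alignment=Aligned32 in unpacket_traits), while ploadu/pstoreu
// accept any address. A hypothetical caller guaranteeing alignment could write:
//
//   EIGEN_ALIGN32 float buf[9];
//   pstore(buf, pset1<Packet8f>(1.0f));      // ok: buf is 32-byte aligned
//   pstoreu(buf + 1, pset1<Packet8f>(2.0f)); // buf+1 is misaligned: pstoreu only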
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
{
  return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
                       from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
{
  return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
{
  __m128 low = _mm256_extractf128_ps(from, 0);
  to[stride*0] = _mm_cvtss_f32(low);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));

  __m128 high = _mm256_extractf128_ps(from, 1);
  to[stride*4] = _mm_cvtss_f32(high);
  to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
  to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
  to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
{
  __m128d low = _mm256_extractf128_pd(from, 0);
  to[stride*0] = _mm_cvtsd_f64(low);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
  __m128d high = _mm256_extractf128_pd(from, 1);
  to[stride*2] = _mm_cvtsd_f64(high);
  to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
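// Strided-access sketch (illustrative only): pgather/pscatter above let the
// evaluators vectorize non-contiguous accesses, e.g. a row of a column-major
// matrix. With a column-major float matrix mat of leading dimension ld (names
// assumed here, not from this file), row r could be read and written back as:
//
//   Packet8f row = pgather<float, Packet8f>(&mat[r], ld); // mat[r + k*ld], k = 0..7
//   pscatter<float, Packet8f>(&out[r], row, ld);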
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
  Packet8f pa = pset1<Packet8f>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
  Packet4d pa = pset1<Packet4d>(a);
  pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
  Packet8i pa = pset1<Packet8i>(a);
  pstore(to, pa);
}

#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float> (const float*  addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>   (const int*    addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif

template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}

template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
  __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
  return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
  __m256d tmp = _mm256_shuffle_pd(a,a,5);
  return _mm256_permute2f128_pd(tmp, tmp, 1);

  // Equivalent alternative, kept for reference:
  //   __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
  //   return _mm256_permute_pd(swap_halves,5);
}

// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
  const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
  const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm256_and_pd(a,mask);
}

// preduxp should be ok
// FIXME: why is this ok? why isn't the simple implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
  __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
  __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
  __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
  __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);

  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);

  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);

  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
  __m256 sum4 = _mm256_add_ps(perm4, hsum8);

  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);

  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
  return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
  Packet4d tmp0, tmp1;

  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));

  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));

  return _mm256_blend_pd(tmp0, tmp1, 0xC);
}

template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
  return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
  return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}

template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
{
  return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}
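// The predux* reductions collapse one packet to a single scalar. The recurring
// AVX pattern is to fold the two 128-bit halves of the register together with
// one instruction and then finish within a 128-bit register: predux<Packet8f>
// above adds the halves with a single _mm_add_ps and defers to the SSE
// Packet4f reduction, while predux_mul, predux_min and predux_max below repeat
// the half-folding with mul/min/max followed by in-lane shuffles.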
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
  Packet8f tmp;
  tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
  Packet4d tmp;
  tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}

template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}

template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
  Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
  tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
  return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
  Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
  return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
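// palign_impl<Offset, Packet>::run(first, second) shifts the concatenation of
// first and second left by Offset scalars and stores the result in first.
// For example, for Packet8f with Offset==3:
//   first = {f0..f7}, second = {s0..s7}  =>  first = {f3, f4, f5, f6, f7, s0, s1, s2}.
// AVX1 has no single cross-lane element shift, hence the blend/permute
// sequences below.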
template<int Offset>
struct palign_impl<Offset,Packet8f>
{
  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_ps(first, second, 1);
      Packet8f tmp1 = _mm256_permute_ps(first, _MM_SHUFFLE(0,3,2,1));
      Packet8f tmp2 = _mm256_permute2f128_ps(tmp1, tmp1, 1);
      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_ps(first, second, 3);
      Packet8f tmp1 = _mm256_permute_ps(first, _MM_SHUFFLE(1,0,3,2));
      Packet8f tmp2 = _mm256_permute2f128_ps(tmp1, tmp1, 1);
      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_ps(first, second, 7);
      Packet8f tmp1 = _mm256_permute_ps(first, _MM_SHUFFLE(2,1,0,3));
      Packet8f tmp2 = _mm256_permute2f128_ps(tmp1, tmp1, 1);
      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
    }
    else if (Offset==4)
    {
      first = _mm256_blend_ps(first, second, 15);
      Packet8f tmp1 = _mm256_permute_ps(first, _MM_SHUFFLE(3,2,1,0));
      Packet8f tmp2 = _mm256_permute2f128_ps(tmp1, tmp1, 1);
      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
    }
    else if (Offset==5)
    {
      first = _mm256_blend_ps(first, second, 31);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps(first, _MM_SHUFFLE(0,3,2,1));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0x88);
    }
    else if (Offset==6)
    {
      first = _mm256_blend_ps(first, second, 63);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps(first, _MM_SHUFFLE(1,0,3,2));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xcc);
    }
    else if (Offset==7)
    {
      first = _mm256_blend_ps(first, second, 127);
      first = _mm256_permute2f128_ps(first, first, 1);
      Packet8f tmp = _mm256_permute_ps(first, _MM_SHUFFLE(2,1,0,3));
      first = _mm256_permute2f128_ps(tmp, tmp, 1);
      first = _mm256_blend_ps(tmp, first, 0xee);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4d>
{
  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
  {
    if (Offset==1)
    {
      first = _mm256_blend_pd(first, second, 1);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 0xA);
    }
    else if (Offset==2)
    {
      first = _mm256_blend_pd(first, second, 3);
      first = _mm256_permute2f128_pd(first, first, 1);
    }
    else if (Offset==3)
    {
      first = _mm256_blend_pd(first, second, 7);
      __m256d tmp = _mm256_permute_pd(first, 5);
      first = _mm256_permute2f128_pd(tmp, tmp, 1);
      first = _mm256_blend_pd(tmp, first, 5);
    }
  }
};
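// The ptranspose kernels below transpose a block of packets in registers. The
// 8x8 float version works in three stages: _mm256_unpacklo/hi_ps interleaves
// adjacent rows, _mm256_shuffle_ps assembles 4-element column fragments within
// each 128-bit lane, and _mm256_permute2f128_ps swaps the lanes to complete
// the cross-lane transpose.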
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,8>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
  __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
  __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
  __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
  __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet8f,4>& kernel) {
  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);

  __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
  __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
  __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
  __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));

  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4d,4>& kernel) {
  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);

  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}

// Per-lane select: returns thenPacket in the lanes where ifPacket.select is
// non-zero, and elsePacket in the others.
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
  const __m256 zero = _mm256_setzero_ps();
  const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
  const __m256d zero = _mm256_setzero_pd();
  const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
  return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}

// Replace the first (resp. last) scalar of a by b, leaving the other lanes intact.
template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)
{
  return _mm256_blend_ps(a,pset1<Packet8f>(b),1);
}

template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)
{
  return _mm256_blend_pd(a,pset1<Packet4d>(b),1);
}

template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)
{
  return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));
}

template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)
{
  return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_PACKET_MATH_AVX_H