// Eigen 3.3.3
00001 // This file is part of Eigen, a lightweight C++ template library 00002 // for linear algebra. 00003 // 00004 // Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) 00005 // 00006 // This Source Code Form is subject to the terms of the Mozilla 00007 // Public License v. 2.0. If a copy of the MPL was not distributed 00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 00009 00010 #ifndef EIGEN_COMPLEX_AVX_H 00011 #define EIGEN_COMPLEX_AVX_H 00012 00013 namespace Eigen { 00014 00015 namespace internal { 00016 00017 //---------- float ---------- 00018 struct Packet4cf 00019 { 00020 EIGEN_STRONG_INLINE Packet4cf() {} 00021 EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {} 00022 __m256 v; 00023 }; 00024 00025 template<> struct packet_traits<std::complex<float> > : default_packet_traits 00026 { 00027 typedef Packet4cf type; 00028 typedef Packet2cf half; 00029 enum { 00030 Vectorizable = 1, 00031 AlignedOnScalar = 1, 00032 size = 4, 00033 HasHalfPacket = 1, 00034 00035 HasAdd = 1, 00036 HasSub = 1, 00037 HasMul = 1, 00038 HasDiv = 1, 00039 HasNegate = 1, 00040 HasAbs = 0, 00041 HasAbs2 = 0, 00042 HasMin = 0, 00043 HasMax = 0, 00044 HasSetLinear = 0 00045 }; 00046 }; 00047 00048 template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; 00049 00050 template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } 00051 template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } 00052 template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) 00053 { 00054 return Packet4cf(pnegate(a.v)); 00055 } 00056 template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) 00057 { 00058 const __m256 mask = 
_mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); 00059 return Packet4cf(_mm256_xor_ps(a.v,mask)); 00060 } 00061 00062 template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b) 00063 { 00064 __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v); 00065 __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); 00066 __m256 result = _mm256_addsub_ps(tmp1, tmp2); 00067 return Packet4cf(result); 00068 } 00069 00070 template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } 00071 template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } 00072 template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } 00073 template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } 00074 00075 template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); } 00076 template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); } 00077 00078 00079 template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) 00080 { 00081 return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); 00082 } 00083 00084 template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) 00085 { 00086 // FIXME The following might be optimized using _mm256_movedup_pd 00087 Packet2cf a = 
ploaddup<Packet2cf>(from); 00088 Packet2cf b = ploaddup<Packet2cf>(from+1); 00089 return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1)); 00090 } 00091 00092 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } 00093 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } 00094 00095 template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride) 00096 { 00097 return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]), 00098 std::imag(from[2*stride]), std::real(from[2*stride]), 00099 std::imag(from[1*stride]), std::real(from[1*stride]), 00100 std::imag(from[0*stride]), std::real(from[0*stride]))); 00101 } 00102 00103 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride) 00104 { 00105 __m128 low = _mm256_extractf128_ps(from.v, 0); 00106 to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), 00107 _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); 00108 to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), 00109 _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); 00110 00111 __m128 high = _mm256_extractf128_ps(from.v, 1); 00112 to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), 00113 _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); 00114 to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), 00115 _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); 00116 00117 } 00118 00119 template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) 00120 { 00121 return 
pfirst(Packet2cf(_mm256_castps256_ps128(a.v))); 00122 } 00123 00124 template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { 00125 __m128 low = _mm256_extractf128_ps(a.v, 0); 00126 __m128 high = _mm256_extractf128_ps(a.v, 1); 00127 __m128d lowd = _mm_castps_pd(low); 00128 __m128d highd = _mm_castps_pd(high); 00129 low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); 00130 high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); 00131 __m256 result = _mm256_setzero_ps(); 00132 result = _mm256_insertf128_ps(result, low, 1); 00133 result = _mm256_insertf128_ps(result, high, 0); 00134 return Packet4cf(result); 00135 } 00136 00137 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) 00138 { 00139 return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)), 00140 Packet2cf(_mm256_extractf128_ps(a.v,1)))); 00141 } 00142 00143 template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs) 00144 { 00145 Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); 00146 Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); 00147 t0 = _mm256_hadd_ps(t0,t1); 00148 Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); 00149 Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); 00150 t2 = _mm256_hadd_ps(t2,t3); 00151 00152 t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); 00153 t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); 00154 00155 return Packet4cf(_mm256_add_ps(t1,t3)); 00156 } 00157 00158 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) 00159 { 00160 return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), 00161 Packet2cf(_mm256_extractf128_ps(a.v, 1)))); 00162 } 00163 00164 template<int Offset> 00165 struct palign_impl<Offset,Packet4cf> 00166 { 00167 static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) 00168 { 00169 if (Offset==0) 
return; 00170 palign_impl<Offset*2,Packet8f>::run(first.v, second.v); 00171 } 00172 }; 00173 00174 template<> struct conj_helper<Packet4cf, Packet4cf, false,true> 00175 { 00176 EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const 00177 { return padd(pmul(x,y),c); } 00178 00179 EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const 00180 { 00181 return internal::pmul(a, pconj(b)); 00182 } 00183 }; 00184 00185 template<> struct conj_helper<Packet4cf, Packet4cf, true,false> 00186 { 00187 EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const 00188 { return padd(pmul(x,y),c); } 00189 00190 EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const 00191 { 00192 return internal::pmul(pconj(a), b); 00193 } 00194 }; 00195 00196 template<> struct conj_helper<Packet4cf, Packet4cf, true,true> 00197 { 00198 EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const 00199 { return padd(pmul(x,y),c); } 00200 00201 EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const 00202 { 00203 return pconj(internal::pmul(a, b)); 00204 } 00205 }; 00206 00207 template<> struct conj_helper<Packet8f, Packet4cf, false,false> 00208 { 00209 EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const 00210 { return padd(c, pmul(x,y)); } 00211 00212 EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const 00213 { return Packet4cf(Eigen::internal::pmul(x, y.v)); } 00214 }; 00215 00216 template<> struct conj_helper<Packet4cf, Packet8f, false,false> 00217 { 00218 EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const 00219 { return padd(c, pmul(x,y)); } 00220 00221 EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const 00222 { return 
Packet4cf(Eigen::internal::pmul(x.v, y)); } 00223 }; 00224 00225 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) 00226 { 00227 Packet4cf num = pmul(a, pconj(b)); 00228 __m256 tmp = _mm256_mul_ps(b.v, b.v); 00229 __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); 00230 __m256 denom = _mm256_add_ps(tmp, tmp2); 00231 return Packet4cf(_mm256_div_ps(num.v, denom)); 00232 } 00233 00234 template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) 00235 { 00236 return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); 00237 } 00238 00239 //---------- double ---------- 00240 struct Packet2cd 00241 { 00242 EIGEN_STRONG_INLINE Packet2cd() {} 00243 EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {} 00244 __m256d v; 00245 }; 00246 00247 template<> struct packet_traits<std::complex<double> > : default_packet_traits 00248 { 00249 typedef Packet2cd type; 00250 typedef Packet1cd half; 00251 enum { 00252 Vectorizable = 1, 00253 AlignedOnScalar = 0, 00254 size = 2, 00255 HasHalfPacket = 1, 00256 00257 HasAdd = 1, 00258 HasSub = 1, 00259 HasMul = 1, 00260 HasDiv = 1, 00261 HasNegate = 1, 00262 HasAbs = 0, 00263 HasAbs2 = 0, 00264 HasMin = 0, 00265 HasMax = 0, 00266 HasSetLinear = 0 00267 }; 00268 }; 00269 00270 template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; 00271 00272 template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } 00273 template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } 00274 template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } 00275 template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) 00276 { 00277 const __m256d mask = 
_mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); 00278 return Packet2cd(_mm256_xor_pd(a.v,mask)); 00279 } 00280 00281 template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b) 00282 { 00283 __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0); 00284 __m256d even = _mm256_mul_pd(tmp1, b.v); 00285 __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF); 00286 __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5); 00287 __m256d odd = _mm256_mul_pd(tmp2, tmp3); 00288 return Packet2cd(_mm256_addsub_pd(even, odd)); 00289 } 00290 00291 template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } 00292 template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } 00293 template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } 00294 template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } 00295 00296 template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from) 00297 { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); } 00298 template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) 00299 { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); } 00300 00301 template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) 00302 { 00303 // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though) 00304 // return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from)); 00305 return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from)); 00306 } 00307 00308 
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); } 00309 00310 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } 00311 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } 00312 00313 template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride) 00314 { 00315 return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]), 00316 std::imag(from[0*stride]), std::real(from[0*stride]))); 00317 } 00318 00319 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride) 00320 { 00321 __m128d low = _mm256_extractf128_pd(from.v, 0); 00322 to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1))); 00323 __m128d high = _mm256_extractf128_pd(from.v, 1); 00324 to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1))); 00325 } 00326 00327 template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) 00328 { 00329 __m128d low = _mm256_extractf128_pd(a.v, 0); 00330 EIGEN_ALIGN16 double res[2]; 00331 _mm_store_pd(res, low); 00332 return std::complex<double>(res[0],res[1]); 00333 } 00334 00335 template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { 00336 __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1); 00337 return Packet2cd(result); 00338 } 00339 00340 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) 00341 { 00342 return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)), 00343 
Packet1cd(_mm256_extractf128_pd(a.v,1)))); 00344 } 00345 00346 template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs) 00347 { 00348 Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); 00349 Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); 00350 00351 return Packet2cd(_mm256_add_pd(t0,t1)); 00352 } 00353 00354 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) 00355 { 00356 return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), 00357 Packet1cd(_mm256_extractf128_pd(a.v,1)))); 00358 } 00359 00360 template<int Offset> 00361 struct palign_impl<Offset,Packet2cd> 00362 { 00363 static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) 00364 { 00365 if (Offset==0) return; 00366 palign_impl<Offset*2,Packet4d>::run(first.v, second.v); 00367 } 00368 }; 00369 00370 template<> struct conj_helper<Packet2cd, Packet2cd, false,true> 00371 { 00372 EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const 00373 { return padd(pmul(x,y),c); } 00374 00375 EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const 00376 { 00377 return internal::pmul(a, pconj(b)); 00378 } 00379 }; 00380 00381 template<> struct conj_helper<Packet2cd, Packet2cd, true,false> 00382 { 00383 EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const 00384 { return padd(pmul(x,y),c); } 00385 00386 EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const 00387 { 00388 return internal::pmul(pconj(a), b); 00389 } 00390 }; 00391 00392 template<> struct conj_helper<Packet2cd, Packet2cd, true,true> 00393 { 00394 EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const 00395 { return padd(pmul(x,y),c); } 00396 00397 EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const 00398 { 00399 
return pconj(internal::pmul(a, b)); 00400 } 00401 }; 00402 00403 template<> struct conj_helper<Packet4d, Packet2cd, false,false> 00404 { 00405 EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const 00406 { return padd(c, pmul(x,y)); } 00407 00408 EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const 00409 { return Packet2cd(Eigen::internal::pmul(x, y.v)); } 00410 }; 00411 00412 template<> struct conj_helper<Packet2cd, Packet4d, false,false> 00413 { 00414 EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const 00415 { return padd(c, pmul(x,y)); } 00416 00417 EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const 00418 { return Packet2cd(Eigen::internal::pmul(x.v, y)); } 00419 }; 00420 00421 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) 00422 { 00423 Packet2cd num = pmul(a, pconj(b)); 00424 __m256d tmp = _mm256_mul_pd(b.v, b.v); 00425 __m256d denom = _mm256_hadd_pd(tmp, tmp); 00426 return Packet2cd(_mm256_div_pd(num.v, denom)); 00427 } 00428 00429 template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) 00430 { 00431 return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5)); 00432 } 00433 00434 EIGEN_DEVICE_FUNC inline void 00435 ptranspose(PacketBlock<Packet4cf,4>& kernel) { 00436 __m256d P0 = _mm256_castps_pd(kernel.packet[0].v); 00437 __m256d P1 = _mm256_castps_pd(kernel.packet[1].v); 00438 __m256d P2 = _mm256_castps_pd(kernel.packet[2].v); 00439 __m256d P3 = _mm256_castps_pd(kernel.packet[3].v); 00440 00441 __m256d T0 = _mm256_shuffle_pd(P0, P1, 15); 00442 __m256d T1 = _mm256_shuffle_pd(P0, P1, 0); 00443 __m256d T2 = _mm256_shuffle_pd(P2, P3, 15); 00444 __m256d T3 = _mm256_shuffle_pd(P2, P3, 0); 00445 00446 kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32)); 00447 kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 
49)); 00448 kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32)); 00449 kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49)); 00450 } 00451 00452 EIGEN_DEVICE_FUNC inline void 00453 ptranspose(PacketBlock<Packet2cd,2>& kernel) { 00454 __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4)); 00455 kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4)); 00456 kernel.packet[0].v = tmp; 00457 } 00458 00459 template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b) 00460 { 00461 return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2)); 00462 } 00463 00464 template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b) 00465 { 00466 return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2)); 00467 } 00468 00469 template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b) 00470 { 00471 return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6))); 00472 } 00473 00474 template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b) 00475 { 00476 return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2))); 00477 } 00478 00479 } // end namespace internal 00480 00481 } // end namespace Eigen 00482 00483 #endif // EIGEN_COMPLEX_AVX_H