// Eigen 3.3.3
00001 // This file is part of Eigen, a lightweight C++ template library 00002 // for linear algebra. 00003 // 00004 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> 00005 // 00006 // This Source Code Form is subject to the terms of the Mozilla 00007 // Public License v. 2.0. If a copy of the MPL was not distributed 00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 00009 00010 #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H 00011 #define EIGEN_PACKET_MATH_HALF_CUDA_H 00012 00013 00014 namespace Eigen { 00015 namespace internal { 00016 00017 // Most of the following operations require arch >= 3.0 00018 #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 00019 00020 template<> struct is_arithmetic<half2> { enum { value = true }; }; 00021 00022 template<> struct packet_traits<Eigen::half> : default_packet_traits 00023 { 00024 typedef half2 type; 00025 typedef half2 half; 00026 enum { 00027 Vectorizable = 1, 00028 AlignedOnScalar = 1, 00029 size=2, 00030 HasHalfPacket = 0, 00031 HasAdd = 1, 00032 HasMul = 1, 00033 HasDiv = 1, 00034 HasSqrt = 1, 00035 HasRsqrt = 1, 00036 HasExp = 1, 00037 HasLog = 1, 00038 HasLog1p = 1 00039 }; 00040 }; 00041 00042 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; 00043 00044 template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { 00045 return __half2half2(from); 00046 } 00047 00048 template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) { 00049 return *reinterpret_cast<const half2*>(from); 00050 } 00051 00052 template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) { 00053 return __halves2half2(from[0], from[1]); 00054 } 00055 00056 template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) { 00057 return __halves2half2(from[0], from[0]); 00058 } 00059 00060 
template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) { 00061 *reinterpret_cast<half2*>(to) = from; 00062 } 00063 00064 template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) { 00065 to[0] = __low2half(from); 00066 to[1] = __high2half(from); 00067 } 00068 00069 template<> 00070 __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) { 00071 #if __CUDA_ARCH__ >= 350 00072 return __ldg((const half2*)from); 00073 #else 00074 return __halves2half2(*(from+0), *(from+1)); 00075 #endif 00076 } 00077 00078 template<> 00079 __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) { 00080 #if __CUDA_ARCH__ >= 350 00081 return __halves2half2(__ldg(from+0), __ldg(from+1)); 00082 #else 00083 return __halves2half2(*(from+0), *(from+1)); 00084 #endif 00085 } 00086 00087 template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) { 00088 return __halves2half2(from[0*stride], from[1*stride]); 00089 } 00090 00091 template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) { 00092 to[stride*0] = __low2half(from); 00093 to[stride*1] = __high2half(from); 00094 } 00095 00096 template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) { 00097 return __low2half(a); 00098 } 00099 00100 template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { 00101 half2 result; 00102 result.x = a.x & 0x7FFF7FFF; 00103 return result; 00104 } 00105 00106 00107 __device__ EIGEN_STRONG_INLINE void 00108 ptranspose(PacketBlock<half2,2>& kernel) { 00109 __half a1 = __low2half(kernel.packet[0]); 00110 __half a2 = __high2half(kernel.packet[0]); 00111 __half b1 = __low2half(kernel.packet[1]); 00112 __half b2 = __high2half(kernel.packet[1]); 00113 kernel.packet[0] = __halves2half2(a1, b1); 
00114 kernel.packet[1] = __halves2half2(a2, b2); 00115 } 00116 00117 template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) { 00118 #if __CUDA_ARCH__ >= 530 00119 return __halves2half2(a, __hadd(a, __float2half(1.0f))); 00120 #else 00121 float f = __half2float(a) + 1.0f; 00122 return __halves2half2(a, __float2half(f)); 00123 #endif 00124 } 00125 00126 template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { 00127 #if __CUDA_ARCH__ >= 530 00128 return __hadd2(a, b); 00129 #else 00130 float a1 = __low2float(a); 00131 float a2 = __high2float(a); 00132 float b1 = __low2float(b); 00133 float b2 = __high2float(b); 00134 float r1 = a1 + b1; 00135 float r2 = a2 + b2; 00136 return __floats2half2_rn(r1, r2); 00137 #endif 00138 } 00139 00140 template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { 00141 #if __CUDA_ARCH__ >= 530 00142 return __hsub2(a, b); 00143 #else 00144 float a1 = __low2float(a); 00145 float a2 = __high2float(a); 00146 float b1 = __low2float(b); 00147 float b2 = __high2float(b); 00148 float r1 = a1 - b1; 00149 float r2 = a2 - b2; 00150 return __floats2half2_rn(r1, r2); 00151 #endif 00152 } 00153 00154 template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { 00155 #if __CUDA_ARCH__ >= 530 00156 return __hneg2(a); 00157 #else 00158 float a1 = __low2float(a); 00159 float a2 = __high2float(a); 00160 return __floats2half2_rn(-a1, -a2); 00161 #endif 00162 } 00163 00164 template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } 00165 00166 template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { 00167 #if __CUDA_ARCH__ >= 530 00168 return __hmul2(a, b); 00169 #else 00170 float a1 = __low2float(a); 00171 float a2 = __high2float(a); 00172 float b1 = __low2float(b); 00173 float b2 = __high2float(b); 00174 float r1 = a1 * b1; 00175 float r2 = a2 * b2; 00176 return __floats2half2_rn(r1, 
r2); 00177 #endif 00178 } 00179 00180 template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { 00181 #if __CUDA_ARCH__ >= 530 00182 return __hfma2(a, b, c); 00183 #else 00184 float a1 = __low2float(a); 00185 float a2 = __high2float(a); 00186 float b1 = __low2float(b); 00187 float b2 = __high2float(b); 00188 float c1 = __low2float(c); 00189 float c2 = __high2float(c); 00190 float r1 = a1 * b1 + c1; 00191 float r2 = a2 * b2 + c2; 00192 return __floats2half2_rn(r1, r2); 00193 #endif 00194 } 00195 00196 template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { 00197 float a1 = __low2float(a); 00198 float a2 = __high2float(a); 00199 float b1 = __low2float(b); 00200 float b2 = __high2float(b); 00201 float r1 = a1 / b1; 00202 float r2 = a2 / b2; 00203 return __floats2half2_rn(r1, r2); 00204 } 00205 00206 template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) { 00207 float a1 = __low2float(a); 00208 float a2 = __high2float(a); 00209 float b1 = __low2float(b); 00210 float b2 = __high2float(b); 00211 __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); 00212 __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); 00213 return __halves2half2(r1, r2); 00214 } 00215 00216 template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) { 00217 float a1 = __low2float(a); 00218 float a2 = __high2float(a); 00219 float b1 = __low2float(b); 00220 float b2 = __high2float(b); 00221 __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); 00222 __half r2 = a2 > b2 ? 
__high2half(a) : __high2half(b); 00223 return __halves2half2(r1, r2); 00224 } 00225 00226 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) { 00227 #if __CUDA_ARCH__ >= 530 00228 return __hadd(__low2half(a), __high2half(a)); 00229 #else 00230 float a1 = __low2float(a); 00231 float a2 = __high2float(a); 00232 return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); 00233 #endif 00234 } 00235 00236 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) { 00237 #if __CUDA_ARCH__ >= 530 00238 __half first = __low2half(a); 00239 __half second = __high2half(a); 00240 return __hgt(first, second) ? first : second; 00241 #else 00242 float a1 = __low2float(a); 00243 float a2 = __high2float(a); 00244 return a1 > a2 ? __low2half(a) : __high2half(a); 00245 #endif 00246 } 00247 00248 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) { 00249 #if __CUDA_ARCH__ >= 530 00250 __half first = __low2half(a); 00251 __half second = __high2half(a); 00252 return __hlt(first, second) ? first : second; 00253 #else 00254 float a1 = __low2float(a); 00255 float a2 = __high2float(a); 00256 return a1 < a2 ? 
__low2half(a) : __high2half(a); 00257 #endif 00258 } 00259 00260 template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) { 00261 #if __CUDA_ARCH__ >= 530 00262 return __hmul(__low2half(a), __high2half(a)); 00263 #else 00264 float a1 = __low2float(a); 00265 float a2 = __high2float(a); 00266 return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); 00267 #endif 00268 } 00269 00270 template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) { 00271 float a1 = __low2float(a); 00272 float a2 = __high2float(a); 00273 float r1 = log1pf(a1); 00274 float r2 = log1pf(a2); 00275 return __floats2half2_rn(r1, r2); 00276 } 00277 00278 #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 00279 00280 template<> __device__ EIGEN_STRONG_INLINE 00281 half2 plog<half2>(const half2& a) { 00282 return h2log(a); 00283 } 00284 00285 template<> __device__ EIGEN_STRONG_INLINE 00286 half2 pexp<half2>(const half2& a) { 00287 return h2exp(a); 00288 } 00289 00290 template<> __device__ EIGEN_STRONG_INLINE 00291 half2 psqrt<half2>(const half2& a) { 00292 return h2sqrt(a); 00293 } 00294 00295 template<> __device__ EIGEN_STRONG_INLINE 00296 half2 prsqrt<half2>(const half2& a) { 00297 return h2rsqrt(a); 00298 } 00299 00300 #else 00301 00302 template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) { 00303 float a1 = __low2float(a); 00304 float a2 = __high2float(a); 00305 float r1 = logf(a1); 00306 float r2 = logf(a2); 00307 return __floats2half2_rn(r1, r2); 00308 } 00309 00310 template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) { 00311 float a1 = __low2float(a); 00312 float a2 = __high2float(a); 00313 float r1 = expf(a1); 00314 float r2 = expf(a2); 00315 return __floats2half2_rn(r1, r2); 00316 } 00317 00318 template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) { 00319 float a1 = __low2float(a); 00320 float a2 = 
__high2float(a); 00321 float r1 = sqrtf(a1); 00322 float r2 = sqrtf(a2); 00323 return __floats2half2_rn(r1, r2); 00324 } 00325 00326 template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) { 00327 float a1 = __low2float(a); 00328 float a2 = __high2float(a); 00329 float r1 = rsqrtf(a1); 00330 float r2 = rsqrtf(a2); 00331 return __floats2half2_rn(r1, r2); 00332 } 00333 00334 #endif 00335 00336 #elif defined EIGEN_VECTORIZE_AVX512 00337 00338 typedef struct { 00339 __m256i x; 00340 } Packet16h; 00341 00342 00343 template<> struct is_arithmetic<Packet16h> { enum { value = true }; }; 00344 00345 template <> 00346 struct packet_traits<half> : default_packet_traits { 00347 typedef Packet16h type; 00348 // There is no half-size packet for Packet16h. 00349 typedef Packet16h half; 00350 enum { 00351 Vectorizable = 1, 00352 AlignedOnScalar = 1, 00353 size = 16, 00354 HasHalfPacket = 0, 00355 HasAdd = 0, 00356 HasSub = 0, 00357 HasMul = 0, 00358 HasNegate = 0, 00359 HasAbs = 0, 00360 HasAbs2 = 0, 00361 HasMin = 0, 00362 HasMax = 0, 00363 HasConj = 0, 00364 HasSetLinear = 0, 00365 HasDiv = 0, 00366 HasSqrt = 0, 00367 HasRsqrt = 0, 00368 HasExp = 0, 00369 HasLog = 0, 00370 HasBlend = 0 00371 }; 00372 }; 00373 00374 00375 template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; 00376 00377 template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { 00378 Packet16h result; 00379 result.x = _mm256_set1_epi16(from.x); 00380 return result; 00381 } 00382 00383 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) { 00384 return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0))); 00385 } 00386 00387 template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) { 00388 Packet16h result; 00389 result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); 00390 
return result; 00391 } 00392 00393 template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) { 00394 Packet16h result; 00395 result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); 00396 return result; 00397 } 00398 00399 template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { 00400 _mm256_store_si256((__m256i*)to, from.x); 00401 } 00402 00403 template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) { 00404 _mm256_storeu_si256((__m256i*)to, from.x); 00405 } 00406 00407 template<> EIGEN_STRONG_INLINE Packet16h 00408 ploadquad(const Eigen::half* from) { 00409 Packet16h result; 00410 unsigned short a = from[0].x; 00411 unsigned short b = from[1].x; 00412 unsigned short c = from[2].x; 00413 unsigned short d = from[3].x; 00414 result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); 00415 return result; 00416 } 00417 00418 EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { 00419 #ifdef EIGEN_HAS_FP16_C 00420 return _mm512_cvtph_ps(a.x); 00421 #else 00422 EIGEN_ALIGN64 half aux[16]; 00423 pstore(aux, a); 00424 float f0(aux[0]); 00425 float f1(aux[1]); 00426 float f2(aux[2]); 00427 float f3(aux[3]); 00428 float f4(aux[4]); 00429 float f5(aux[5]); 00430 float f6(aux[6]); 00431 float f7(aux[7]); 00432 float f8(aux[8]); 00433 float f9(aux[9]); 00434 float fa(aux[10]); 00435 float fb(aux[11]); 00436 float fc(aux[12]); 00437 float fd(aux[13]); 00438 float fe(aux[14]); 00439 float ff(aux[15]); 00440 00441 return _mm512_set_ps( 00442 ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); 00443 #endif 00444 } 00445 00446 EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { 00447 #ifdef EIGEN_HAS_FP16_C 00448 Packet16h result; 00449 result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); 00450 return result; 00451 #else 00452 EIGEN_ALIGN64 float aux[16]; 00453 pstore(aux, a); 00454 half h0(aux[0]); 
00455 half h1(aux[1]); 00456 half h2(aux[2]); 00457 half h3(aux[3]); 00458 half h4(aux[4]); 00459 half h5(aux[5]); 00460 half h6(aux[6]); 00461 half h7(aux[7]); 00462 half h8(aux[8]); 00463 half h9(aux[9]); 00464 half ha(aux[10]); 00465 half hb(aux[11]); 00466 half hc(aux[12]); 00467 half hd(aux[13]); 00468 half he(aux[14]); 00469 half hf(aux[15]); 00470 00471 Packet16h result; 00472 result.x = _mm256_set_epi16( 00473 hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, 00474 h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); 00475 return result; 00476 #endif 00477 } 00478 00479 template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { 00480 Packet16f af = half2float(a); 00481 Packet16f bf = half2float(b); 00482 Packet16f rf = padd(af, bf); 00483 return float2half(rf); 00484 } 00485 00486 template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { 00487 Packet16f af = half2float(a); 00488 Packet16f bf = half2float(b); 00489 Packet16f rf = pmul(af, bf); 00490 return float2half(rf); 00491 } 00492 00493 template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) { 00494 Packet16f from_float = half2float(from); 00495 return half(predux(from_float)); 00496 } 00497 00498 template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) 00499 { 00500 Packet16h result; 00501 result.x = _mm256_set_epi16( 00502 from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, 00503 from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, 00504 from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, 00505 from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); 00506 return result; 00507 } 00508 00509 template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) 00510 { 00511 EIGEN_ALIGN64 half aux[16]; 00512 pstore(aux, from); 00513 
to[stride*0].x = aux[0].x; 00514 to[stride*1].x = aux[1].x; 00515 to[stride*2].x = aux[2].x; 00516 to[stride*3].x = aux[3].x; 00517 to[stride*4].x = aux[4].x; 00518 to[stride*5].x = aux[5].x; 00519 to[stride*6].x = aux[6].x; 00520 to[stride*7].x = aux[7].x; 00521 to[stride*8].x = aux[8].x; 00522 to[stride*9].x = aux[9].x; 00523 to[stride*10].x = aux[10].x; 00524 to[stride*11].x = aux[11].x; 00525 to[stride*12].x = aux[12].x; 00526 to[stride*13].x = aux[13].x; 00527 to[stride*14].x = aux[14].x; 00528 to[stride*15].x = aux[15].x; 00529 } 00530 00531 EIGEN_STRONG_INLINE void 00532 ptranspose(PacketBlock<Packet16h,16>& kernel) { 00533 __m256i a = kernel.packet[0].x; 00534 __m256i b = kernel.packet[1].x; 00535 __m256i c = kernel.packet[2].x; 00536 __m256i d = kernel.packet[3].x; 00537 __m256i e = kernel.packet[4].x; 00538 __m256i f = kernel.packet[5].x; 00539 __m256i g = kernel.packet[6].x; 00540 __m256i h = kernel.packet[7].x; 00541 __m256i i = kernel.packet[8].x; 00542 __m256i j = kernel.packet[9].x; 00543 __m256i k = kernel.packet[10].x; 00544 __m256i l = kernel.packet[11].x; 00545 __m256i m = kernel.packet[12].x; 00546 __m256i n = kernel.packet[13].x; 00547 __m256i o = kernel.packet[14].x; 00548 __m256i p = kernel.packet[15].x; 00549 00550 __m256i ab_07 = _mm256_unpacklo_epi16(a, b); 00551 __m256i cd_07 = _mm256_unpacklo_epi16(c, d); 00552 __m256i ef_07 = _mm256_unpacklo_epi16(e, f); 00553 __m256i gh_07 = _mm256_unpacklo_epi16(g, h); 00554 __m256i ij_07 = _mm256_unpacklo_epi16(i, j); 00555 __m256i kl_07 = _mm256_unpacklo_epi16(k, l); 00556 __m256i mn_07 = _mm256_unpacklo_epi16(m, n); 00557 __m256i op_07 = _mm256_unpacklo_epi16(o, p); 00558 00559 __m256i ab_8f = _mm256_unpackhi_epi16(a, b); 00560 __m256i cd_8f = _mm256_unpackhi_epi16(c, d); 00561 __m256i ef_8f = _mm256_unpackhi_epi16(e, f); 00562 __m256i gh_8f = _mm256_unpackhi_epi16(g, h); 00563 __m256i ij_8f = _mm256_unpackhi_epi16(i, j); 00564 __m256i kl_8f = _mm256_unpackhi_epi16(k, l); 00565 __m256i mn_8f = 
_mm256_unpackhi_epi16(m, n); 00566 __m256i op_8f = _mm256_unpackhi_epi16(o, p); 00567 00568 __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); 00569 __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); 00570 __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); 00571 __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); 00572 __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); 00573 __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); 00574 __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); 00575 __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); 00576 00577 __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); 00578 __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); 00579 __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); 00580 __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); 00581 __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); 00582 __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); 00583 __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); 00584 __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); 00585 00586 __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); 00587 __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); 00588 __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); 00589 __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); 00590 __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); 00591 __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); 00592 __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); 00593 __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); 00594 __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); 00595 __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); 00596 __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); 00597 __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); 00598 __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); 00599 __m256i abcdefgh_ef = 
_mm256_unpackhi_epi64(abcd_cf, efgh_cf); 00600 __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); 00601 __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); 00602 00603 // NOTE: no unpacklo/hi instr in this case, so using permute instr. 00604 __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); 00605 __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); 00606 __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); 00607 __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); 00608 __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); 00609 __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); 00610 __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); 00611 __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); 00612 __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); 00613 __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); 00614 __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); 00615 __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); 00616 __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); 00617 __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); 00618 __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); 00619 __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); 00620 00621 kernel.packet[0].x = a_p_0; 00622 kernel.packet[1].x = a_p_1; 00623 kernel.packet[2].x = a_p_2; 00624 kernel.packet[3].x = a_p_3; 00625 kernel.packet[4].x = a_p_4; 00626 kernel.packet[5].x = a_p_5; 00627 kernel.packet[6].x = a_p_6; 00628 kernel.packet[7].x = a_p_7; 00629 kernel.packet[8].x = a_p_8; 00630 kernel.packet[9].x = a_p_9; 00631 kernel.packet[10].x = a_p_a; 00632 kernel.packet[11].x = a_p_b; 00633 kernel.packet[12].x = 
a_p_c; 00634 kernel.packet[13].x = a_p_d; 00635 kernel.packet[14].x = a_p_e; 00636 kernel.packet[15].x = a_p_f; 00637 } 00638 00639 EIGEN_STRONG_INLINE void 00640 ptranspose(PacketBlock<Packet16h,8>& kernel) { 00641 EIGEN_ALIGN64 half in[8][16]; 00642 pstore<half>(in[0], kernel.packet[0]); 00643 pstore<half>(in[1], kernel.packet[1]); 00644 pstore<half>(in[2], kernel.packet[2]); 00645 pstore<half>(in[3], kernel.packet[3]); 00646 pstore<half>(in[4], kernel.packet[4]); 00647 pstore<half>(in[5], kernel.packet[5]); 00648 pstore<half>(in[6], kernel.packet[6]); 00649 pstore<half>(in[7], kernel.packet[7]); 00650 00651 EIGEN_ALIGN64 half out[8][16]; 00652 00653 for (int i = 0; i < 8; ++i) { 00654 for (int j = 0; j < 8; ++j) { 00655 out[i][j] = in[j][2*i]; 00656 } 00657 for (int j = 0; j < 8; ++j) { 00658 out[i][j+8] = in[j][2*i+1]; 00659 } 00660 } 00661 00662 kernel.packet[0] = pload<Packet16h>(out[0]); 00663 kernel.packet[1] = pload<Packet16h>(out[1]); 00664 kernel.packet[2] = pload<Packet16h>(out[2]); 00665 kernel.packet[3] = pload<Packet16h>(out[3]); 00666 kernel.packet[4] = pload<Packet16h>(out[4]); 00667 kernel.packet[5] = pload<Packet16h>(out[5]); 00668 kernel.packet[6] = pload<Packet16h>(out[6]); 00669 kernel.packet[7] = pload<Packet16h>(out[7]); 00670 } 00671 00672 EIGEN_STRONG_INLINE void 00673 ptranspose(PacketBlock<Packet16h,4>& kernel) { 00674 EIGEN_ALIGN64 half in[4][16]; 00675 pstore<half>(in[0], kernel.packet[0]); 00676 pstore<half>(in[1], kernel.packet[1]); 00677 pstore<half>(in[2], kernel.packet[2]); 00678 pstore<half>(in[3], kernel.packet[3]); 00679 00680 EIGEN_ALIGN64 half out[4][16]; 00681 00682 for (int i = 0; i < 4; ++i) { 00683 for (int j = 0; j < 4; ++j) { 00684 out[i][j] = in[j][4*i]; 00685 } 00686 for (int j = 0; j < 4; ++j) { 00687 out[i][j+4] = in[j][4*i+1]; 00688 } 00689 for (int j = 0; j < 4; ++j) { 00690 out[i][j+8] = in[j][4*i+2]; 00691 } 00692 for (int j = 0; j < 4; ++j) { 00693 out[i][j+12] = in[j][4*i+3]; 00694 } 00695 } 00696 00697 
kernel.packet[0] = pload<Packet16h>(out[0]); 00698 kernel.packet[1] = pload<Packet16h>(out[1]); 00699 kernel.packet[2] = pload<Packet16h>(out[2]); 00700 kernel.packet[3] = pload<Packet16h>(out[3]); 00701 } 00702 00703 00704 #elif defined EIGEN_VECTORIZE_AVX 00705 00706 typedef struct { 00707 __m128i x; 00708 } Packet8h; 00709 00710 00711 template<> struct is_arithmetic<Packet8h> { enum { value = true }; }; 00712 00713 template <> 00714 struct packet_traits<Eigen::half> : default_packet_traits { 00715 typedef Packet8h type; 00716 // There is no half-size packet for Packet8h. 00717 typedef Packet8h half; 00718 enum { 00719 Vectorizable = 1, 00720 AlignedOnScalar = 1, 00721 size = 8, 00722 HasHalfPacket = 0, 00723 HasAdd = 0, 00724 HasSub = 0, 00725 HasMul = 0, 00726 HasNegate = 0, 00727 HasAbs = 0, 00728 HasAbs2 = 0, 00729 HasMin = 0, 00730 HasMax = 0, 00731 HasConj = 0, 00732 HasSetLinear = 0, 00733 HasDiv = 0, 00734 HasSqrt = 0, 00735 HasRsqrt = 0, 00736 HasExp = 0, 00737 HasLog = 0, 00738 HasBlend = 0 00739 }; 00740 }; 00741 00742 00743 template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; 00744 00745 template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { 00746 Packet8h result; 00747 result.x = _mm_set1_epi16(from.x); 00748 return result; 00749 } 00750 00751 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) { 00752 return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0))); 00753 } 00754 00755 template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) { 00756 Packet8h result; 00757 result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from)); 00758 return result; 00759 } 00760 00761 template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) { 00762 Packet8h result; 00763 result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); 
00764 return result; 00765 } 00766 00767 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) { 00768 _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); 00769 } 00770 00771 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) { 00772 _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); 00773 } 00774 00775 template<> EIGEN_STRONG_INLINE Packet8h 00776 ploadquad<Packet8h>(const Eigen::half* from) { 00777 Packet8h result; 00778 unsigned short a = from[0].x; 00779 unsigned short b = from[1].x; 00780 result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); 00781 return result; 00782 } 00783 00784 EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { 00785 #ifdef EIGEN_HAS_FP16_C 00786 return _mm256_cvtph_ps(a.x); 00787 #else 00788 EIGEN_ALIGN32 Eigen::half aux[8]; 00789 pstore(aux, a); 00790 float f0(aux[0]); 00791 float f1(aux[1]); 00792 float f2(aux[2]); 00793 float f3(aux[3]); 00794 float f4(aux[4]); 00795 float f5(aux[5]); 00796 float f6(aux[6]); 00797 float f7(aux[7]); 00798 00799 return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); 00800 #endif 00801 } 00802 00803 EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { 00804 #ifdef EIGEN_HAS_FP16_C 00805 Packet8h result; 00806 result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); 00807 return result; 00808 #else 00809 EIGEN_ALIGN32 float aux[8]; 00810 pstore(aux, a); 00811 Eigen::half h0(aux[0]); 00812 Eigen::half h1(aux[1]); 00813 Eigen::half h2(aux[2]); 00814 Eigen::half h3(aux[3]); 00815 Eigen::half h4(aux[4]); 00816 Eigen::half h5(aux[5]); 00817 Eigen::half h6(aux[6]); 00818 Eigen::half h7(aux[7]); 00819 00820 Packet8h result; 00821 result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); 00822 return result; 00823 #endif 00824 } 00825 00826 template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } 00827 00828 template<> EIGEN_STRONG_INLINE Packet8h 
padd<Packet8h>(const Packet8h& a, const Packet8h& b) { 00829 Packet8f af = half2float(a); 00830 Packet8f bf = half2float(b); 00831 Packet8f rf = padd(af, bf); 00832 return float2half(rf); 00833 } 00834 00835 template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { 00836 Packet8f af = half2float(a); 00837 Packet8f bf = half2float(b); 00838 Packet8f rf = pmul(af, bf); 00839 return float2half(rf); 00840 } 00841 00842 template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) 00843 { 00844 Packet8h result; 00845 result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); 00846 return result; 00847 } 00848 00849 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) 00850 { 00851 EIGEN_ALIGN32 Eigen::half aux[8]; 00852 pstore(aux, from); 00853 to[stride*0].x = aux[0].x; 00854 to[stride*1].x = aux[1].x; 00855 to[stride*2].x = aux[2].x; 00856 to[stride*3].x = aux[3].x; 00857 to[stride*4].x = aux[4].x; 00858 to[stride*5].x = aux[5].x; 00859 to[stride*6].x = aux[6].x; 00860 to[stride*7].x = aux[7].x; 00861 } 00862 00863 template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) { 00864 Packet8f af = half2float(a); 00865 float reduced = predux<Packet8f>(af); 00866 return Eigen::half(reduced); 00867 } 00868 00869 template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) { 00870 Packet8f af = half2float(a); 00871 float reduced = predux_max<Packet8f>(af); 00872 return Eigen::half(reduced); 00873 } 00874 00875 template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) { 00876 Packet8f af = half2float(a); 00877 float reduced = predux_min<Packet8f>(af); 00878 return Eigen::half(reduced); 00879 } 00880 00881 template<> EIGEN_STRONG_INLINE Eigen::half 
predux_mul<Packet8h>(const Packet8h& a) { 00882 Packet8f af = half2float(a); 00883 float reduced = predux_mul<Packet8f>(af); 00884 return Eigen::half(reduced); 00885 } 00886 00887 EIGEN_STRONG_INLINE void 00888 ptranspose(PacketBlock<Packet8h,8>& kernel) { 00889 __m128i a = kernel.packet[0].x; 00890 __m128i b = kernel.packet[1].x; 00891 __m128i c = kernel.packet[2].x; 00892 __m128i d = kernel.packet[3].x; 00893 __m128i e = kernel.packet[4].x; 00894 __m128i f = kernel.packet[5].x; 00895 __m128i g = kernel.packet[6].x; 00896 __m128i h = kernel.packet[7].x; 00897 00898 __m128i a03b03 = _mm_unpacklo_epi16(a, b); 00899 __m128i c03d03 = _mm_unpacklo_epi16(c, d); 00900 __m128i e03f03 = _mm_unpacklo_epi16(e, f); 00901 __m128i g03h03 = _mm_unpacklo_epi16(g, h); 00902 __m128i a47b47 = _mm_unpackhi_epi16(a, b); 00903 __m128i c47d47 = _mm_unpackhi_epi16(c, d); 00904 __m128i e47f47 = _mm_unpackhi_epi16(e, f); 00905 __m128i g47h47 = _mm_unpackhi_epi16(g, h); 00906 00907 __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); 00908 __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); 00909 __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); 00910 __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); 00911 __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); 00912 __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); 00913 __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); 00914 __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); 00915 00916 __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); 00917 __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); 00918 __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); 00919 __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); 00920 __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); 00921 __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); 00922 __m128i 
a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); 00923 __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); 00924 00925 kernel.packet[0].x = a0b0c0d0e0f0g0h0; 00926 kernel.packet[1].x = a1b1c1d1e1f1g1h1; 00927 kernel.packet[2].x = a2b2c2d2e2f2g2h2; 00928 kernel.packet[3].x = a3b3c3d3e3f3g3h3; 00929 kernel.packet[4].x = a4b4c4d4e4f4g4h4; 00930 kernel.packet[5].x = a5b5c5d5e5f5g5h5; 00931 kernel.packet[6].x = a6b6c6d6e6f6g6h6; 00932 kernel.packet[7].x = a7b7c7d7e7f7g7h7; 00933 } 00934 00935 EIGEN_STRONG_INLINE void 00936 ptranspose(PacketBlock<Packet8h,4>& kernel) { 00937 EIGEN_ALIGN32 Eigen::half in[4][8]; 00938 pstore<Eigen::half>(in[0], kernel.packet[0]); 00939 pstore<Eigen::half>(in[1], kernel.packet[1]); 00940 pstore<Eigen::half>(in[2], kernel.packet[2]); 00941 pstore<Eigen::half>(in[3], kernel.packet[3]); 00942 00943 EIGEN_ALIGN32 Eigen::half out[4][8]; 00944 00945 for (int i = 0; i < 4; ++i) { 00946 for (int j = 0; j < 4; ++j) { 00947 out[i][j] = in[j][2*i]; 00948 } 00949 for (int j = 0; j < 4; ++j) { 00950 out[i][j+4] = in[j][2*i+1]; 00951 } 00952 } 00953 00954 kernel.packet[0] = pload<Packet8h>(out[0]); 00955 kernel.packet[1] = pload<Packet8h>(out[1]); 00956 kernel.packet[2] = pload<Packet8h>(out[2]); 00957 kernel.packet[3] = pload<Packet8h>(out[3]); 00958 } 00959 00960 00961 // Disable the following code since it's broken on too many platforms / compilers. 00962 //#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) 00963 #elif 0 00964 00965 typedef struct { 00966 __m64 x; 00967 } Packet4h; 00968 00969 00970 template<> struct is_arithmetic<Packet4h> { enum { value = true }; }; 00971 00972 template <> 00973 struct packet_traits<Eigen::half> : default_packet_traits { 00974 typedef Packet4h type; 00975 // There is no half-size packet for Packet4h. 
00976 typedef Packet4h half; 00977 enum { 00978 Vectorizable = 1, 00979 AlignedOnScalar = 1, 00980 size = 4, 00981 HasHalfPacket = 0, 00982 HasAdd = 0, 00983 HasSub = 0, 00984 HasMul = 0, 00985 HasNegate = 0, 00986 HasAbs = 0, 00987 HasAbs2 = 0, 00988 HasMin = 0, 00989 HasMax = 0, 00990 HasConj = 0, 00991 HasSetLinear = 0, 00992 HasDiv = 0, 00993 HasSqrt = 0, 00994 HasRsqrt = 0, 00995 HasExp = 0, 00996 HasLog = 0, 00997 HasBlend = 0 00998 }; 00999 }; 01000 01001 01002 template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; 01003 01004 template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) { 01005 Packet4h result; 01006 result.x = _mm_set1_pi16(from.x); 01007 return result; 01008 } 01009 01010 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) { 01011 return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x))); 01012 } 01013 01014 template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } 01015 01016 template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) { 01017 __int64_t a64 = _mm_cvtm64_si64(a.x); 01018 __int64_t b64 = _mm_cvtm64_si64(b.x); 01019 01020 Eigen::half h[4]; 01021 01022 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); 01023 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); 01024 h[0] = ha + hb; 01025 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); 01026 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); 01027 h[1] = ha + hb; 01028 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); 01029 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); 01030 h[2] = ha + hb; 01031 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); 01032 hb = 
half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); 01033 h[3] = ha + hb; 01034 Packet4h result; 01035 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); 01036 return result; 01037 } 01038 01039 template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) { 01040 __int64_t a64 = _mm_cvtm64_si64(a.x); 01041 __int64_t b64 = _mm_cvtm64_si64(b.x); 01042 01043 Eigen::half h[4]; 01044 01045 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); 01046 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); 01047 h[0] = ha * hb; 01048 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); 01049 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); 01050 h[1] = ha * hb; 01051 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); 01052 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); 01053 h[2] = ha * hb; 01054 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); 01055 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); 01056 h[3] = ha * hb; 01057 Packet4h result; 01058 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); 01059 return result; 01060 } 01061 01062 template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) { 01063 Packet4h result; 01064 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); 01065 return result; 01066 } 01067 01068 template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) { 01069 Packet4h result; 01070 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); 01071 return result; 01072 } 01073 01074 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) { 01075 __int64_t r = _mm_cvtm64_si64(from.x); 01076 *(reinterpret_cast<__int64_t*>(to)) = r; 01077 } 01078 01079 template<> 
EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) { 01080 __int64_t r = _mm_cvtm64_si64(from.x); 01081 *(reinterpret_cast<__int64_t*>(to)) = r; 01082 } 01083 01084 template<> EIGEN_STRONG_INLINE Packet4h 01085 ploadquad<Packet4h>(const Eigen::half* from) { 01086 return pset1<Packet4h>(*from); 01087 } 01088 01089 template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride) 01090 { 01091 Packet4h result; 01092 result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); 01093 return result; 01094 } 01095 01096 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride) 01097 { 01098 __int64_t a = _mm_cvtm64_si64(from.x); 01099 to[stride*0].x = static_cast<unsigned short>(a); 01100 to[stride*1].x = static_cast<unsigned short>(a >> 16); 01101 to[stride*2].x = static_cast<unsigned short>(a >> 32); 01102 to[stride*3].x = static_cast<unsigned short>(a >> 48); 01103 } 01104 01105 EIGEN_STRONG_INLINE void 01106 ptranspose(PacketBlock<Packet4h,4>& kernel) { 01107 __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); 01108 __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); 01109 __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); 01110 __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); 01111 01112 kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); 01113 kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); 01114 kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); 01115 kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); 01116 } 01117 01118 #endif 01119 01120 } 01121 } 01122 01123 #endif // EIGEN_PACKET_MATH_HALF_CUDA_H