GDAL
/******************************************************************************
 * $Id: gdalsse_priv.h 28877 2015-04-08 23:11:36Z rouault $
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* Could possibly be used too on 32bit, but we would need to check at runtime */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double() {}
    XMMReg2Double(double val)  { xmm = _mm_load_sd(&val); }
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        xmm = _mm_load_pd(pval);
    }

    inline void nsLoad2Val(const float* pval)
    {
        __m128 temp1 = _mm_load_ss(pval);
        __m128 temp2 = _mm_load_ss(pval + 1);
        temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0));
        temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0));
        xmm = _mm_cvtps_pd(temp1);
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = _mm_cvtsi32_si128(*(unsigned short*)(ptr));
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srli_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|0|b|0|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = _mm_cvtsi32_si128(*(int*)(ptr));
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline void AddLowAndHigh()
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        xmm = _mm_add_pd(xmm, xmm2);
    }

    inline void Store2Double(double* pval)
    {
        _mm_storeu_pd(pval, xmm);
    }

    inline void Store2DoubleAligned(double* pval)
    {
        _mm_store_pd(pval, xmm);
    }

    inline operator double () const
    {
        double val;
        _mm_store_sd(&val, xmm);
        return val;
    }
};

#else

#warning "Software emulation of SSE2 !"

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() {}
    XMMReg2Double(double val)  { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const float* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low   = ptr[0];
        low.high  = ptr[1];
        high.low  = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline void AddLowAndHigh()
    {
        double add = low + high;
        low = add;
        high = add;
    }

    inline void Store2Double(double* pval)
    {
        pval[0] = low;
        pval[1] = high;
    }

    inline void Store2DoubleAligned(double* pval)
    {
        pval[0] = low;
        pval[1] = high;
    }

    inline operator double () const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) */

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() {}
    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    inline const XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline const XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline const XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline void AddLowAndHigh()
    {
        low = low + high;
        low.AddLowAndHigh();
    }

    inline XMMReg2Double& GetLow()
    {
        return low;
    }
};

#endif /* GDALSSE_PRIV_H_INCLUDED */
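To illustrate how these helpers fit together, here is a minimal usage sketch (not part of the header): it accumulates the dot product of two double arrays four values at a time with XMMReg4Double, then collapses the four partial sums with AddLowAndHigh(). The function name dotProductSketch and the assumption that nValues is a multiple of 4 are hypothetical, introduced only for this example.

/* Hypothetical example, not part of gdalsse_priv.h */
#include "gdalsse_priv.h"

static double dotProductSketch(const double* padfX, const double* padfY,
                               int nValues /* assumed to be a multiple of 4 */)
{
    XMMReg4Double sum = XMMReg4Double::Zero();
    for (int i = 0; i < nValues; i += 4)
    {
        /* Load four doubles from each array and accumulate their products. */
        XMMReg4Double x = XMMReg4Double::Load4Val(padfX + i);
        XMMReg4Double y = XMMReg4Double::Load4Val(padfY + i);
        sum += x * y;
    }
    /* Fold the four lanes into the low register, then read back its low element. */
    sum.AddLowAndHigh();
    return (double) sum.GetLow();
}

The same source compiles against either branch of the header: on x86-64 the operations map to SSE2 intrinsics, while the emulation class produces identical results in plain scalar code.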