gdalsse_priv.h
/******************************************************************************
 * $Id: gdalsse_priv.h 28877 2015-04-08 23:11:36Z rouault $
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

/* We restrict this to 64-bit processors because they are guaranteed to have SSE2. */
/* It could also be used on 32-bit processors, but that would require a runtime */
/* check (an illustrative sketch of such a check follows the emulation fallback below). */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

/* Wraps an SSE2 register (__m128d) holding two packed doubles. */
class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double() {}
    /* Loads val into the low double and zeroes the high double. */
    XMMReg2Double(double val) { xmm = _mm_load_sd(&val); }
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        xmm = _mm_load_pd(pval);
    }

    inline void nsLoad2Val(const float* pval)
    {
        __m128 temp1 = _mm_load_ss(pval);
        __m128 temp2 = _mm_load_ss(pval + 1);
        temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0));
        temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0));
        xmm = _mm_cvtps_pd(temp1);
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        /* Use memcpy rather than a direct pointer cast to avoid an unaligned,
           aliasing-violating 2-byte read. */
        unsigned short s;
        memcpy(&s, ptr, 2);
        __m128i xmm_i = _mm_cvtsi32_si128(s);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srli_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|0|b|0|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        /* memcpy avoids an unaligned, aliasing-violating 4-byte read. */
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline void AddLowAndHigh()
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        xmm = _mm_add_pd(xmm, xmm2);
    }

    inline void Store2Double(double* pval)
    {
        _mm_storeu_pd(pval, xmm);
    }

    inline void Store2DoubleAligned(double* pval)
    {
        _mm_store_pd(pval, xmm);
    }

    inline operator double () const
    {
        double val;
        _mm_store_sd(&val, xmm);
        return val;
    }
};

#else

#warning "Software emulation of SSE2!"

/* Software fallback: stores the two doubles directly instead of in an SSE2 register. */
class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() {}
    XMMReg2Double(double val) { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const float* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline void AddLowAndHigh()
    {
        double add = low + high;
        low = add;
        high = add;
    }

    inline void Store2Double(double* pval)
    {
        pval[0] = low;
        pval[1] = high;
    }

    inline void Store2DoubleAligned(double* pval)
    {
        pval[0] = low;
        pval[1] = high;
    }

    inline operator double () const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION) */
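
/* The comment at the top of this file notes that these wrappers could also be
   used on 32-bit x86 if SSE2 support were checked at runtime. The helper below
   is only an illustrative sketch of such a check and is not part of the original
   GDAL header; the name GDALHasSSE2AtRuntime is hypothetical. */
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static inline bool GDALHasSSE2AtRuntime()
{
    /* Modern GCC/Clang expose CPUID feature bits through this builtin. */
    return __builtin_cpu_supports("sse2") != 0;
}
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
static inline bool GDALHasSSE2AtRuntime()
{
    int cpuInfo[4];
    __cpuid(cpuInfo, 1);                  /* CPUID leaf 1 */
    return (cpuInfo[3] & (1 << 26)) != 0; /* EDX bit 26 = SSE2 */
}
#else
static inline bool GDALHasSSE2AtRuntime() { return false; }
#endif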

/* Four packed doubles, implemented as a pair of XMMReg2Double (either the SSE2
   version or the software fallback selected above). */
class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() {}
    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    inline const XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline const XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline const XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline void AddLowAndHigh()
    {
        low = low + high;
        low.AddLowAndHigh();
    }

    inline XMMReg2Double& GetLow()
    {
        return low;
    }
};

#endif /* GDALSSE_PRIV_H_INCLUDED */
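
The classes above are meant to process doubles four at a time in inner loops while falling back to plain C++ when SSE2 is not compiled in. The following standalone sketch is not taken from GDAL's sources (the function name DotProductSketch and the loop structure are illustrative); it shows a typical usage pattern under those assumptions: load four values per operand, multiply-accumulate, collapse the accumulator with AddLowAndHigh(), and read the result through the double conversion operator on GetLow().

#include "gdalsse_priv.h"

/* Illustrative only: dot product of two double arrays using XMMReg4Double. */
static double DotProductSketch(const double* x, const double* y, int n)
{
    XMMReg4Double sum = XMMReg4Double::Zero();
    int i = 0;
    for (; i + 4 <= n; i += 4)
    {
        XMMReg4Double vx = XMMReg4Double::Load4Val(x + i); /* unaligned load of 4 doubles */
        XMMReg4Double vy = XMMReg4Double::Load4Val(y + i);
        sum += vx * vy;                                    /* element-wise multiply-accumulate */
    }
    sum.AddLowAndHigh();                  /* collapse the 4 partial sums into the low element */
    double result = (double)sum.GetLow(); /* conversion operator reads the low double */
    for (; i < n; i++)                    /* scalar tail for the remaining 0-3 elements */
        result += x[i] * y[i];
    return result;
}

Because both XMMReg2Double variants expose the same interface, this code compiles unchanged whether the SSE2 branch or the software-emulation branch of the header is selected.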
