![]() |
Eigen
3.3.3
|
00001 // This file is part of Eigen, a lightweight C++ template library 00002 // for linear algebra. 00003 // 00004 // Copyright (C) 2001 Intel Corporation 00005 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> 00006 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com> 00007 // 00008 // This Source Code Form is subject to the terms of the Mozilla 00009 // Public License v. 2.0. If a copy of the MPL was not distributed 00010 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 00011 00012 // The SSE code for the 4x4 float and double matrix inverse in this file 00013 // comes from the following Intel's library: 00014 // http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ 00015 // 00016 // Here is the respective copyright and license statement: 00017 // 00018 // Copyright (c) 2001 Intel Corporation. 00019 // 00020 // Permition is granted to use, copy, distribute and prepare derivative works 00021 // of this library for any purpose and without fee, provided, that the above 00022 // copyright notice and this statement appear in all copies. 00023 // Intel makes no representations about the suitability of this software for 00024 // any purpose, and specifically disclaims all warranties. 00025 // See LEGAL.TXT for all the legal information. 00026 00027 #ifndef EIGEN_INVERSE_SSE_H 00028 #define EIGEN_INVERSE_SSE_H 00029 00030 namespace Eigen { 00031 00032 namespace internal { 00033 00034 template<typename MatrixType, typename ResultType> 00035 struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType> 00036 { 00037 enum { 00038 MatrixAlignment = traits<MatrixType>::Alignment, 00039 ResultAlignment = traits<ResultType>::Alignment, 00040 StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) 00041 }; 00042 typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; 00043 00044 static void run(const MatrixType& mat, ResultType& result) 00045 { 00046 ActualMatrixType matrix(mat); 00047 EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 }; 00048 00049 // Load the full matrix into registers 00050 __m128 _L1 = matrix.template packet<MatrixAlignment>( 0); 00051 __m128 _L2 = matrix.template packet<MatrixAlignment>( 4); 00052 __m128 _L3 = matrix.template packet<MatrixAlignment>( 8); 00053 __m128 _L4 = matrix.template packet<MatrixAlignment>(12); 00054 00055 // The inverse is calculated using "Divide and Conquer" technique. The 00056 // original matrix is divide into four 2x2 sub-matrices. Since each 00057 // register holds four matrix element, the smaller matrices are 00058 // represented as a registers. Hence we get a better locality of the 00059 // calculations. 00060 00061 __m128 A, B, C, D; // the four sub-matrices 00062 if(!StorageOrdersMatch) 00063 { 00064 A = _mm_unpacklo_ps(_L1, _L2); 00065 B = _mm_unpacklo_ps(_L3, _L4); 00066 C = _mm_unpackhi_ps(_L1, _L2); 00067 D = _mm_unpackhi_ps(_L3, _L4); 00068 } 00069 else 00070 { 00071 A = _mm_movelh_ps(_L1, _L2); 00072 B = _mm_movehl_ps(_L2, _L1); 00073 C = _mm_movelh_ps(_L3, _L4); 00074 D = _mm_movehl_ps(_L4, _L3); 00075 } 00076 00077 __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices 00078 DC, AB; 00079 __m128 dA, dB, dC, dD; // determinant of the sub-matrices 00080 __m128 det, d, d1, d2; 00081 __m128 rd; // reciprocal of the determinant 00082 00083 // AB = A# * B 00084 AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B); 00085 AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E))); 00086 // DC = D# * C 00087 DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C); 00088 DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E))); 00089 00090 // dA = |A| 00091 dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A); 00092 dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA)); 00093 // dB = |B| 00094 dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B); 00095 dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB)); 00096 00097 // dC = |C| 00098 dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C); 00099 dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC)); 00100 // dD = |D| 00101 dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D); 00102 dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD)); 00103 00104 // d = trace(AB*DC) = trace(A#*B*D#*C) 00105 d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB); 00106 00107 // iD = C*A#*B 00108 iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB)); 00109 iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB))); 00110 // iA = B*D#*C 00111 iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC)); 00112 iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC))); 00113 00114 // d = trace(AB*DC) = trace(A#*B*D#*C) [continue] 00115 d = _mm_add_ps(d, _mm_movehl_ps(d, d)); 00116 d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1)); 00117 d1 = _mm_mul_ss(dA,dD); 00118 d2 = _mm_mul_ss(dB,dC); 00119 00120 // iD = D*|A| - C*A#*B 00121 iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD); 00122 00123 // iA = A*|D| - B*D#*C; 00124 iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA); 00125 00126 // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) 00127 det = _mm_sub_ss(_mm_add_ss(d1,d2),d); 00128 rd = _mm_div_ss(_mm_set_ss(1.0f), det); 00129 00130 // #ifdef ZERO_SINGULAR 00131 // rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd); 00132 // #endif 00133 00134 // iB = D * (A#B)# = D*B#*A 00135 iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33)); 00136 iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66))); 00137 // iC = A * (D#C)# = A*C#*D 00138 iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33)); 00139 iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); 00140 00141 rd = _mm_shuffle_ps(rd,rd,0); 00142 rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP)); 00143 00144 // iB = C*|B| - D*B#*A 00145 iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); 00146 00147 // iC = B*|C| - A*C#*D; 00148 iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC); 00149 00150 // iX = iX / det 00151 iA = _mm_mul_ps(rd,iA); 00152 iB = _mm_mul_ps(rd,iB); 00153 iC = _mm_mul_ps(rd,iC); 00154 iD = _mm_mul_ps(rd,iD); 00155 00156 Index res_stride = result.outerStride(); 00157 float* res = result.data(); 00158 pstoret<float, Packet4f, ResultAlignment>(res+0, _mm_shuffle_ps(iA,iB,0x77)); 00159 pstoret<float, Packet4f, ResultAlignment>(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); 00160 pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); 00161 pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); 00162 } 00163 00164 }; 00165 00166 template<typename MatrixType, typename ResultType> 00167 struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType> 00168 { 00169 enum { 00170 MatrixAlignment = traits<MatrixType>::Alignment, 00171 ResultAlignment = traits<ResultType>::Alignment, 00172 StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) 00173 }; 00174 typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; 00175 00176 static void run(const MatrixType& mat, ResultType& result) 00177 { 00178 ActualMatrixType matrix(mat); 00179 const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); 00180 const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); 00181 00182 // The inverse is calculated using "Divide and Conquer" technique. The 00183 // original matrix is divide into four 2x2 sub-matrices. Since each 00184 // register of the matrix holds two elements, the smaller matrices are 00185 // consisted of two registers. Hence we get a better locality of the 00186 // calculations. 00187 00188 // the four sub-matrices 00189 __m128d A1, A2, B1, B2, C1, C2, D1, D2; 00190 00191 if(StorageOrdersMatch) 00192 { 00193 A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2); 00194 A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6); 00195 C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); 00196 C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); 00197 } 00198 else 00199 { 00200 __m128d tmp; 00201 A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2); 00202 A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6); 00203 tmp = A1; 00204 A1 = _mm_unpacklo_pd(A1,A2); 00205 A2 = _mm_unpackhi_pd(tmp,A2); 00206 tmp = C1; 00207 C1 = _mm_unpacklo_pd(C1,C2); 00208 C2 = _mm_unpackhi_pd(tmp,C2); 00209 00210 B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); 00211 B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); 00212 tmp = B1; 00213 B1 = _mm_unpacklo_pd(B1,B2); 00214 B2 = _mm_unpackhi_pd(tmp,B2); 00215 tmp = D1; 00216 D1 = _mm_unpacklo_pd(D1,D2); 00217 D2 = _mm_unpackhi_pd(tmp,D2); 00218 } 00219 00220 __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices 00221 DC1, DC2, AB1, AB2; 00222 __m128d dA, dB, dC, dD; // determinant of the sub-matrices 00223 __m128d det, d1, d2, rd; 00224 00225 // dA = |A| 00226 dA = _mm_shuffle_pd(A2, A2, 1); 00227 dA = _mm_mul_pd(A1, dA); 00228 dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3)); 00229 // dB = |B| 00230 dB = _mm_shuffle_pd(B2, B2, 1); 00231 dB = _mm_mul_pd(B1, dB); 00232 dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3)); 00233 00234 // AB = A# * B 00235 AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3)); 00236 AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0)); 00237 AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3))); 00238 AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0))); 00239 00240 // dC = |C| 00241 dC = _mm_shuffle_pd(C2, C2, 1); 00242 dC = _mm_mul_pd(C1, dC); 00243 dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3)); 00244 // dD = |D| 00245 dD = _mm_shuffle_pd(D2, D2, 1); 00246 dD = _mm_mul_pd(D1, dD); 00247 dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3)); 00248 00249 // DC = D# * C 00250 DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3)); 00251 DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0)); 00252 DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3))); 00253 DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0))); 00254 00255 // rd = trace(AB*DC) = trace(A#*B*D#*C) 00256 d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0)); 00257 d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3)); 00258 rd = _mm_add_pd(d1, d2); 00259 rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3)); 00260 00261 // iD = C*A#*B 00262 iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0)); 00263 iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0)); 00264 iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3))); 00265 iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3))); 00266 00267 // iA = B*D#*C 00268 iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0)); 00269 iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0)); 00270 iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3))); 00271 iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3))); 00272 00273 // iD = D*|A| - C*A#*B 00274 dA = _mm_shuffle_pd(dA,dA,0); 00275 iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1); 00276 iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2); 00277 00278 // iA = A*|D| - B*D#*C; 00279 dD = _mm_shuffle_pd(dD,dD,0); 00280 iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1); 00281 iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2); 00282 00283 d1 = _mm_mul_sd(dA, dD); 00284 d2 = _mm_mul_sd(dB, dC); 00285 00286 // iB = D * (A#B)# = D*B#*A 00287 iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1)); 00288 iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1)); 00289 iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2))); 00290 iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2))); 00291 00292 // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) 00293 det = _mm_add_sd(d1, d2); 00294 det = _mm_sub_sd(det, rd); 00295 00296 // iC = A * (D#C)# = A*C#*D 00297 iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1)); 00298 iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1)); 00299 iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2))); 00300 iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2))); 00301 00302 rd = _mm_div_sd(_mm_set_sd(1.0), det); 00303 // #ifdef ZERO_SINGULAR 00304 // rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd); 00305 // #endif 00306 rd = _mm_shuffle_pd(rd,rd,0); 00307 00308 // iB = C*|B| - D*B#*A 00309 dB = _mm_shuffle_pd(dB,dB,0); 00310 iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1); 00311 iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2); 00312 00313 d1 = _mm_xor_pd(rd, _Sign_PN); 00314 d2 = _mm_xor_pd(rd, _Sign_NP); 00315 00316 // iC = B*|C| - A*C#*D; 00317 dC = _mm_shuffle_pd(dC,dC,0); 00318 iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); 00319 iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); 00320 00321 Index res_stride = result.outerStride(); 00322 double* res = result.data(); 00323 pstoret<double, Packet2d, ResultAlignment>(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); 00324 pstoret<double, Packet2d, ResultAlignment>(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); 00325 pstoret<double, Packet2d, ResultAlignment>(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); 00326 pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); 00327 pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); 00328 pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); 00329 pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); 00330 pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); 00331 } 00332 }; 00333 00334 } // end namespace internal 00335 00336 } // end namespace Eigen 00337 00338 #endif // EIGEN_INVERSE_SSE_H