FLA_Apply_G_mx4s_asm.h
00001 /*
00002 
00003     Copyright (C) 2014, The University of Texas at Austin
00004 
00005     This file is part of libflame and is available under the 3-Clause
00006     BSD license, which can be found in the LICENSE file at the top-level
00007     directory, or at http://opensource.org/licenses/BSD-3-Clause
00008 
00009 */
00010 
00011 
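/*
    The MAC_Apply_G_mx4s_as? macros apply two overlapping sets of Givens
    rotations (G23 and G34 from iteration k1, then G12 and G23 from
    iteration k2) to four columns a1, a2, a3, a4 of length m_A with element
    increments inc_a1 .. inc_a4.  The trailing s/d/c/z selects float,
    double, scomplex, or dcomplex data.  When vector intrinsics are
    disabled, each macro simply aliases the portable MAC_Apply_G_mx4s_op?
    variant; the SSE path below uses 128-bit loads and stores with
    8x-unrolled, software-pipelined main loops and scalar cleanup for any
    leftover rows.
*/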
00012 #if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
00013 
00014 #define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops
00015 #define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd
00016 #define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc
00017 #define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz
00018 
00019 #elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
00020 
00021 #define MAC_Apply_G_mx4s_ass( m_A, \
00022                               gamma23_k1, \
00023                               sigma23_k1, \
00024                               gamma34_k1, \
00025                               sigma34_k1, \
00026                               gamma12_k2, \
00027                               sigma12_k2, \
00028                               gamma23_k2, \
00029                               sigma23_k2, \
00030                               a1, inc_a1, \
00031                               a2, inc_a2, \
00032                               a3, inc_a3, \
00033                               a4, inc_a4 ) \
00034 {\
00035     int                n_iter32 = m_A / ( 4 * 8 ); /* 8x-unrolled main loop: 4 floats per SSE vector, 32 rows per pass */ \
00036     int                n_left32 = m_A % ( 4 * 8 ); \
00037     int                n_iter4  = n_left32 / ( 4 * 1 ); /* remaining single-vector (4-row) passes */ \
00038     int                n_left   = n_left32 % ( 4 * 1 ); /* remaining scalar rows */ \
00039     int                i; \
00040 \
00041     const int          step_a1 = inc_a1 * 4; \
00042     const int          step_a2 = inc_a2 * 4; \
00043     const int          step_a3 = inc_a3 * 4; \
00044     const int          step_a4 = inc_a4 * 4; \
00045 \
00046     float*    restrict alpha1 = a1; \
00047     float*    restrict alpha2 = a2; \
00048     float*    restrict alpha3 = a3; \
00049     float*    restrict alpha4 = a4; \
00050 \
00051     v4sf_t             a1v, a2v, a3v, a4v; \
00052     v4sf_t             b1v, b2v, b3v, b4v; \
00053     v4sf_t             g23_k1v, s23_k1v; \
00054     v4sf_t             g34_k1v, s34_k1v; \
00055     v4sf_t             g12_k2v, s12_k2v; \
00056     v4sf_t             g23_k2v, s23_k2v; \
00057     v4sf_t             t1v, t2v, t3v; \
00058 \
00059     g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
00060     s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
00061     g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
00062     s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
00063     g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
00064     s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
00065     g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
00066     s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
00067 \
00068     for ( i = 0; i < n_iter32; ++i ) \
00069     { \
00070 \
00071         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00072         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00073         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00074 \
00075         t2v.v = a2v.v; \
00076         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00077         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00078 \
00079         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00080 \
00081         t3v.v = a3v.v; \
00082         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00083         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00084 \
00085         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00086         alpha4 += step_a4; \
00087 \
00088         t1v.v = a1v.v; \
00089         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00090         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00091 \
00092         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00093         alpha1 += step_a1; \
00094         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00095 \
00096         t2v.v = a2v.v; \
00097         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00098         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00099 \
00100         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00101         alpha2 += step_a2; \
00102         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00103 \
00104 /* ----------------------------------------------------------- */ \
00105 \
00106         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00107 \
00108         t2v.v = b2v.v; \
00109         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00110         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00111 \
00112         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00113         alpha3 += step_a3; \
00114         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00115 \
00116         t3v.v = b3v.v; \
00117         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00118         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00119 \
00120         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00121         alpha4 += step_a4; \
00122 \
00123         t1v.v = b1v.v; \
00124         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00125         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00126 \
00127         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00128         alpha1 += step_a1; \
00129         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00130 \
00131         t2v.v = b2v.v; \
00132         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00133         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00134 \
00135         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00136         alpha2 += step_a2; \
00137         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00138 \
00139 /* ----------------------------------------------------------- */ \
00140 \
00141         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00142 \
00143         t2v.v = a2v.v; \
00144         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00145         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00146 \
00147         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00148         alpha3 += step_a3; \
00149         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00150 \
00151         t3v.v = a3v.v; \
00152         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00153         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00154 \
00155         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00156         alpha4 += step_a4; \
00157 \
00158         t1v.v = a1v.v; \
00159         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00160         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00161 \
00162         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00163         alpha1 += step_a1; \
00164         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00165 \
00166         t2v.v = a2v.v; \
00167         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00168         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00169 \
00170         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00171         alpha2 += step_a2; \
00172         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00173 \
00174 /* ----------------------------------------------------------- */ \
00175 \
00176         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00177 \
00178         t2v.v = b2v.v; \
00179         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00180         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00181 \
00182         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00183         alpha3 += step_a3; \
00184         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00185 \
00186         t3v.v = b3v.v; \
00187         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00188         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00189 \
00190         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00191         alpha4 += step_a4; \
00192 \
00193         t1v.v = b1v.v; \
00194         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00195         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00196 \
00197         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00198         alpha1 += step_a1; \
00199         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00200 \
00201         t2v.v = b2v.v; \
00202         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00203         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00204 \
00205         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00206         alpha2 += step_a2; \
00207         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00208 \
00209 \
00210 /* ----------------------------------------------------------- */ \
00211 \
00212         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00213 \
00214         t2v.v = a2v.v; \
00215         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00216         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00217 \
00218         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00219         alpha3 += step_a3; \
00220         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00221 \
00222         t3v.v = a3v.v; \
00223         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00224         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00225 \
00226         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00227         alpha4 += step_a4; \
00228 \
00229         t1v.v = a1v.v; \
00230         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00231         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00232 \
00233         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00234         alpha1 += step_a1; \
00235         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00236 \
00237         t2v.v = a2v.v; \
00238         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00239         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00240 \
00241         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00242         alpha2 += step_a2; \
00243         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00244 \
00245 /* ----------------------------------------------------------- */ \
00246 \
00247         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00248 \
00249         t2v.v = b2v.v; \
00250         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00251         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00252 \
00253         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00254         alpha3 += step_a3; \
00255         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00256 \
00257         t3v.v = b3v.v; \
00258         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00259         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00260 \
00261         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00262         alpha4 += step_a4; \
00263 \
00264         t1v.v = b1v.v; \
00265         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00266         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00267 \
00268         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00269         alpha1 += step_a1; \
00270         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00271 \
00272         t2v.v = b2v.v; \
00273         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00274         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00275 \
00276         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00277         alpha2 += step_a2; \
00278         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00279 \
00280 /* ----------------------------------------------------------- */ \
00281 \
00282         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00283 \
00284         t2v.v = a2v.v; \
00285         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00286         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00287 \
00288         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00289         alpha3 += step_a3; \
00290         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00291 \
00292         t3v.v = a3v.v; \
00293         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00294         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00295 \
00296         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00297         alpha4 += step_a4; \
00298 \
00299         t1v.v = a1v.v; \
00300         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00301         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00302 \
00303         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00304         alpha1 += step_a1; \
00305         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00306 \
00307         t2v.v = a2v.v; \
00308         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00309         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00310 \
00311         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00312         alpha2 += step_a2; \
00313         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00314 \
00315 /* ----------------------------------------------------------- */ \
00316 \
00317         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00318 \
00319         t2v.v = b2v.v; \
00320         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00321         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00322 \
00323         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00324         alpha3 += step_a3; \
00325         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00326 \
00327         t3v.v = b3v.v; \
00328         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00329         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00330 \
00331         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00332         alpha4 += step_a4; \
00333 \
00334         t1v.v = b1v.v; \
00335         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00336         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00337 \
00338         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00339         alpha1 += step_a1; \
00340 \
00341         t2v.v = b2v.v; \
00342         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00343         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00344 \
00345         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00346         alpha2 += step_a2; \
00347 \
00348         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00349         alpha3 += step_a3; \
00350 \
00351 /* ----------------------------------------------------------- */ \
00352     } \
00353 \
00354     for ( i = 0; i < n_iter4; ++i ) \
00355     { \
00356 \
00357         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00358         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00359         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00360 \
00361         t2v.v = a2v.v; \
00362         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00363         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00364 \
00365         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00366 \
00367         t3v.v = a3v.v; \
00368         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00369         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00370 \
00371         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00372         alpha4 += step_a4; \
00373 \
00374         t1v.v = a1v.v; \
00375         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00376         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00377 \
00378         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00379         alpha1 += step_a1; \
00380 \
00381         t2v.v = a2v.v; \
00382         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00383         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00384 \
00385         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00386         alpha2 += step_a2; \
00387         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00388         alpha3 += step_a3; \
00389     } \
00390 \
00391     for ( i = 0; i < n_left; ++i ) \
00392     { \
00393         float              ga23_k1 = *gamma23_k1; \
00394         float              si23_k1 = *sigma23_k1; \
00395         float              ga34_k1 = *gamma34_k1; \
00396         float              si34_k1 = *sigma34_k1; \
00397         float              ga12_k2 = *gamma12_k2; \
00398         float              si12_k2 = *sigma12_k2; \
00399         float              ga23_k2 = *gamma23_k2; \
00400         float              si23_k2 = *sigma23_k2; \
00401         float              temp1; \
00402         float              temp2; \
00403         float              temp3; \
00404         float              temp4; \
00405 \
00406         temp2 = *alpha2; \
00407         temp3 = *alpha3; \
00408 \
00409         *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
00410         *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
00411 \
00412         temp3 = *alpha3; \
00413         temp4 = *alpha4; \
00414 \
00415         *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
00416         *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
00417 \
00418         temp1 = *alpha1; \
00419         temp2 = *alpha2; \
00420 \
00421         *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
00422         *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
00423 \
00424         temp2 = *alpha2; \
00425         temp3 = *alpha3; \
00426 \
00427         *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
00428         *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
00429 \
00430         alpha1 += 1; \
00431         alpha2 += 1; \
00432         alpha3 += 1; \
00433         alpha4 += 1; \
00434     } \
00435 }
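
/*
    Illustrative usage sketch, not taken from libflame itself: the helper
    name and its argument choices below are assumptions made here to show
    how the macro's parameters line up.  The rotation factors are passed by
    address because the macro broadcasts them with _mm_load1_ps, and the
    column data is assumed to be contiguous (unit increments) and 16-byte
    aligned, as required by _mm_load_ps and _mm_store_ps.

    static void apply_two_rotation_sets( int m,
                                         float* g23_k1, float* s23_k1,
                                         float* g34_k1, float* s34_k1,
                                         float* g12_k2, float* s12_k2,
                                         float* g23_k2, float* s23_k2,
                                         float* a1, float* a2,
                                         float* a3, float* a4 )
    {
        MAC_Apply_G_mx4s_ass( m,
                              g23_k1, s23_k1,
                              g34_k1, s34_k1,
                              g12_k2, s12_k2,
                              g23_k2, s23_k2,
                              a1, 1, a2, 1,
                              a3, 1, a4, 1 );
    }
*/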
00436 
00437 #define MAC_Apply_G_mx4s_asd( m_A, \
00438                               gamma23_k1, \
00439                               sigma23_k1, \
00440                               gamma34_k1, \
00441                               sigma34_k1, \
00442                               gamma12_k2, \
00443                               sigma12_k2, \
00444                               gamma23_k2, \
00445                               sigma23_k2, \
00446                               a1, inc_a1, \
00447                               a2, inc_a2, \
00448                               a3, inc_a3, \
00449                               a4, inc_a4 ) \
00450 {\
00451     int                n_iter16 = m_A / ( 2 * 8 ); /* 8x-unrolled main loop: 2 doubles per SSE vector, 16 rows per pass */ \
00452     int                n_left16 = m_A % ( 2 * 8 ); \
00453     int                n_iter2  = n_left16 / ( 2 * 1 ); /* remaining single-vector (2-row) passes */ \
00454     int                n_left   = n_left16 % ( 2 * 1 ); /* remaining scalar row (0 or 1) */ \
00455     int                i; \
00456 \
00457     const int          step_a1 = inc_a1 * 2; \
00458     const int          step_a2 = inc_a2 * 2; \
00459     const int          step_a3 = inc_a3 * 2; \
00460     const int          step_a4 = inc_a4 * 2; \
00461 \
00462     double*   restrict alpha1 = a1; \
00463     double*   restrict alpha2 = a2; \
00464     double*   restrict alpha3 = a3; \
00465     double*   restrict alpha4 = a4; \
00466 \
00467     v2df_t             a1v, a2v, a3v, a4v; \
00468     v2df_t             b1v, b2v, b3v, b4v; \
00469     v2df_t             g23_k1v, s23_k1v; \
00470     v2df_t             g34_k1v, s34_k1v; \
00471     v2df_t             g12_k2v, s12_k2v; \
00472     v2df_t             g23_k2v, s23_k2v; \
00473     v2df_t             t1v, t2v, t3v; \
00474 \
00475     g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
00476     s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
00477     g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
00478     s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
00479     g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
00480     s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
00481     g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
00482     s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
00483 \
00484     for ( i = 0; i < n_iter16; ++i ) \
00485     { \
00486 \
00487         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00488         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00489         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00490 \
00491         t2v.v = a2v.v; \
00492         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00493         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00494 \
00495         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00496 \
00497         t3v.v = a3v.v; \
00498         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00499         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00500 \
00501         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00502         alpha4 += step_a4; \
00503 \
00504         t1v.v = a1v.v; \
00505         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00506         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00507 \
00508         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00509         alpha1 += step_a1; \
00510         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00511 \
00512         t2v.v = a2v.v; \
00513         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00514         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00515 \
00516         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00517         alpha2 += step_a2; \
00518         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00519 \
00520 /* ----------------------------------------------------------- */ \
00521 \
00522         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00523 \
00524         t2v.v = b2v.v; \
00525         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00526         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00527 \
00528         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00529         alpha3 += step_a3; \
00530         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00531 \
00532         t3v.v = b3v.v; \
00533         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00534         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00535 \
00536         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00537         alpha4 += step_a4; \
00538 \
00539         t1v.v = b1v.v; \
00540         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00541         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00542 \
00543         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00544         alpha1 += step_a1; \
00545         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00546 \
00547         t2v.v = b2v.v; \
00548         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00549         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00550 \
00551         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00552         alpha2 += step_a2; \
00553         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00554 \
00555 /* ----------------------------------------------------------- */ \
00556 \
00557         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00558 \
00559         t2v.v = a2v.v; \
00560         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00561         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00562 \
00563         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00564         alpha3 += step_a3; \
00565         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00566 \
00567         t3v.v = a3v.v; \
00568         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00569         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00570 \
00571         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00572         alpha4 += step_a4; \
00573 \
00574         t1v.v = a1v.v; \
00575         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00576         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00577 \
00578         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00579         alpha1 += step_a1; \
00580         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00581 \
00582         t2v.v = a2v.v; \
00583         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00584         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00585 \
00586         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00587         alpha2 += step_a2; \
00588         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00589 \
00590 /* ----------------------------------------------------------- */ \
00591 \
00592         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00593 \
00594         t2v.v = b2v.v; \
00595         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00596         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00597 \
00598         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00599         alpha3 += step_a3; \
00600         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00601 \
00602         t3v.v = b3v.v; \
00603         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00604         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00605 \
00606         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00607         alpha4 += step_a4; \
00608 \
00609         t1v.v = b1v.v; \
00610         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00611         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00612 \
00613         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00614         alpha1 += step_a1; \
00615         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00616 \
00617         t2v.v = b2v.v; \
00618         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00619         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00620 \
00621         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00622         alpha2 += step_a2; \
00623         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00624 \
00625 \
00626 /* ----------------------------------------------------------- */ \
00627 \
00628         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00629 \
00630         t2v.v = a2v.v; \
00631         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00632         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00633 \
00634         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00635         alpha3 += step_a3; \
00636         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00637 \
00638         t3v.v = a3v.v; \
00639         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00640         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00641 \
00642         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00643         alpha4 += step_a4; \
00644 \
00645         t1v.v = a1v.v; \
00646         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00647         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00648 \
00649         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00650         alpha1 += step_a1; \
00651         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00652 \
00653         t2v.v = a2v.v; \
00654         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00655         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00656 \
00657         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00658         alpha2 += step_a2; \
00659         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00660 \
00661 /* ----------------------------------------------------------- */ \
00662 \
00663         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00664 \
00665         t2v.v = b2v.v; \
00666         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00667         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00668 \
00669         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00670         alpha3 += step_a3; \
00671         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00672 \
00673         t3v.v = b3v.v; \
00674         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00675         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00676 \
00677         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00678         alpha4 += step_a4; \
00679 \
00680         t1v.v = b1v.v; \
00681         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00682         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00683 \
00684         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00685         alpha1 += step_a1; \
00686         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00687 \
00688         t2v.v = b2v.v; \
00689         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00690         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00691 \
00692         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00693         alpha2 += step_a2; \
00694         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00695 \
00696 /* ----------------------------------------------------------- */ \
00697 \
00698         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00699 \
00700         t2v.v = a2v.v; \
00701         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00702         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00703 \
00704         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00705         alpha3 += step_a3; \
00706         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00707 \
00708         t3v.v = a3v.v; \
00709         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00710         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00711 \
00712         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00713         alpha4 += step_a4; \
00714 \
00715         t1v.v = a1v.v; \
00716         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00717         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00718 \
00719         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00720         alpha1 += step_a1; \
00721         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
00722 \
00723         t2v.v = a2v.v; \
00724         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00725         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00726 \
00727         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00728         alpha2 += step_a2; \
00729         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
00730 \
00731 /* ----------------------------------------------------------- */ \
00732 \
00733         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
00734 \
00735         t2v.v = b2v.v; \
00736         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00737         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00738 \
00739         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00740         alpha3 += step_a3; \
00741         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
00742 \
00743         t3v.v = b3v.v; \
00744         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00745         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00746 \
00747         _mm_store_pd( ( double* )alpha4, b4v.v ); \
00748         alpha4 += step_a4; \
00749 \
00750         t1v.v = b1v.v; \
00751         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00752         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00753 \
00754         _mm_store_pd( ( double* )alpha1, b1v.v ); \
00755         alpha1 += step_a1; \
00756 \
00757         t2v.v = b2v.v; \
00758         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00759         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00760 \
00761         _mm_store_pd( ( double* )alpha2, b2v.v ); \
00762         alpha2 += step_a2; \
00763 \
00764         _mm_store_pd( ( double* )alpha3, b3v.v ); \
00765         alpha3 += step_a3; \
00766 \
00767 /* ----------------------------------------------------------- */ \
00768     } \
00769 \
00770     for ( i = 0; i < n_iter2; ++i ) \
00771     { \
00772 \
00773         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00774         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00775         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
00776 \
00777         t2v.v = a2v.v; \
00778         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00779         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00780 \
00781         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00782 \
00783         t3v.v = a3v.v; \
00784         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00785         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00786 \
00787         _mm_store_pd( ( double* )alpha4, a4v.v ); \
00788         alpha4 += step_a4; \
00789 \
00790         t1v.v = a1v.v; \
00791         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00792         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00793 \
00794         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00795         alpha1 += step_a1; \
00796 \
00797         t2v.v = a2v.v; \
00798         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00799         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00800 \
00801         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00802         alpha2 += step_a2; \
00803         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00804         alpha3 += step_a3; \
00805     } \
00806 \
00807     if ( n_left == 1 ) \
00808     { \
00809         double             ga23_k1 = *gamma23_k1; \
00810         double             si23_k1 = *sigma23_k1; \
00811         double             ga34_k1 = *gamma34_k1; \
00812         double             si34_k1 = *sigma34_k1; \
00813         double             ga12_k2 = *gamma12_k2; \
00814         double             si12_k2 = *sigma12_k2; \
00815         double             ga23_k2 = *gamma23_k2; \
00816         double             si23_k2 = *sigma23_k2; \
00817         double             temp1; \
00818         double             temp2; \
00819         double             temp3; \
00820         double             temp4; \
00821 \
00822         temp2 = *alpha2; \
00823         temp3 = *alpha3; \
00824 \
00825         *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
00826         *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
00827 \
00828         temp3 = *alpha3; \
00829         temp4 = *alpha4; \
00830 \
00831         *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
00832         *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
00833 \
00834         temp1 = *alpha1; \
00835         temp2 = *alpha2; \
00836 \
00837         *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
00838         *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
00839 \
00840         temp2 = *alpha2; \
00841         temp3 = *alpha3; \
00842 \
00843         *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
00844         *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
00845 \
00846     } \
00847 }
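
/*
    Double-precision variant: each 128-bit vector holds two doubles, so the
    pipelined main loop above processes 16 rows per iteration, the second
    loop handles two rows at a time, and at most one scalar row remains for
    the final n_left == 1 case.
*/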
00848 
00849 #define MAC_Apply_G_mx4s_asc( m_A, \
00850                               gamma23_k1, \
00851                               sigma23_k1, \
00852                               gamma34_k1, \
00853                               sigma34_k1, \
00854                               gamma12_k2, \
00855                               sigma12_k2, \
00856                               gamma23_k2, \
00857                               sigma23_k2, \
00858                               a1, inc_a1, \
00859                               a2, inc_a2, \
00860                               a3, inc_a3, \
00861                               a4, inc_a4 ) \
00862 {\
00863     int                n_iter16 = m_A / ( 2 * 8 ); /* 8x-unrolled main loop: 2 scomplex per SSE vector, 16 rows per pass */ \
00864     int                n_left16 = m_A % ( 2 * 8 ); \
00865     int                n_iter2  = n_left16 / ( 2 * 1 ); /* remaining single-vector (2-row) passes */ \
00866     int                n_left   = n_left16 % ( 2 * 1 ); /* remaining scalar row (0 or 1) */ \
00867     int                i; \
00868 \
00869     const int          step_a1 = inc_a1 * 2; \
00870     const int          step_a2 = inc_a2 * 2; \
00871     const int          step_a3 = inc_a3 * 2; \
00872     const int          step_a4 = inc_a4 * 2; \
00873 \
00874     scomplex* restrict alpha1 = a1; \
00875     scomplex* restrict alpha2 = a2; \
00876     scomplex* restrict alpha3 = a3; \
00877     scomplex* restrict alpha4 = a4; \
00878 \
00879     v4sf_t             a1v, a2v, a3v, a4v; \
00880     v4sf_t             b1v, b2v, b3v, b4v; \
00881     v4sf_t             g23_k1v, s23_k1v; \
00882     v4sf_t             g34_k1v, s34_k1v; \
00883     v4sf_t             g12_k2v, s12_k2v; \
00884     v4sf_t             g23_k2v, s23_k2v; \
00885     v4sf_t             t1v, t2v, t3v; \
00886 \
00887     g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
00888     s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
00889     g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
00890     s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
00891     g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
00892     s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
00893     g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
00894     s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
00895 \
00896     for ( i = 0; i < n_iter16; ++i ) \
00897     { \
00898 \
00899         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00900         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00901         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00902 \
00903         t2v.v = a2v.v; \
00904         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00905         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00906 \
00907         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00908 \
00909         t3v.v = a3v.v; \
00910         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00911         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00912 \
00913         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00914         alpha4 += step_a4; \
00915 \
00916         t1v.v = a1v.v; \
00917         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00918         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00919 \
00920         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00921         alpha1 += step_a1; \
00922         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00923 \
00924         t2v.v = a2v.v; \
00925         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00926         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00927 \
00928         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00929         alpha2 += step_a2; \
00930         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00931 \
00932 /* ----------------------------------------------------------- */ \
00933 \
00934         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
00935 \
00936         t2v.v = b2v.v; \
00937         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
00938         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00939 \
00940         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00941         alpha3 += step_a3; \
00942         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
00943 \
00944         t3v.v = b3v.v; \
00945         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
00946         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00947 \
00948         _mm_store_ps( ( float* )alpha4, b4v.v ); \
00949         alpha4 += step_a4; \
00950 \
00951         t1v.v = b1v.v; \
00952         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
00953         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00954 \
00955         _mm_store_ps( ( float* )alpha1, b1v.v ); \
00956         alpha1 += step_a1; \
00957         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00958 \
00959         t2v.v = b2v.v; \
00960         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
00961         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00962 \
00963         _mm_store_ps( ( float* )alpha2, b2v.v ); \
00964         alpha2 += step_a2; \
00965         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
00966 \
00967 /* ----------------------------------------------------------- */ \
00968 \
00969         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
00970 \
00971         t2v.v = a2v.v; \
00972         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
00973         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
00974 \
00975         _mm_store_ps( ( float* )alpha3, b3v.v ); \
00976         alpha3 += step_a3; \
00977         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00978 \
00979         t3v.v = a3v.v; \
00980         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
00981         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
00982 \
00983         _mm_store_ps( ( float* )alpha4, a4v.v ); \
00984         alpha4 += step_a4; \
00985 \
00986         t1v.v = a1v.v; \
00987         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
00988         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
00989 \
00990         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00991         alpha1 += step_a1; \
00992         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
00993 \
00994         t2v.v = a2v.v; \
00995         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
00996         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
00997 \
00998         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00999         alpha2 += step_a2; \
01000         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01001 \
01002 /* ----------------------------------------------------------- */ \
01003 \
01004         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
01005 \
01006         t2v.v = b2v.v; \
01007         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01008         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01009 \
01010         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01011         alpha3 += step_a3; \
01012         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
01013 \
01014         t3v.v = b3v.v; \
01015         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01016         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01017 \
01018         _mm_store_ps( ( float* )alpha4, b4v.v ); \
01019         alpha4 += step_a4; \
01020 \
01021         t1v.v = b1v.v; \
01022         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01023         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01024 \
01025         _mm_store_ps( ( float* )alpha1, b1v.v ); \
01026         alpha1 += step_a1; \
01027         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01028 \
01029         t2v.v = b2v.v; \
01030         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01031         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01032 \
01033         _mm_store_ps( ( float* )alpha2, b2v.v ); \
01034         alpha2 += step_a2; \
01035         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01036 \
01037 \
01038 /* ----------------------------------------------------------- */ \
01039 \
01040         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
01041 \
01042         t2v.v = a2v.v; \
01043         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01044         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01045 \
01046         _mm_store_ps( ( float* )alpha3, b3v.v ); \
01047         alpha3 += step_a3; \
01048         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
01049 \
01050         t3v.v = a3v.v; \
01051         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01052         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01053 \
01054         _mm_store_ps( ( float* )alpha4, a4v.v ); \
01055         alpha4 += step_a4; \
01056 \
01057         t1v.v = a1v.v; \
01058         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01059         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01060 \
01061         _mm_store_ps( ( float* )alpha1, a1v.v ); \
01062         alpha1 += step_a1; \
01063         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01064 \
01065         t2v.v = a2v.v; \
01066         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01067         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01068 \
01069         _mm_store_ps( ( float* )alpha2, a2v.v ); \
01070         alpha2 += step_a2; \
01071         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01072 \
01073 /* ----------------------------------------------------------- */ \
01074 \
01075         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
01076 \
01077         t2v.v = b2v.v; \
01078         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01079         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01080 \
01081         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01082         alpha3 += step_a3; \
01083         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
01084 \
01085         t3v.v = b3v.v; \
01086         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01087         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01088 \
01089         _mm_store_ps( ( float* )alpha4, b4v.v ); \
01090         alpha4 += step_a4; \
01091 \
01092         t1v.v = b1v.v; \
01093         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01094         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01095 \
01096         _mm_store_ps( ( float* )alpha1, b1v.v ); \
01097         alpha1 += step_a1; \
01098         a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01099 \
01100         t2v.v = b2v.v; \
01101         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01102         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01103 \
01104         _mm_store_ps( ( float* )alpha2, b2v.v ); \
01105         alpha2 += step_a2; \
01106         a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01107 \
01108 /* ----------------------------------------------------------- */ \
01109 \
01110         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
01111 \
01112         t2v.v = a2v.v; \
01113         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01114         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01115 \
01116         _mm_store_ps( ( float* )alpha3, b3v.v ); \
01117         alpha3 += step_a3; \
01118         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
01119 \
01120         t3v.v = a3v.v; \
01121         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01122         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01123 \
01124         _mm_store_ps( ( float* )alpha4, a4v.v ); \
01125         alpha4 += step_a4; \
01126 \
01127         t1v.v = a1v.v; \
01128         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01129         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01130 \
01131         _mm_store_ps( ( float* )alpha1, a1v.v ); \
01132         alpha1 += step_a1; \
01133         b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
01134 \
01135         t2v.v = a2v.v; \
01136         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01137         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01138 \
01139         _mm_store_ps( ( float* )alpha2, a2v.v ); \
01140         alpha2 += step_a2; \
01141         b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
01142 \
01143 /* ----------------------------------------------------------- */ \
01144 \
01145         b4v.v = _mm_load_ps( ( float* )alpha4 ); \
01146 \
01147         t2v.v = b2v.v; \
01148         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01149         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01150 \
01151         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01152         alpha3 += step_a3; \
01153         b1v.v = _mm_load_ps( ( float* )alpha1 ); \
01154 \
01155         t3v.v = b3v.v; \
01156         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01157         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01158 \
01159         _mm_store_ps( ( float* )alpha4, b4v.v ); \
01160         alpha4 += step_a4; \
01161 \
01162         t1v.v = b1v.v; \
01163         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01164         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01165 \
01166         _mm_store_ps( ( float* )alpha1, b1v.v ); \
01167         alpha1 += step_a1; \
01168 \
01169         t2v.v = b2v.v; \
01170         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01171         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01172 \
01173         _mm_store_ps( ( float* )alpha2, b2v.v ); \
01174         alpha2 += step_a2; \
01175 \
01176         _mm_store_ps( ( float* )alpha3, b3v.v ); \
01177         alpha3 += step_a3; \
01178 \
01179 /* ----------------------------------------------------------- */ \
01180     } \
01181 \
01182     for ( i = 0; i < n_iter2; ++i ) \
01183     { \
01184 \
01185         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
01186         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
01187         a4v.v = _mm_load_ps( ( float* )alpha4 ); \
01188 \
01189         t2v.v = a2v.v; \
01190         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01191         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01192 \
01193         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
01194 \
01195         t3v.v = a3v.v; \
01196         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01197         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01198 \
01199         _mm_store_ps( ( float* )alpha4, a4v.v ); \
01200         alpha4 += step_a4; \
01201 \
01202         t1v.v = a1v.v; \
01203         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01204         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01205 \
01206         _mm_store_ps( ( float* )alpha1, a1v.v ); \
01207         alpha1 += step_a1; \
01208 \
01209         t2v.v = a2v.v; \
01210         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01211         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01212 \
01213         _mm_store_ps( ( float* )alpha2, a2v.v ); \
01214         alpha2 += step_a2; \
01215         _mm_store_ps( ( float* )alpha3, a3v.v ); \
01216         alpha3 += step_a3; \
01217     } \
01218 \
01219     if ( n_left == 1 ) \
01220     { \
01221         float             ga23_k1 = *gamma23_k1; \
01222         float             si23_k1 = *sigma23_k1; \
01223         float             ga34_k1 = *gamma34_k1; \
01224         float             si34_k1 = *sigma34_k1; \
01225         float             ga12_k2 = *gamma12_k2; \
01226         float             si12_k2 = *sigma12_k2; \
01227         float             ga23_k2 = *gamma23_k2; \
01228         float             si23_k2 = *sigma23_k2; \
01229         scomplex          temp1; \
01230         scomplex          temp2; \
01231         scomplex          temp3; \
01232         scomplex          temp4; \
01233 \
01234         temp2 = *alpha2; \
01235         temp3 = *alpha3; \
01236 \
01237         alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
01238         alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
01239 \
01240         alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
01241         alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
01242 \
01243         temp3 = *alpha3; \
01244         temp4 = *alpha4; \
01245 \
01246         alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
01247         alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
01248 \
01249         alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
01250         alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
01251 \
01252         temp1 = *alpha1; \
01253         temp2 = *alpha2; \
01254 \
01255         alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
01256         alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
01257 \
01258         alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
01259         alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
01260 \
01261         temp2 = *alpha2; \
01262         temp3 = *alpha3; \
01263 \
01264         alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
01265         alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
01266 \
01267         alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
01268         alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
01269 \
01270     } \
01271 }
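
/*
    Single-precision complex variant: the gamma/sigma factors are real, so
    the same element-wise SSE arithmetic used by the real kernels applies
    each rotation to the real and imaginary parts simultaneously; every
    128-bit vector holds two scomplex values, giving 16 rows per unrolled
    iteration.
*/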
01272 
01273 #define MAC_Apply_G_mx4s_asz( m_A, \
01274                               gamma23_k1, \
01275                               sigma23_k1, \
01276                               gamma34_k1, \
01277                               sigma34_k1, \
01278                               gamma12_k2, \
01279                               sigma12_k2, \
01280                               gamma23_k2, \
01281                               sigma23_k2, \
01282                               a1, inc_a1, \
01283                               a2, inc_a2, \
01284                               a3, inc_a3, \
01285                               a4, inc_a4 ) \
01286 {\
01287     int                n_iter = m_A / 8; /* 8x-unrolled main loop: 1 dcomplex per SSE vector, 8 rows per pass */ \
01288     int                n_left = m_A % 8; /* remaining scalar rows */ \
01289     int                i; \
01290 \
01291     const int          step_a1 = inc_a1 * 1; \
01292     const int          step_a2 = inc_a2 * 1; \
01293     const int          step_a3 = inc_a3 * 1; \
01294     const int          step_a4 = inc_a4 * 1; \
01295 \
01296     dcomplex* restrict alpha1 = a1; \
01297     dcomplex* restrict alpha2 = a2; \
01298     dcomplex* restrict alpha3 = a3; \
01299     dcomplex* restrict alpha4 = a4; \
01300 \
01301     v2df_t             a1v, a2v, a3v, a4v; \
01302     v2df_t             b1v, b2v, b3v, b4v; \
01303     v2df_t             g23_k1v, s23_k1v; \
01304     v2df_t             g34_k1v, s34_k1v; \
01305     v2df_t             g12_k2v, s12_k2v; \
01306     v2df_t             g23_k2v, s23_k2v; \
01307     v2df_t             t1v, t2v, t3v; \
01308 \
01309     g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
01310     s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
01311     g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
01312     s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
01313     g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
01314     s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
01315     g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
01316     s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
01317 \
01318     for ( i = 0; i < n_iter; ++i ) \
01319     { \
01320 \
01321         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
01322         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
01323         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01324 \
01325         t2v.v = a2v.v; \
01326         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01327         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01328 \
01329         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01330 \
01331         t3v.v = a3v.v; \
01332         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01333         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01334 \
01335         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01336         alpha4 += step_a4; \
01337 \
01338         t1v.v = a1v.v; \
01339         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01340         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01341 \
01342         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01343         alpha1 += step_a1; \
01344         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01345 \
01346         t2v.v = a2v.v; \
01347         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01348         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01349 \
01350         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01351         alpha2 += step_a2; \
01352         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01353 \
01354 /* ----------------------------------------------------------- */ \
01355 \
01356         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01357 \
01358         t2v.v = b2v.v; \
01359         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01360         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01361 \
01362         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01363         alpha3 += step_a3; \
01364         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01365 \
01366         t3v.v = b3v.v; \
01367         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01368         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01369 \
01370         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01371         alpha4 += step_a4; \
01372 \
01373         t1v.v = b1v.v; \
01374         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01375         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01376 \
01377         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01378         alpha1 += step_a1; \
01379         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01380 \
01381         t2v.v = b2v.v; \
01382         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01383         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01384 \
01385         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01386         alpha2 += step_a2; \
01387         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01388 \
01389 /* ----------------------------------------------------------- */ \
01390 \
01391         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01392 \
01393         t2v.v = a2v.v; \
01394         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01395         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01396 \
01397         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01398         alpha3 += step_a3; \
01399         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01400 \
01401         t3v.v = a3v.v; \
01402         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01403         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01404 \
01405         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01406         alpha4 += step_a4; \
01407 \
01408         t1v.v = a1v.v; \
01409         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01410         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01411 \
01412         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01413         alpha1 += step_a1; \
01414         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01415 \
01416         t2v.v = a2v.v; \
01417         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01418         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01419 \
01420         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01421         alpha2 += step_a2; \
01422         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01423 \
01424 /* ----------------------------------------------------------- */ \
01425 \
01426         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01427 \
01428         t2v.v = b2v.v; \
01429         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01430         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01431 \
01432         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01433         alpha3 += step_a3; \
01434         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01435 \
01436         t3v.v = b3v.v; \
01437         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01438         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01439 \
01440         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01441         alpha4 += step_a4; \
01442 \
01443         t1v.v = b1v.v; \
01444         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01445         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01446 \
01447         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01448         alpha1 += step_a1; \
01449         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01450 \
01451         t2v.v = b2v.v; \
01452         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01453         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01454 \
01455         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01456         alpha2 += step_a2; \
01457         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01458 \
01459 /* ----------------------------------------------------------- */ \
01460 \
01461         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01462 \
01463         t2v.v = a2v.v; \
01464         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01465         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01466 \
01467         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01468         alpha3 += step_a3; \
01469         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01470 \
01471         t3v.v = a3v.v; \
01472         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01473         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01474 \
01475         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01476         alpha4 += step_a4; \
01477 \
01478         t1v.v = a1v.v; \
01479         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01480         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01481 \
01482         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01483         alpha1 += step_a1; \
01484         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01485 \
01486         t2v.v = a2v.v; \
01487         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01488         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01489 \
01490         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01491         alpha2 += step_a2; \
01492         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01493 \
01494 /* ----------------------------------------------------------- */ \
01495 \
01496         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01497 \
01498         t2v.v = b2v.v; \
01499         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01500         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01501 \
01502         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01503         alpha3 += step_a3; \
01504         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01505 \
01506         t3v.v = b3v.v; \
01507         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01508         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01509 \
01510         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01511         alpha4 += step_a4; \
01512 \
01513         t1v.v = b1v.v; \
01514         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01515         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01516 \
01517         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01518         alpha1 += step_a1; \
01519         a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01520 \
01521         t2v.v = b2v.v; \
01522         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01523         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01524 \
01525         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01526         alpha2 += step_a2; \
01527         a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01528 \
01529 /* ----------------------------------------------------------- */ \
01530 \
01531         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01532 \
01533         t2v.v = a2v.v; \
01534         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01535         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01536 \
01537         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01538         alpha3 += step_a3; \
01539         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01540 \
01541         t3v.v = a3v.v; \
01542         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01543         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01544 \
01545         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01546         alpha4 += step_a4; \
01547 \
01548         t1v.v = a1v.v; \
01549         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01550         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01551 \
01552         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01553         alpha1 += step_a1; \
01554         b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
01555 \
01556         t2v.v = a2v.v; \
01557         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01558         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01559 \
01560         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01561         alpha2 += step_a2; \
01562         b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
01563 \
01564 /* ----------------------------------------------------------- */ \
01565 \
01566         b4v.v = _mm_load_pd( ( double* )alpha4 ); \
01567 \
01568         t2v.v = b2v.v; \
01569         b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
01570         b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01571 \
01572         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01573         alpha3 += step_a3; \
01574         b1v.v = _mm_load_pd( ( double* )alpha1 ); \
01575 \
01576         t3v.v = b3v.v; \
01577         b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
01578         b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01579 \
01580         _mm_store_pd( ( double* )alpha4, b4v.v ); \
01581         alpha4 += step_a4; \
01582 \
01583         t1v.v = b1v.v; \
01584         b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
01585         b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01586 \
01587         _mm_store_pd( ( double* )alpha1, b1v.v ); \
01588         alpha1 += step_a1; \
01589 \
01590         t2v.v = b2v.v; \
01591         b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
01592         b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01593 \
01594         _mm_store_pd( ( double* )alpha2, b2v.v ); \
01595         alpha2 += step_a2; \
01596 \
01597         _mm_store_pd( ( double* )alpha3, b3v.v ); \
01598         alpha3 += step_a3; \
01599 \
01600 /* ----------------------------------------------------------- */ \
01601     } \
01602 \
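        /* Cleanup loop: apply both rotation sets to the rows left over from  */ \
        /* the unrolled loop above, one vector at a time, without pipelining. */ \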
01603     for ( i = 0; i < n_left; ++i ) \
01604     { \
01605 \
01606         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
01607         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
01608         a4v.v = _mm_load_pd( ( double* )alpha4 ); \
01609 \
01610         t2v.v = a2v.v; \
01611         a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
01612         a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
01613 \
01614         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
01615 \
01616         t3v.v = a3v.v; \
01617         a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
01618         a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
01619 \
01620         _mm_store_pd( ( double* )alpha4, a4v.v ); \
01621         alpha4 += step_a4; \
01622 \
01623         t1v.v = a1v.v; \
01624         a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
01625         a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
01626 \
01627         _mm_store_pd( ( double* )alpha1, a1v.v ); \
01628         alpha1 += step_a1; \
01629 \
01630         t2v.v = a2v.v; \
01631         a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
01632         a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
01633 \
01634         _mm_store_pd( ( double* )alpha2, a2v.v ); \
01635         alpha2 += step_a2; \
01636         _mm_store_pd( ( double* )alpha3, a3v.v ); \
01637         alpha3 += step_a3; \
01638     } \
01639 }
01640 
01641 #endif
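
/*
    For reference, the SSE macros above unroll the per-row update into groups
    and software-pipeline the loads and stores across groups to hide memory
    latency.  The sketch below is a minimal, illustrative scalar version of
    that per-row update for real double-precision columns; it is NOT the
    library's reference kernel, and the function name and by-value rotation
    arguments are assumptions made for clarity.  The complex variants apply
    the same real rotations elementwise to the real and imaginary parts.
*/

static void apply_g_mx4s_scalar_sketch( int m_A,
                                        double gamma23_k1, double sigma23_k1,
                                        double gamma34_k1, double sigma34_k1,
                                        double gamma12_k2, double sigma12_k2,
                                        double gamma23_k2, double sigma23_k2,
                                        double* a1, int inc_a1,
                                        double* a2, int inc_a2,
                                        double* a3, int inc_a3,
                                        double* a4, int inc_a4 )
{
    int i;

    for ( i = 0; i < m_A; ++i )
    {
        double x1 = a1[ i * inc_a1 ];
        double x2 = a2[ i * inc_a2 ];
        double x3 = a3[ i * inc_a3 ];
        double x4 = a4[ i * inc_a4 ];
        double t;

        /* First rotation set (k1): rotate columns (2,3), then (3,4). */
        t  = x2;
        x2 = t  * gamma23_k1 + x3 * sigma23_k1;
        x3 = x3 * gamma23_k1 - t  * sigma23_k1;

        t  = x3;
        x3 = t  * gamma34_k1 + x4 * sigma34_k1;
        x4 = x4 * gamma34_k1 - t  * sigma34_k1;

        /* Second rotation set (k2): rotate columns (1,2), then (2,3). */
        t  = x1;
        x1 = t  * gamma12_k2 + x2 * sigma12_k2;
        x2 = x2 * gamma12_k2 - t  * sigma12_k2;

        t  = x2;
        x2 = t  * gamma23_k2 + x3 * sigma23_k2;
        x3 = x3 * gamma23_k2 - t  * sigma23_k2;

        a1[ i * inc_a1 ] = x1;
        a2[ i * inc_a2 ] = x2;
        a3[ i * inc_a3 ] = x3;
        a4[ i * inc_a4 ] = x4;
    }
}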