libflame
FLA_Apply_G_mx2_asm.h
/*

    Copyright (C) 2014, The University of Texas at Austin

    This file is part of libflame and is available under the 3-Clause
    BSD license, which can be found in the LICENSE file at the top-level
    directory, or at http://opensource.org/licenses/BSD-3-Clause

*/

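/*
   Descriptive note (inferred from the arithmetic below): the
   MAC_Apply_G_mx2_as? macros apply a single Givens rotation, given by
   gamma12 (cosine) and sigma12 (sine), to the two columns a1 and a2 of an
   m_A x 2 matrix:

       a1 := gamma12 * a1 + sigma12 * a2
       a2 := gamma12 * a2 - sigma12 * a1

   When vector intrinsics are disabled they simply alias the scalar ("op")
   variants; when SSE intrinsics are enabled they use hand-unrolled SSE code,
   which effectively assumes unit stride and 16-byte-aligned column buffers,
   as required by the aligned _mm_load_ps/_mm_load_pd loads and stores.
*/
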
#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx2_ass MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz MAC_Apply_G_mx2_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

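/* Single-precision real case: four floats per SSE register. The main loop
   is unrolled 8x (32 rows per iteration); a 4-row loop and a scalar loop
   handle the remaining rows. */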
#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int              n_iter32  = m_A / ( 4 * 8 ); \
    int              n_left32  = m_A % ( 4 * 8 ); \
    int              n_iter4   = n_left32 / ( 4 * 1 ); \
    int              n_left    = n_left32 % ( 4 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 4; \
    const int        step_a2 = inc_a2 * 4; \
\
    float*  restrict alpha1 = a1; \
    float*  restrict alpha2 = a2; \
\
    v4sf_t           a1v, a2v; \
    v4sf_t           g12v, s12v; \
    v4sf_t           t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float temp1; \
        float temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += 1; \
        alpha2 += 1; \
    } \
}

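/* Double-precision real case: two doubles per SSE register. The main loop
   is unrolled 8x (16 rows per iteration); a 2-row loop and a scalar branch
   handle the remaining rows. */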
#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int              n_iter16  = m_A / ( 2 * 8 ); \
    int              n_left16  = m_A % ( 2 * 8 ); \
    int              n_iter2   = n_left16 / ( 2 * 1 ); \
    int              n_left    = n_left16 % ( 2 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 2; \
    const int        step_a2 = inc_a2 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
\
    v2df_t           a1v, a2v; \
    v2df_t           g12v, s12v; \
    v2df_t           t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double temp1; \
        double temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}

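/* Single-precision complex case: two scomplex elements per SSE register,
   rotated via their real and imaginary parts. The main loop is unrolled 8x
   (16 rows per iteration); a 2-row loop and a scalar branch handle the
   remaining rows. */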
#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int                n_iter16  = m_A / ( 2 * 8 ); \
    int                n_left16  = m_A % ( 2 * 8 ); \
    int                n_iter2   = n_left16 / ( 2 * 1 ); \
    int                n_left    = n_left16 % ( 2 * 1 ); \
    int                i; \
\
    const int          step_a1 = inc_a1 * 2; \
    const int          step_a2 = inc_a2 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
\
    v4sf_t             a1v, a2v; \
    v4sf_t             g12v, s12v; \
    v4sf_t             t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float    ga12 = *gamma12; \
        float    si12 = *sigma12; \
        scomplex temp1; \
        scomplex temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}

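/* Double-precision complex case: one dcomplex element per SSE register.
   The main loop is unrolled 8x (8 rows per iteration); the remaining rows
   are handled one register at a time. */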
#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int                n_iter  = m_A / 8; \
    int                n_left  = m_A % 8; \
    int                i; \
\
    const int          step_a1 = inc_a1 * 1; \
    const int          step_a2 = inc_a2 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
\
    v2df_t             a1v, a2v; \
    v2df_t             g12v, s12v; \
    v2df_t             t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
}

#endif
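
/*
   Illustrative usage sketch (not part of the library): applying one rotation
   to the two columns of an m x 2 double-precision matrix stored with unit
   row stride. The column buffers are assumed to be 16-byte aligned, since
   the SSE variants use aligned loads and stores.

       double  gamma, sigma;     // cosine and sine of the rotation
       double* a1;               // first column,  m elements
       double* a2;               // second column, m elements

       MAC_Apply_G_mx2_asd( m, &gamma, &sigma,
                            a1, 1,
                            a2, 1 );
*/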