FLA_Apply_G_mx3_asm.h
00001 /*
00002 
00003     Copyright (C) 2014, The University of Texas at Austin
00004 
00005     This file is part of libflame and is available under the 3-Clause
00006     BSD license, which can be found in the LICENSE file at the top-level
00007     directory, or at http://opensource.org/licenses/BSD-3-Clause
00008 
00009 */
00010 
00011 
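/* MAC_Apply_G_mx3_as? applies two real Givens rotations, ( gamma12, sigma12 )
   and ( gamma23, sigma23 ), across three length-m_A vectors a1, a2, and a3:
   each row of ( a1 a2 ) is rotated by the first pair, then each row of
   ( a2 a3 ) by the second, exactly as the scalar cleanup loops further down
   compute element by element. */
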
00012 #if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
00013 
00014 #define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
00015 #define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
00016 #define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
00017 #define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz
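
/* Without vector intrinsics, the "as" macros simply alias the reference "op"
   implementations.  The trailing letter follows the usual BLAS-style datatype
   convention: s = float, d = double, c = scomplex, z = dcomplex. */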
00018 
00019 #elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
00020 
00021 #define MAC_Apply_G_mx3_ass( m_A, \
00022                              gamma12, \
00023                              sigma12, \
00024                              gamma23, \
00025                              sigma23, \
00026                              a1, inc_a1, \
00027                              a2, inc_a2, \
00028                              a3, inc_a3 ) \
00029 {\
00030     int              n_iter32 = m_A / ( 4 * 8 ); \
00031     int              n_left32 = m_A % ( 4 * 8 ); \
00032     int              n_iter4  = n_left32 / ( 4 * 1 ); \
00033     int              n_left   = n_left32 % ( 4 * 1 ); \
00034     int              i; \
00035 \
00036     const int        step_a1 = inc_a1 * 4; \
00037     const int        step_a2 = inc_a2 * 4; \
00038     const int        step_a3 = inc_a3 * 4; \
00039 \
00040     float* restrict alpha1 = a1; \
00041     float* restrict alpha2 = a2; \
00042     float* restrict alpha3 = a3; \
00043 \
00044     v4sf_t    a1v, a2v, a3v; \
00045     v4sf_t    g12v, s12v; \
00046     v4sf_t    g23v, s23v; \
00047     v4sf_t    t1v, t2v; \
00048 \
00049     g12v.v = _mm_load1_ps( gamma12 ); \
00050     s12v.v = _mm_load1_ps( sigma12 ); \
00051     g23v.v = _mm_load1_ps( gamma23 ); \
00052     s23v.v = _mm_load1_ps( sigma23 ); \
00053 \
00054     for ( i = 0; i < n_iter32; ++i ) \
00055     { \
00056 \
00057         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00058         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00059 \
00060         t1v.v = a1v.v; \
00061         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00062         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00063 \
00064         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00065         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00066         alpha1 += step_a1; \
00067 \
00068         t2v.v = a2v.v; \
00069         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00070         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00071 \
00072         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00073         alpha2 += step_a2; \
00074         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00075         alpha3 += step_a3; \
00076 \
00077         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00078         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00079 \
00080         t1v.v = a1v.v; \
00081         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00082         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00083 \
00084         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00085         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00086         alpha1 += step_a1; \
00087 \
00088         t2v.v = a2v.v; \
00089         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00090         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00091 \
00092         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00093         alpha2 += step_a2; \
00094         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00095         alpha3 += step_a3; \
00096 \
00097         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00098         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00099 \
00100         t1v.v = a1v.v; \
00101         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00102         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00103 \
00104         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00105         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00106         alpha1 += step_a1; \
00107 \
00108         t2v.v = a2v.v; \
00109         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00110         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00111 \
00112         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00113         alpha2 += step_a2; \
00114         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00115         alpha3 += step_a3; \
00116 \
00117         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00118         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00119 \
00120         t1v.v = a1v.v; \
00121         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00122         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00123 \
00124         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00125         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00126         alpha1 += step_a1; \
00127 \
00128         t2v.v = a2v.v; \
00129         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00130         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00131 \
00132         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00133         alpha2 += step_a2; \
00134         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00135         alpha3 += step_a3; \
00136 \
00137         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00138         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00139 \
00140         t1v.v = a1v.v; \
00141         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00142         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00143 \
00144         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00145         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00146         alpha1 += step_a1; \
00147 \
00148         t2v.v = a2v.v; \
00149         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00150         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00151 \
00152         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00153         alpha2 += step_a2; \
00154         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00155         alpha3 += step_a3; \
00156 \
00157         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00158         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00159 \
00160         t1v.v = a1v.v; \
00161         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00162         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00163 \
00164         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00165         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00166         alpha1 += step_a1; \
00167 \
00168         t2v.v = a2v.v; \
00169         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00170         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00171 \
00172         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00173         alpha2 += step_a2; \
00174         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00175         alpha3 += step_a3; \
00176 \
00177         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00178         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00179 \
00180         t1v.v = a1v.v; \
00181         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00182         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00183 \
00184         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00185         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00186         alpha1 += step_a1; \
00187 \
00188         t2v.v = a2v.v; \
00189         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00190         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00191 \
00192         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00193         alpha2 += step_a2; \
00194         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00195         alpha3 += step_a3; \
00196 \
00197         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00198         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00199 \
00200         t1v.v = a1v.v; \
00201         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00202         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00203 \
00204         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00205         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00206         alpha1 += step_a1; \
00207 \
00208         t2v.v = a2v.v; \
00209         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00210         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00211 \
00212         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00213         alpha2 += step_a2; \
00214         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00215         alpha3 += step_a3; \
00216     } \
00217 \
00218     for ( i = 0; i < n_iter4; ++i ) \
00219     { \
00220 \
00221         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00222         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00223 \
00224         t1v.v = a1v.v; \
00225         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00226         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00227 \
00228         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00229         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00230         alpha1 += step_a1; \
00231 \
00232         t2v.v = a2v.v; \
00233         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00234         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00235 \
00236         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00237         alpha2 += step_a2; \
00238         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00239         alpha3 += step_a3; \
00240     } \
00241 \
00242     for ( i = 0; i < n_left; ++i ) \
00243     { \
00244         float ga12 = *gamma12; \
00245         float si12 = *sigma12; \
00246         float ga23 = *gamma23; \
00247         float si23 = *sigma23; \
00248         float temp1; \
00249         float temp2; \
00250         float temp3; \
00251 \
00252         temp1 = *alpha1; \
00253         temp2 = *alpha2; \
00254 \
00255         *alpha1 = temp1 * ga12 + temp2 * si12; \
00256         *alpha2 = temp2 * ga12 - temp1 * si12; \
00257 \
00258         temp2 = *alpha2; \
00259         temp3 = *alpha3; \
00260 \
00261         *alpha2 = temp2 * ga23 + temp3 * si23; \
00262         *alpha3 = temp3 * ga23 - temp2 * si23; \
00263 \
00264         alpha1 += 1; \
00265         alpha2 += 1; \
00266         alpha3 += 1; \
00267     } \
00268 }
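
/* Each unrolled block in MAC_Apply_G_mx3_ass performs, on four rows at a
   time, the same update that the scalar cleanup loop above spells out.  A
   minimal scalar sketch of one row update (illustrative only; the macro
   itself operates on v4sf_t vectors): */

static void apply_g_row3_s( float gamma12, float sigma12,
                            float gamma23, float sigma23,
                            float* alpha1, float* alpha2, float* alpha3 )
{
    float temp1 = *alpha1;
    float temp2 = *alpha2;
    float temp3;

    /* Rotate ( alpha1, alpha2 ) by ( gamma12, sigma12 ). */
    *alpha1 = temp1 * gamma12 + temp2 * sigma12;
    *alpha2 = temp2 * gamma12 - temp1 * sigma12;

    temp2 = *alpha2;
    temp3 = *alpha3;

    /* Rotate ( alpha2, alpha3 ) by ( gamma23, sigma23 ). */
    *alpha2 = temp2 * gamma23 + temp3 * sigma23;
    *alpha3 = temp3 * gamma23 - temp2 * sigma23;
}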
00269 
00270 #define MAC_Apply_G_mx3_asd( m_A, \
00271                              gamma12, \
00272                              sigma12, \
00273                              gamma23, \
00274                              sigma23, \
00275                              a1, inc_a1, \
00276                              a2, inc_a2, \
00277                              a3, inc_a3 ) \
00278 {\
00279     int              n_iter16 = m_A / ( 2 * 8 ); \
00280     int              n_left16 = m_A % ( 2 * 8 ); \
00281     int              n_iter2  = n_left16 / ( 2 * 1 ); \
00282     int              n_left   = n_left16 % ( 2 * 1 ); \
00283     int              i; \
00284 \
00285     const int        step_a1 = inc_a1 * 2; \
00286     const int        step_a2 = inc_a2 * 2; \
00287     const int        step_a3 = inc_a3 * 2; \
00288 \
00289     double* restrict alpha1 = a1; \
00290     double* restrict alpha2 = a2; \
00291     double* restrict alpha3 = a3; \
00292 \
00293     v2df_t           a1v, a2v, a3v; \
00294     v2df_t           g12v, s12v; \
00295     v2df_t           g23v, s23v; \
00296     v2df_t           t1v, t2v; \
00297 \
00298     g12v.v = _mm_loaddup_pd( gamma12 ); \
00299     s12v.v = _mm_loaddup_pd( sigma12 ); \
00300     g23v.v = _mm_loaddup_pd( gamma23 ); \
00301     s23v.v = _mm_loaddup_pd( sigma23 ); \
00302 \
00303     for ( i = 0; i < n_iter16; ++i ) \
00304     { \
00305 \
00306         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00307         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00308 \
00309         t1v.v = a1v.v; \
00310         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00311         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00312 \
00313         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00314         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00315         alpha1 += step_a1; \
00316 \
00317         t2v.v = a2v.v; \
00318         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00319         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00320 \
00321         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00322         alpha2 += step_a2; \
00323         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00324         alpha3 += step_a3; \
00325 \
00326         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00327         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00328 \
00329         t1v.v = a1v.v; \
00330         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00331         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00332 \
00333         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00334         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00335         alpha1 += step_a1; \
00336 \
00337         t2v.v = a2v.v; \
00338         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00339         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00340 \
00341         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00342         alpha2 += step_a2; \
00343         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00344         alpha3 += step_a3; \
00345 \
00346         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00347         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00348 \
00349         t1v.v = a1v.v; \
00350         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00351         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00352 \
00353         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00354         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00355         alpha1 += step_a1; \
00356 \
00357         t2v.v = a2v.v; \
00358         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00359         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00360 \
00361         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00362         alpha2 += step_a2; \
00363         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00364         alpha3 += step_a3; \
00365 \
00366         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00367         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00368 \
00369         t1v.v = a1v.v; \
00370         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00371         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00372 \
00373         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00374         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00375         alpha1 += step_a1; \
00376 \
00377         t2v.v = a2v.v; \
00378         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00379         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00380 \
00381         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00382         alpha2 += step_a2; \
00383         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00384         alpha3 += step_a3; \
00385 \
00386         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00387         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00388 \
00389         t1v.v = a1v.v; \
00390         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00391         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00392 \
00393         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00394         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00395         alpha1 += step_a1; \
00396 \
00397         t2v.v = a2v.v; \
00398         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00399         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00400 \
00401         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00402         alpha2 += step_a2; \
00403         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00404         alpha3 += step_a3; \
00405 \
00406         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00407         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00408 \
00409         t1v.v = a1v.v; \
00410         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00411         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00412 \
00413         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00414         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00415         alpha1 += step_a1; \
00416 \
00417         t2v.v = a2v.v; \
00418         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00419         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00420 \
00421         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00422         alpha2 += step_a2; \
00423         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00424         alpha3 += step_a3; \
00425 \
00426         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00427         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00428 \
00429         t1v.v = a1v.v; \
00430         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00431         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00432 \
00433         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00434         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00435         alpha1 += step_a1; \
00436 \
00437         t2v.v = a2v.v; \
00438         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00439         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00440 \
00441         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00442         alpha2 += step_a2; \
00443         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00444         alpha3 += step_a3; \
00445 \
00446         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00447         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00448 \
00449         t1v.v = a1v.v; \
00450         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00451         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00452 \
00453         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00454         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00455         alpha1 += step_a1; \
00456 \
00457         t2v.v = a2v.v; \
00458         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00459         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00460 \
00461         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00462         alpha2 += step_a2; \
00463         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00464         alpha3 += step_a3; \
00465     } \
00466 \
00467     for ( i = 0; i < n_iter2; ++i ) \
00468     { \
00469 \
00470         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00471         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00472 \
00473         t1v.v = a1v.v; \
00474         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00475         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00476 \
00477         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00478         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00479         alpha1 += step_a1; \
00480 \
00481         t2v.v = a2v.v; \
00482         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00483         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00484 \
00485         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00486         alpha2 += step_a2; \
00487         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00488         alpha3 += step_a3; \
00489     } \
00490 \
00491     if ( n_left == 1 ) \
00492     { \
00493         double ga12 = *gamma12; \
00494         double si12 = *sigma12; \
00495         double ga23 = *gamma23; \
00496         double si23 = *sigma23; \
00497         double temp1; \
00498         double temp2; \
00499         double temp3; \
00500 \
00501         temp1 = *alpha1; \
00502         temp2 = *alpha2; \
00503 \
00504         *alpha1 = temp1 * ga12 + temp2 * si12; \
00505         *alpha2 = temp2 * ga12 - temp1 * si12; \
00506 \
00507         temp2 = *alpha2; \
00508         temp3 = *alpha3; \
00509 \
00510         *alpha2 = temp2 * ga23 + temp3 * si23; \
00511         *alpha3 = temp3 * ga23 - temp2 * si23; \
00512     } \
00513 }
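
/* The double-precision variant packs two rows per XMM register and broadcasts
   each rotation scalar with _mm_loaddup_pd, an SSE3 intrinsic declared in
   pmmintrin.h, whereas the float variant uses the SSE _mm_load1_ps.  The
   aligned _mm_load_ps/_pd and _mm_store_ps/_pd calls require 16-byte-aligned,
   contiguous column data.  The v4sf_t and v2df_t unions are defined elsewhere
   in libflame; only their .v member is used here, so the following is just a
   sketch of the shape this file assumes:

       typedef union { __m128  v; float  f[4]; } v4sf_t;
       typedef union { __m128d v; double d[2]; } v2df_t;
*/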
00514 
00515 #define MAC_Apply_G_mx3_asc( m_A, \
00516                              gamma12, \
00517                              sigma12, \
00518                              gamma23, \
00519                              sigma23, \
00520                              a1, inc_a1, \
00521                              a2, inc_a2, \
00522                              a3, inc_a3 ) \
00523 { \
00524     int                n_iter16 = m_A / ( 2 * 8 ); \
00525     int                n_left16 = m_A % ( 2 * 8 ); \
00526     int                n_iter2  = n_left16 / ( 2 * 1 ); \
00527     int                n_left   = n_left16 % ( 2 * 1 ); \
00528     int                i; \
00529 \
00530     const int          step_a1 = inc_a1 * 2; \
00531     const int          step_a2 = inc_a2 * 2; \
00532     const int          step_a3 = inc_a3 * 2; \
00533 \
00534     scomplex* restrict alpha1 = a1; \
00535     scomplex* restrict alpha2 = a2; \
00536     scomplex* restrict alpha3 = a3; \
00537 \
00538     v4sf_t             a1v, a2v, a3v; \
00539     v4sf_t             g12v, s12v; \
00540     v4sf_t             g23v, s23v; \
00541     v4sf_t             t1v, t2v; \
00542 \
00543     g12v.v = _mm_load1_ps( gamma12 ); \
00544     s12v.v = _mm_load1_ps( sigma12 ); \
00545     g23v.v = _mm_load1_ps( gamma23 ); \
00546     s23v.v = _mm_load1_ps( sigma23 ); \
00547 \
00548     for ( i = 0; i < n_iter16; ++i ) \
00549     { \
00550 \
00551         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00552         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00553 \
00554         t1v.v = a1v.v; \
00555         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00556         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00557 \
00558         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00559         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00560         alpha1 += step_a1; \
00561 \
00562         t2v.v = a2v.v; \
00563         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00564         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00565 \
00566         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00567         alpha2 += step_a2; \
00568         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00569         alpha3 += step_a3; \
00570 \
00571         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00572         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00573 \
00574         t1v.v = a1v.v; \
00575         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00576         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00577 \
00578         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00579         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00580         alpha1 += step_a1; \
00581 \
00582         t2v.v = a2v.v; \
00583         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00584         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00585 \
00586         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00587         alpha2 += step_a2; \
00588         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00589         alpha3 += step_a3; \
00590 \
00591         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00592         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00593 \
00594         t1v.v = a1v.v; \
00595         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00596         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00597 \
00598         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00599         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00600         alpha1 += step_a1; \
00601 \
00602         t2v.v = a2v.v; \
00603         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00604         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00605 \
00606         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00607         alpha2 += step_a2; \
00608         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00609         alpha3 += step_a3; \
00610 \
00611         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00612         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00613 \
00614         t1v.v = a1v.v; \
00615         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00616         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00617 \
00618         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00619         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00620         alpha1 += step_a1; \
00621 \
00622         t2v.v = a2v.v; \
00623         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00624         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00625 \
00626         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00627         alpha2 += step_a2; \
00628         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00629         alpha3 += step_a3; \
00630 \
00631         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00632         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00633 \
00634         t1v.v = a1v.v; \
00635         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00636         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00637 \
00638         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00639         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00640         alpha1 += step_a1; \
00641 \
00642         t2v.v = a2v.v; \
00643         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00644         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00645 \
00646         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00647         alpha2 += step_a2; \
00648         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00649         alpha3 += step_a3; \
00650 \
00651         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00652         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00653 \
00654         t1v.v = a1v.v; \
00655         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00656         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00657 \
00658         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00659         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00660         alpha1 += step_a1; \
00661 \
00662         t2v.v = a2v.v; \
00663         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00664         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00665 \
00666         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00667         alpha2 += step_a2; \
00668         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00669         alpha3 += step_a3; \
00670 \
00671         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00672         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00673 \
00674         t1v.v = a1v.v; \
00675         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00676         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00677 \
00678         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00679         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00680         alpha1 += step_a1; \
00681 \
00682         t2v.v = a2v.v; \
00683         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00684         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00685 \
00686         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00687         alpha2 += step_a2; \
00688         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00689         alpha3 += step_a3; \
00690 \
00691         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00692         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00693 \
00694         t1v.v = a1v.v; \
00695         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00696         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00697 \
00698         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00699         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00700         alpha1 += step_a1; \
00701 \
00702         t2v.v = a2v.v; \
00703         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00704         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00705 \
00706         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00707         alpha2 += step_a2; \
00708         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00709         alpha3 += step_a3; \
00710     } \
00711 \
00712     for ( i = 0; i < n_iter2; ++i ) \
00713     { \
00714 \
00715         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00716         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00717 \
00718         t1v.v = a1v.v; \
00719         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00720         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00721 \
00722         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00723         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00724         alpha1 += step_a1; \
00725 \
00726         t2v.v = a2v.v; \
00727         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00728         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00729 \
00730         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00731         alpha2 += step_a2; \
00732         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00733         alpha3 += step_a3; \
00734     } \
00735 \
00736     if ( n_left == 1 ) \
00737     { \
00738         float ga12 = *gamma12; \
00739         float si12 = *sigma12; \
00740         float ga23 = *gamma23; \
00741         float si23 = *sigma23; \
00742         scomplex temp1; \
00743         scomplex temp2; \
00744         scomplex temp3; \
00745 \
00746         temp1 = *alpha1; \
00747         temp2 = *alpha2; \
00748 \
00749         alpha1->real = temp1.real * ga12 + temp2.real * si12; \
00750         alpha2->real = temp2.real * ga12 - temp1.real * si12; \
00751 \
00752         alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
00753         alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
00754 \
00755         temp2 = *alpha2; \
00756         temp3 = *alpha3; \
00757 \
00758         alpha2->real = temp2.real * ga23 + temp3.real * si23; \
00759         alpha3->real = temp3.real * ga23 - temp2.real * si23; \
00760 \
00761         alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
00762         alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
00763     } \
00764 }
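
/* In the scomplex variant each XMM register holds two complex rows packed as
   ( real0, imag0, real1, imag1 ).  Because gamma and sigma are real scalars,
   the packed multiply-adds rotate the real and imaginary parts independently,
   which is exactly what the n_left == 1 tail above does one element at a
   time. */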
00765 
00766 #define MAC_Apply_G_mx3_asz( m_A, \
00767                              gamma12, \
00768                              sigma12, \
00769                              gamma23, \
00770                              sigma23, \
00771                              a1, inc_a1, \
00772                              a2, inc_a2, \
00773                              a3, inc_a3 ) \
00774 {\
00775     int                n_iter = m_A / 8; \
00776     int                n_left = m_A % 8; \
00777     int                i; \
00778 \
00779     const int          step_a1 = inc_a1 * 1; \
00780     const int          step_a2 = inc_a2 * 1; \
00781     const int          step_a3 = inc_a3 * 1; \
00782 \
00783     dcomplex* restrict alpha1 = a1; \
00784     dcomplex* restrict alpha2 = a2; \
00785     dcomplex* restrict alpha3 = a3; \
00786 \
00787     v2df_t             a1v, a2v, a3v; \
00788     v2df_t             g12v, s12v; \
00789     v2df_t             g23v, s23v; \
00790     v2df_t             t1v, t2v; \
00791 \
00792     g12v.v = _mm_loaddup_pd( gamma12 ); \
00793     s12v.v = _mm_loaddup_pd( sigma12 ); \
00794     g23v.v = _mm_loaddup_pd( gamma23 ); \
00795     s23v.v = _mm_loaddup_pd( sigma23 ); \
00796 \
00797     for ( i = 0; i < n_iter; ++i ) \
00798     { \
00799 \
00800         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00801         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00802 \
00803         t1v.v = a1v.v; \
00804         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00805         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00806 \
00807         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00808         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00809         alpha1 += step_a1; \
00810 \
00811         t2v.v = a2v.v; \
00812         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00813         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00814 \
00815         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00816         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00817         alpha2 += step_a2; \
00818         alpha3 += step_a3; \
00819 \
00820         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00821         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00822 \
00823         t1v.v = a1v.v; \
00824         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00825         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00826 \
00827         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00828         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00829         alpha1 += step_a1; \
00830 \
00831         t2v.v = a2v.v; \
00832         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00833         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00834 \
00835         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00836         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00837         alpha2 += step_a2; \
00838         alpha3 += step_a3; \
00839 \
00840         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00841         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00842 \
00843         t1v.v = a1v.v; \
00844         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00845         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00846 \
00847         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00848         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00849         alpha1 += step_a1; \
00850 \
00851         t2v.v = a2v.v; \
00852         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00853         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00854 \
00855         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00856         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00857         alpha2 += step_a2; \
00858         alpha3 += step_a3; \
00859 \
00860         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00861         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00862 \
00863         t1v.v = a1v.v; \
00864         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00865         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00866 \
00867         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00868         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00869         alpha1 += step_a1; \
00870 \
00871         t2v.v = a2v.v; \
00872         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00873         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00874 \
00875         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00876         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00877         alpha2 += step_a2; \
00878         alpha3 += step_a3; \
00879 \
00880         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00881         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00882 \
00883         t1v.v = a1v.v; \
00884         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00885         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00886 \
00887         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00888         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00889         alpha1 += step_a1; \
00890 \
00891         t2v.v = a2v.v; \
00892         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00893         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00894 \
00895         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00896         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00897         alpha2 += step_a2; \
00898         alpha3 += step_a3; \
00899 \
00900         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00901         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00902 \
00903         t1v.v = a1v.v; \
00904         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00905         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00906 \
00907         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00908         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00909         alpha1 += step_a1; \
00910 \
00911         t2v.v = a2v.v; \
00912         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00913         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00914 \
00915         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00916         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00917         alpha2 += step_a2; \
00918         alpha3 += step_a3; \
00919 \
00920         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00921         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00922 \
00923         t1v.v = a1v.v; \
00924         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00925         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00926 \
00927         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00928         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00929         alpha1 += step_a1; \
00930 \
00931         t2v.v = a2v.v; \
00932         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00933         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00934 \
00935         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00936         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00937         alpha2 += step_a2; \
00938         alpha3 += step_a3; \
00939 \
00940         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00941         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00942 \
00943         t1v.v = a1v.v; \
00944         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00945         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00946 \
00947         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00948         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00949         alpha1 += step_a1; \
00950 \
00951         t2v.v = a2v.v; \
00952         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00953         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00954 \
00955         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00956         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00957         alpha2 += step_a2; \
00958         alpha3 += step_a3; \
00959     } \
00960 \
00961     for ( i = 0; i < n_left; ++i ) \
00962     { \
00963         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00964         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00965 \
00966         t1v.v = a1v.v; \
00967         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00968         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00969 \
00970         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00971         alpha1 += step_a1; \
00972         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00973 \
00974         t2v.v = a2v.v; \
00975         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00976         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00977 \
00978         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00979         alpha2 += step_a2; \
00980         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00981         alpha3 += step_a3; \
00982     } \
00983 }
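
/* In the dcomplex variant each XMM register holds exactly one complex row as
   ( real, imag ), so the leftover loop above reuses the vector code path
   rather than falling back to scalar arithmetic. */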
00984 
00985 #endif
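
/* These MAC_ macros are expanded by higher-level FLA_Apply_G routines in
   libflame.  An illustrative sketch of a direct call on three contiguous,
   16-byte-aligned, unit-stride columns of length m (the wrapper name is
   hypothetical, not the library's actual driver):

       void apply_g_mx3_d_example( int m, double* gamma12, double* sigma12,
                                   double* gamma23, double* sigma23,
                                   double* a1, double* a2, double* a3 )
       {
           MAC_Apply_G_mx3_asd( m,
                                gamma12, sigma12,
                                gamma23, sigma23,
                                a1, 1,
                                a2, 1,
                                a3, 1 );
       }
*/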