libflame
FLA_Apply_G_mx3b_asm.h
00001 /*
00002 
00003     Copyright (C) 2014, The University of Texas at Austin
00004 
00005     This file is part of libflame and is available under the 3-Clause
00006     BSD license, which can be found in the LICENSE file at the top-level
00007     directory, or at http://opensource.org/licenses/BSD-3-Clause
00008 
00009 */
00010 
00011 
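/*
    This header defines the type-specific macros MAC_Apply_G_mx3b_as?
    (? = s, d, c, z) that apply two real Givens rotations across three
    length-m_A vectors: first the rotation (gamma23, sigma23) to the pair
    (a2, a3), then the rotation (gamma12, sigma12) to (a1, a2), using the
    freshly updated a2.  When vector intrinsics are disabled, the "as"
    names simply alias the scalar "op" variants; when SSE intrinsics are
    enabled, the unrolled SSE implementations below are used instead.
*/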
00012 #if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS
00013 
00014 #define MAC_Apply_G_mx3b_ass MAC_Apply_G_mx3b_ops
00015 #define MAC_Apply_G_mx3b_asd MAC_Apply_G_mx3b_opd
00016 #define MAC_Apply_G_mx3b_asc MAC_Apply_G_mx3b_opc
00017 #define MAC_Apply_G_mx3b_asz MAC_Apply_G_mx3b_opz
00018 
00019 #elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS
00020 
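/*
    Single-precision real kernel.  Each 128-bit register holds four floats,
    so the main loop updates 32 rows per iteration (eight unrolled 4-wide
    passes), a second loop handles the remaining groups of four rows, and a
    final scalar loop handles the last m_A % 4 rows.
*/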
00021 #define MAC_Apply_G_mx3b_ass( m_A, \
00022                               gamma12, \
00023                               sigma12, \
00024                               gamma23, \
00025                               sigma23, \
00026                               a1, inc_a1, \
00027                               a2, inc_a2, \
00028                               a3, inc_a3 ) \
00029 {\
00030     int              n_iter32 = m_A / ( 4 * 8 ); \
00031     int              n_left32 = m_A % ( 4 * 8 ); \
00032     int              n_iter4  = n_left32 / ( 4 * 1 ); \
00033     int              n_left   = n_left32 % ( 4 * 1 ); \
00034     int              i; \
00035 \
00036     const int        step_a1 = inc_a1 * 4; \
00037     const int        step_a2 = inc_a2 * 4; \
00038     const int        step_a3 = inc_a3 * 4; \
00039 \
00040     float*  restrict alpha1 = a1; \
00041     float*  restrict alpha2 = a2; \
00042     float*  restrict alpha3 = a3; \
00043 \
00044     v4sf_t           a1v, a2v, a3v; \
00045     v4sf_t           g12v, s12v; \
00046     v4sf_t           g23v, s23v; \
00047     v4sf_t           t1v, t2v; \
00048 \
00049     g12v.v = _mm_load1_ps( gamma12 ); \
00050     s12v.v = _mm_load1_ps( sigma12 ); \
00051     g23v.v = _mm_load1_ps( gamma23 ); \
00052     s23v.v = _mm_load1_ps( sigma23 ); \
00053 \
00054     for ( i = 0; i < n_iter32; ++i ) \
00055     { \
00056 \
00057         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00058         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00059 \
00060         t2v.v = a2v.v; \
00061         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00062         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00063 \
00064         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00065         alpha3 += step_a3; \
00066         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00067 \
00068         t1v.v = a1v.v; \
00069         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00070         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00071 \
00072         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00073         alpha1 += step_a1; \
00074         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00075         alpha2 += step_a2; \
00076 \
00077         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00078         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00079 \
00080         t2v.v = a2v.v; \
00081         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00082         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00083 \
00084         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00085         alpha3 += step_a3; \
00086         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00087 \
00088         t1v.v = a1v.v; \
00089         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00090         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00091 \
00092         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00093         alpha1 += step_a1; \
00094         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00095         alpha2 += step_a2; \
00096 \
00097         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00098         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00099 \
00100         t2v.v = a2v.v; \
00101         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00102         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00103 \
00104         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00105         alpha3 += step_a3; \
00106         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00107 \
00108         t1v.v = a1v.v; \
00109         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00110         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00111 \
00112         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00113         alpha1 += step_a1; \
00114         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00115         alpha2 += step_a2; \
00116 \
00117         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00118         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00119 \
00120         t2v.v = a2v.v; \
00121         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00122         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00123 \
00124         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00125         alpha3 += step_a3; \
00126         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00127 \
00128         t1v.v = a1v.v; \
00129         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00130         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00131 \
00132         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00133         alpha1 += step_a1; \
00134         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00135         alpha2 += step_a2; \
00136 \
00137         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00138         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00139 \
00140         t2v.v = a2v.v; \
00141         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00142         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00143 \
00144         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00145         alpha3 += step_a3; \
00146         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00147 \
00148         t1v.v = a1v.v; \
00149         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00150         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00151 \
00152         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00153         alpha1 += step_a1; \
00154         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00155         alpha2 += step_a2; \
00156 \
00157         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00158         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00159 \
00160         t2v.v = a2v.v; \
00161         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00162         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00163 \
00164         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00165         alpha3 += step_a3; \
00166         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00167 \
00168         t1v.v = a1v.v; \
00169         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00170         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00171 \
00172         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00173         alpha1 += step_a1; \
00174         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00175         alpha2 += step_a2; \
00176 \
00177         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00178         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00179 \
00180         t2v.v = a2v.v; \
00181         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00182         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00183 \
00184         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00185         alpha3 += step_a3; \
00186         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00187 \
00188         t1v.v = a1v.v; \
00189         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00190         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00191 \
00192         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00193         alpha1 += step_a1; \
00194         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00195         alpha2 += step_a2; \
00196 \
00197         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00198         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00199 \
00200         t2v.v = a2v.v; \
00201         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00202         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00203 \
00204         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00205         alpha3 += step_a3; \
00206         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00207 \
00208         t1v.v = a1v.v; \
00209         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00210         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00211 \
00212         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00213         alpha1 += step_a1; \
00214         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00215         alpha2 += step_a2; \
00216     } \
00217 \
00218     for ( i = 0; i < n_iter4; ++i ) \
00219     { \
00220 \
00221         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00222         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00223 \
00224         t2v.v = a2v.v; \
00225         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00226         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00227 \
00228         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00229         alpha3 += step_a3; \
00230         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00231 \
00232         t1v.v = a1v.v; \
00233         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00234         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00235 \
00236         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00237         alpha1 += step_a1; \
00238         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00239         alpha2 += step_a2; \
00240     } \
00241 \
00242     for ( i = 0; i < n_left; ++i ) \
00243     { \
00244         float ga12 = *gamma12; \
00245         float si12 = *sigma12; \
00246         float ga23 = *gamma23; \
00247         float si23 = *sigma23; \
00248         float temp1; \
00249         float temp2; \
00250         float temp3; \
00251 \
00252         temp2 = *alpha2; \
00253         temp3 = *alpha3; \
00254 \
00255         *alpha2 = temp2 * ga23 + temp3 * si23; \
00256         *alpha3 = temp3 * ga23 - temp2 * si23; \
00257 \
00258         temp1 = *alpha1; \
00259         temp2 = *alpha2; \
00260 \
00261         *alpha1 = temp1 * ga12 + temp2 * si12; \
00262         *alpha2 = temp2 * ga12 - temp1 * si12; \
00263 \
00264         alpha1 += 1; \
00265         alpha2 += 1; \
00266         alpha3 += 1; \
00267     } \
00268 }
00269 
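/*
    Double-precision real kernel.  Each 128-bit register holds two doubles,
    so the main loop updates 16 rows per iteration (eight unrolled 2-wide
    passes), a second loop handles the remaining pairs of rows, and at most
    one leftover row is updated with scalar code.
*/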
00270 #define MAC_Apply_G_mx3b_asd( m_A, \
00271                               gamma12, \
00272                               sigma12, \
00273                               gamma23, \
00274                               sigma23, \
00275                               a1, inc_a1, \
00276                               a2, inc_a2, \
00277                               a3, inc_a3 ) \
00278 {\
00279     int              n_iter16 = m_A / ( 2 * 8 ); \
00280     int              n_left16 = m_A % ( 2 * 8 ); \
00281     int              n_iter2  = n_left16 / ( 2 * 1 ); \
00282     int              n_left   = n_left16 % ( 2 * 1 ); \
00283     int              i; \
00284 \
00285     const int        step_a1 = inc_a1 * 2; \
00286     const int        step_a2 = inc_a2 * 2; \
00287     const int        step_a3 = inc_a3 * 2; \
00288 \
00289     double* restrict alpha1 = a1; \
00290     double* restrict alpha2 = a2; \
00291     double* restrict alpha3 = a3; \
00292 \
00293     v2df_t           a1v, a2v, a3v; \
00294     v2df_t           g12v, s12v; \
00295     v2df_t           g23v, s23v; \
00296     v2df_t           t1v, t2v; \
00297 \
00298     g12v.v = _mm_loaddup_pd( gamma12 ); \
00299     s12v.v = _mm_loaddup_pd( sigma12 ); \
00300     g23v.v = _mm_loaddup_pd( gamma23 ); \
00301     s23v.v = _mm_loaddup_pd( sigma23 ); \
00302 \
00303     for ( i = 0; i < n_iter16; ++i ) \
00304     { \
00305 \
00306         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00307         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00308 \
00309         t2v.v = a2v.v; \
00310         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00311         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00312 \
00313         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00314         alpha3 += step_a3; \
00315         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00316 \
00317         t1v.v = a1v.v; \
00318         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00319         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00320 \
00321         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00322         alpha1 += step_a1; \
00323         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00324         alpha2 += step_a2; \
00325 \
00326         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00327         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00328 \
00329         t2v.v = a2v.v; \
00330         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00331         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00332 \
00333         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00334         alpha3 += step_a3; \
00335         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00336 \
00337         t1v.v = a1v.v; \
00338         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00339         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00340 \
00341         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00342         alpha1 += step_a1; \
00343         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00344         alpha2 += step_a2; \
00345 \
00346         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00347         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00348 \
00349         t2v.v = a2v.v; \
00350         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00351         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00352 \
00353         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00354         alpha3 += step_a3; \
00355         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00356 \
00357         t1v.v = a1v.v; \
00358         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00359         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00360 \
00361         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00362         alpha1 += step_a1; \
00363         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00364         alpha2 += step_a2; \
00365 \
00366         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00367         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00368 \
00369         t2v.v = a2v.v; \
00370         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00371         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00372 \
00373         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00374         alpha3 += step_a3; \
00375         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00376 \
00377         t1v.v = a1v.v; \
00378         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00379         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00380 \
00381         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00382         alpha1 += step_a1; \
00383         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00384         alpha2 += step_a2; \
00385 \
00386         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00387         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00388 \
00389         t2v.v = a2v.v; \
00390         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00391         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00392 \
00393         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00394         alpha3 += step_a3; \
00395         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00396 \
00397         t1v.v = a1v.v; \
00398         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00399         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00400 \
00401         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00402         alpha1 += step_a1; \
00403         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00404         alpha2 += step_a2; \
00405 \
00406         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00407         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00408 \
00409         t2v.v = a2v.v; \
00410         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00411         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00412 \
00413         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00414         alpha3 += step_a3; \
00415         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00416 \
00417         t1v.v = a1v.v; \
00418         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00419         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00420 \
00421         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00422         alpha1 += step_a1; \
00423         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00424         alpha2 += step_a2; \
00425 \
00426         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00427         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00428 \
00429         t2v.v = a2v.v; \
00430         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00431         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00432 \
00433         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00434         alpha3 += step_a3; \
00435         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00436 \
00437         t1v.v = a1v.v; \
00438         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00439         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00440 \
00441         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00442         alpha1 += step_a1; \
00443         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00444         alpha2 += step_a2; \
00445 \
00446         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00447         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00448 \
00449         t2v.v = a2v.v; \
00450         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00451         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00452 \
00453         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00454         alpha3 += step_a3; \
00455         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00456 \
00457         t1v.v = a1v.v; \
00458         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00459         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00460 \
00461         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00462         alpha1 += step_a1; \
00463         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00464         alpha2 += step_a2; \
00465     } \
00466 \
00467     for ( i = 0; i < n_iter2; ++i ) \
00468     { \
00469 \
00470         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00471         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00472 \
00473         t2v.v = a2v.v; \
00474         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00475         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00476 \
00477         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00478         alpha3 += step_a3; \
00479         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00480 \
00481         t1v.v = a1v.v; \
00482         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00483         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00484 \
00485         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00486         alpha1 += step_a1; \
00487         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00488         alpha2 += step_a2; \
00489     } \
00490 \
00491     if ( n_left == 1 ) \
00492     { \
00493         double ga12 = *gamma12; \
00494         double si12 = *sigma12; \
00495         double ga23 = *gamma23; \
00496         double si23 = *sigma23; \
00497         double temp1; \
00498         double temp2; \
00499         double temp3; \
00500 \
00501         temp2 = *alpha2; \
00502         temp3 = *alpha3; \
00503 \
00504         *alpha2 = temp2 * ga23 + temp3 * si23; \
00505         *alpha3 = temp3 * ga23 - temp2 * si23; \
00506 \
00507         temp1 = *alpha1; \
00508         temp2 = *alpha2; \
00509 \
00510         *alpha1 = temp1 * ga12 + temp2 * si12; \
00511         *alpha2 = temp2 * ga12 - temp1 * si12; \
00512     } \
00513 }
00514 
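/*
    Single-precision complex kernel.  Each 128-bit register holds two
    scomplex values (four floats).  Because gamma and sigma are real, the
    same 4-wide real arithmetic used in the "ass" kernel rotates both the
    real and imaginary parts, so the main loop updates 16 complex rows per
    iteration, a second loop handles remaining pairs, and at most one
    leftover row is updated with scalar code in the same order as the
    vectorized loops: the (2,3) rotation first, then the (1,2) rotation.
*/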
00515 #define MAC_Apply_G_mx3b_asc( m_A, \
00516                               gamma12, \
00517                               sigma12, \
00518                               gamma23, \
00519                               sigma23, \
00520                               a1, inc_a1, \
00521                               a2, inc_a2, \
00522                               a3, inc_a3 ) \
00523 {\
00524     int                n_iter16 = m_A / ( 2 * 8 ); \
00525     int                n_left16 = m_A % ( 2 * 8 ); \
00526     int                n_iter2  = n_left16 / ( 2 * 1 ); \
00527     int                n_left   = n_left16 % ( 2 * 1 ); \
00528     int                i; \
00529 \
00530     const int          step_a1 = inc_a1 * 2; \
00531     const int          step_a2 = inc_a2 * 2; \
00532     const int          step_a3 = inc_a3 * 2; \
00533 \
00534     scomplex* restrict alpha1 = a1; \
00535     scomplex* restrict alpha2 = a2; \
00536     scomplex* restrict alpha3 = a3; \
00537 \
00538     v4sf_t             a1v, a2v, a3v; \
00539     v4sf_t             g12v, s12v; \
00540     v4sf_t             g23v, s23v; \
00541     v4sf_t             t1v, t2v; \
00542 \
00543     g12v.v = _mm_load1_ps( gamma12 ); \
00544     s12v.v = _mm_load1_ps( sigma12 ); \
00545     g23v.v = _mm_load1_ps( gamma23 ); \
00546     s23v.v = _mm_load1_ps( sigma23 ); \
00547 \
00548     for ( i = 0; i < n_iter16; ++i ) \
00549     { \
00550 \
00551         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00552         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00553 \
00554         t2v.v = a2v.v; \
00555         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00556         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00557 \
00558         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00559         alpha3 += step_a3; \
00560         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00561 \
00562         t1v.v = a1v.v; \
00563         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00564         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00565 \
00566         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00567         alpha1 += step_a1; \
00568         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00569         alpha2 += step_a2; \
00570 \
00571         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00572         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00573 \
00574         t2v.v = a2v.v; \
00575         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00576         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00577 \
00578         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00579         alpha3 += step_a3; \
00580         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00581 \
00582         t1v.v = a1v.v; \
00583         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00584         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00585 \
00586         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00587         alpha1 += step_a1; \
00588         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00589         alpha2 += step_a2; \
00590 \
00591         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00592         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00593 \
00594         t2v.v = a2v.v; \
00595         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00596         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00597 \
00598         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00599         alpha3 += step_a3; \
00600         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00601 \
00602         t1v.v = a1v.v; \
00603         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00604         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00605 \
00606         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00607         alpha1 += step_a1; \
00608         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00609         alpha2 += step_a2; \
00610 \
00611         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00612         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00613 \
00614         t2v.v = a2v.v; \
00615         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00616         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00617 \
00618         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00619         alpha3 += step_a3; \
00620         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00621 \
00622         t1v.v = a1v.v; \
00623         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00624         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00625 \
00626         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00627         alpha1 += step_a1; \
00628         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00629         alpha2 += step_a2; \
00630 \
00631         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00632         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00633 \
00634         t2v.v = a2v.v; \
00635         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00636         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00637 \
00638         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00639         alpha3 += step_a3; \
00640         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00641 \
00642         t1v.v = a1v.v; \
00643         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00644         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00645 \
00646         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00647         alpha1 += step_a1; \
00648         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00649         alpha2 += step_a2; \
00650 \
00651         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00652         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00653 \
00654         t2v.v = a2v.v; \
00655         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00656         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00657 \
00658         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00659         alpha3 += step_a3; \
00660         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00661 \
00662         t1v.v = a1v.v; \
00663         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00664         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00665 \
00666         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00667         alpha1 += step_a1; \
00668         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00669         alpha2 += step_a2; \
00670 \
00671         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00672         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00673 \
00674         t2v.v = a2v.v; \
00675         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00676         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00677 \
00678         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00679         alpha3 += step_a3; \
00680         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00681 \
00682         t1v.v = a1v.v; \
00683         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00684         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00685 \
00686         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00687         alpha1 += step_a1; \
00688         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00689         alpha2 += step_a2; \
00690 \
00691         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00692         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00693 \
00694         t2v.v = a2v.v; \
00695         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00696         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00697 \
00698         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00699         alpha3 += step_a3; \
00700         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00701 \
00702         t1v.v = a1v.v; \
00703         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00704         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00705 \
00706         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00707         alpha1 += step_a1; \
00708         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00709         alpha2 += step_a2; \
00710     } \
00711 \
00712     for ( i = 0; i < n_iter2; ++i ) \
00713     { \
00714 \
00715         a2v.v = _mm_load_ps( ( float* )alpha2 ); \
00716         a3v.v = _mm_load_ps( ( float* )alpha3 ); \
00717 \
00718         t2v.v = a2v.v; \
00719         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00720         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00721 \
00722         _mm_store_ps( ( float* )alpha3, a3v.v ); \
00723         alpha3 += step_a3; \
00724         a1v.v = _mm_load_ps( ( float* )alpha1 ); \
00725 \
00726         t1v.v = a1v.v; \
00727         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00728         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00729 \
00730         _mm_store_ps( ( float* )alpha1, a1v.v ); \
00731         alpha1 += step_a1; \
00732         _mm_store_ps( ( float* )alpha2, a2v.v ); \
00733         alpha2 += step_a2; \
00734     } \
00735 \
00736     if ( n_left == 1 ) \
00737     { \
00738         float ga12 = *gamma12; \
00739         float si12 = *sigma12; \
00740         float ga23 = *gamma23; \
00741         float si23 = *sigma23; \
00742         scomplex temp1; \
00743         scomplex temp2; \
00744         scomplex temp3; \
00745 \
00746         temp2 = *alpha2; \
00747         temp3 = *alpha3; \
00748 \
00749         alpha2->real = temp2.real * ga23 + temp3.real * si23; \
00750         alpha3->real = temp3.real * ga23 - temp2.real * si23; \
00751 \
00752         alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
00753         alpha3->imag = temp3.imag * ga23 - tem2.imag * si23; \
00754 \
00755         temp1 = *alpha1; \
00756         temp2 = *alpha2; \
00757 \
00758         alpha1->real = temp1.real * ga12 + temp2.real * si12; \
00759         alpha2->real = temp2.real * ga12 - temp1.real * si12; \
00760 \
00761         alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
00762         alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
00763     } \
00764 }
00765 
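/*
    Double-precision complex kernel.  Each 128-bit register holds exactly
    one dcomplex value, so rows are rotated one at a time: the main loop is
    unrolled eight rows per iteration and the second loop processes the
    remaining m_A % 8 rows with the same vector code.
*/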
00766 #define MAC_Apply_G_mx3b_asz( m_A, \
00767                               gamma12, \
00768                               sigma12, \
00769                               gamma23, \
00770                               sigma23, \
00771                               a1, inc_a1, \
00772                               a2, inc_a2, \
00773                               a3, inc_a3 ) \
00774 {\
00775     int                n_iter = m_A / 8; \
00776     int                n_left = m_A % 8; \
00777     int                i; \
00778 \
00779     const int          step_a1 = inc_a1 * 1; \
00780     const int          step_a2 = inc_a2 * 1; \
00781     const int          step_a3 = inc_a3 * 1; \
00782 \
00783     dcomplex* restrict alpha1 = a1; \
00784     dcomplex* restrict alpha2 = a2; \
00785     dcomplex* restrict alpha3 = a3; \
00786 \
00787     v2df_t             a1v, a2v, a3v; \
00788     v2df_t             g12v, s12v; \
00789     v2df_t             g23v, s23v; \
00790     v2df_t             t1v, t2v; \
00791 \
00792     g12v.v = _mm_loaddup_pd( gamma12 ); \
00793     s12v.v = _mm_loaddup_pd( sigma12 ); \
00794     g23v.v = _mm_loaddup_pd( gamma23 ); \
00795     s23v.v = _mm_loaddup_pd( sigma23 ); \
00796 \
00797     for ( i = 0; i < n_iter; ++i ) \
00798     { \
00799 \
00800         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00801         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00802 \
00803         t2v.v = a2v.v; \
00804         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00805         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00806 \
00807         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00808         alpha3 += step_a3; \
00809         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00810 \
00811         t1v.v = a1v.v; \
00812         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00813         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00814 \
00815         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00816         alpha1 += step_a1; \
00817         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00818         alpha2 += step_a2; \
00819 \
00820         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00821         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00822 \
00823         t2v.v = a2v.v; \
00824         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00825         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00826 \
00827         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00828         alpha3 += step_a3; \
00829         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00830 \
00831         t1v.v = a1v.v; \
00832         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00833         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00834 \
00835         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00836         alpha1 += step_a1; \
00837         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00838         alpha2 += step_a2; \
00839 \
00840         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00841         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00842 \
00843         t2v.v = a2v.v; \
00844         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00845         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00846 \
00847         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00848         alpha3 += step_a3; \
00849         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00850 \
00851         t1v.v = a1v.v; \
00852         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00853         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00854 \
00855         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00856         alpha1 += step_a1; \
00857         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00858         alpha2 += step_a2; \
00859 \
00860         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00861         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00862 \
00863         t2v.v = a2v.v; \
00864         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00865         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00866 \
00867         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00868         alpha3 += step_a3; \
00869         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00870 \
00871         t1v.v = a1v.v; \
00872         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00873         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00874 \
00875         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00876         alpha1 += step_a1; \
00877         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00878         alpha2 += step_a2; \
00879 \
00880         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00881         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00882 \
00883         t2v.v = a2v.v; \
00884         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00885         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00886 \
00887         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00888         alpha3 += step_a3; \
00889         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00890 \
00891         t1v.v = a1v.v; \
00892         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00893         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00894 \
00895         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00896         alpha1 += step_a1; \
00897         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00898         alpha2 += step_a2; \
00899 \
00900         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00901         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00902 \
00903         t2v.v = a2v.v; \
00904         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00905         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00906 \
00907         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00908         alpha3 += step_a3; \
00909         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00910 \
00911         t1v.v = a1v.v; \
00912         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00913         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00914 \
00915         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00916         alpha1 += step_a1; \
00917         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00918         alpha2 += step_a2; \
00919 \
00920         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00921         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00922 \
00923         t2v.v = a2v.v; \
00924         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00925         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00926 \
00927         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00928         alpha3 += step_a3; \
00929         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00930 \
00931         t1v.v = a1v.v; \
00932         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00933         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00934 \
00935         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00936         alpha1 += step_a1; \
00937         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00938         alpha2 += step_a2; \
00939 \
00940         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00941         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00942 \
00943         t2v.v = a2v.v; \
00944         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00945         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00946 \
00947         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00948         alpha3 += step_a3; \
00949         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00950 \
00951         t1v.v = a1v.v; \
00952         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00953         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00954 \
00955         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00956         alpha1 += step_a1; \
00957         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00958         alpha2 += step_a2; \
00959     } \
00960 \
00961     for ( i = 0; i < n_left; ++i ) \
00962     { \
00963 \
00964         a2v.v = _mm_load_pd( ( double* )alpha2 ); \
00965         a3v.v = _mm_load_pd( ( double* )alpha3 ); \
00966 \
00967         t2v.v = a2v.v; \
00968         a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
00969         a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
00970 \
00971         _mm_store_pd( ( double* )alpha3, a3v.v ); \
00972         alpha3 += step_a3; \
00973         a1v.v = _mm_load_pd( ( double* )alpha1 ); \
00974 \
00975         t1v.v = a1v.v; \
00976         a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
00977         a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
00978 \
00979         _mm_store_pd( ( double* )alpha1, a1v.v ); \
00980         alpha1 += step_a1; \
00981         _mm_store_pd( ( double* )alpha2, a2v.v ); \
00982         alpha2 += step_a2; \
00983     } \
00984 }
00985 
00986 #endif
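
A minimal usage sketch (not part of libflame), assuming the single-precision macro above is in scope, that v4sf_t matches the union the macro expects (a __m128 accessed through its .v member, as libflame defines elsewhere), that the column buffers are 16-byte aligned with unit increments as the aligned SSE loads and stores require, and that the compiler supports arithmetic operators on __m128 (GCC/Clang vector extensions):

#include <xmmintrin.h>

typedef union
{
    __m128 v;
    float  f[4];
} v4sf_t;

/* Hypothetical wrapper: rotates columns (a2, a3) by (gamma23, sigma23) and
   then (a1, a2) by (gamma12, sigma12) over all m rows, with unit strides.
   MAC_Apply_G_mx3b_ass is assumed to be defined as in the header above.   */
static void apply_two_rotations( int m,
                                 float* a1, float* a2, float* a3,
                                 float* gamma12, float* sigma12,
                                 float* gamma23, float* sigma23 )
{
    MAC_Apply_G_mx3b_ass( m,
                          gamma12, sigma12,
                          gamma23, sigma23,
                          a1, 1,
                          a2, 1,
                          a3, 1 );
}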