libflame
/*

   Copyright (C) 2014, The University of Texas at Austin

   This file is part of libflame and is available under the 3-Clause
   BSD license, which can be found in the LICENSE file at the top-level
   directory, or at http://opensource.org/licenses/BSD-3-Clause

*/


#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx2_ass MAC_Apply_G_mx2_ops
#define MAC_Apply_G_mx2_asd MAC_Apply_G_mx2_opd
#define MAC_Apply_G_mx2_asc MAC_Apply_G_mx2_opc
#define MAC_Apply_G_mx2_asz MAC_Apply_G_mx2_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx2_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter32 = m_A / ( 4 * 8 ); \
    int n_left32 = m_A % ( 4 * 8 ); \
    int n_iter4  = n_left32 / ( 4 * 1 ); \
    int n_left   = n_left32 % ( 4 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 4; \
    const int step_a2 = inc_a2 * 4; \
\
    float* restrict alpha1 = a1; \
    float* restrict alpha2 = a2; \
\
    v4sf_t a1v, a2v; \
    v4sf_t g12v, s12v; \
    v4sf_t t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float temp1; \
        float temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        alpha1 += 1; \
        alpha2 += 1; \
    } \
}

#define MAC_Apply_G_mx2_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a2 * 2; \
\
    double* restrict alpha1 = a1; \
    double* restrict alpha2 = a2; \
\
    v2df_t a1v, a2v; \
    v2df_t g12v, s12v; \
    v2df_t t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double temp1; \
        double temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
    } \
}

#define MAC_Apply_G_mx2_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter16 = m_A / ( 2 * 8 ); \
    int n_left16 = m_A % ( 2 * 8 ); \
    int n_iter2  = n_left16 / ( 2 * 1 ); \
    int n_left   = n_left16 % ( 2 * 1 ); \
    int i; \
\
    const int step_a1 = inc_a1 * 2; \
    const int step_a2 = inc_a2 * 2; \
\
    scomplex* restrict alpha1 = a1; \
    scomplex* restrict alpha2 = a2; \
\
    v4sf_t a1v, a2v; \
    v4sf_t g12v, s12v; \
    v4sf_t t1v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        scomplex temp1; \
        scomplex temp2; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
    } \
}

#define MAC_Apply_G_mx2_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             a1, inc_a1, \
                             a2, inc_a2 ) \
{\
    int n_iter = m_A / 8; \
    int n_left = m_A % 8; \
    int i; \
\
    const int step_a1 = inc_a1 * 1; \
    const int step_a2 = inc_a2 * 1; \
\
    dcomplex* restrict alpha1 = a1; \
    dcomplex* restrict alpha2 = a2; \
\
    v2df_t a1v, a2v; \
    v2df_t g12v, s12v; \
    v2df_t t1v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
\
        alpha1 += step_a1; \
        alpha2 += step_a2; \
    } \
}

#endif
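
Each of the four macros above applies the same 2x2 plane (Givens) rotation, with real entries gamma12 and sigma12, to corresponding elements of the two vectors a1 and a2; the variants differ only in element type (float, double, scomplex, dcomplex) and in how far the main loop is unrolled. As a reading aid, here is a hedged scalar sketch of the update the single-precision real path performs; the function name is hypothetical and it is not part of libflame, and it assumes unit stride, matching the packed _mm_load_ps()/_mm_load_pd() accesses used above.

/* Hypothetical scalar reference (illustration only, not part of libflame):
   replaces each pair ( a1[i], a2[i] ) with
       a1[i] <- gamma12 * a1[i] + sigma12 * a2[i]
       a2[i] <- gamma12 * a2[i] - sigma12 * a1[i]
   using the original a1[i] in the second line, just as t1v does above. */
static void apply_g_mx2_ref_s( int m_A, float gamma12, float sigma12,
                               float* a1, float* a2 )
{
    int i;
    for ( i = 0; i < m_A; ++i )
    {
        float temp1 = a1[ i ];
        float temp2 = a2[ i ];

        a1[ i ] = temp1 * gamma12 + temp2 * sigma12;
        a2[ i ] = temp2 * gamma12 - temp1 * sigma12;
    }
}

The complex variants apply the same two real scalars to the real and imaginary parts independently, which is why the SSE code can treat an scomplex or dcomplex element as a pair of packed floats or doubles.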