libflame
/*

   Copyright (C) 2014, The University of Texas at Austin

   This file is part of libflame and is available under the 3-Clause
   BSD license, which can be found in the LICENSE file at the top-level
   directory, or at http://opensource.org/licenses/BSD-3-Clause

*/


#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx3_ass MAC_Apply_G_mx3_ops
#define MAC_Apply_G_mx3_asd MAC_Apply_G_mx3_opd
#define MAC_Apply_G_mx3_asc MAC_Apply_G_mx3_opc
#define MAC_Apply_G_mx3_asz MAC_Apply_G_mx3_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx3_ass( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
    int              n_iter32 = m_A / ( 4 * 8 ); \
    int              n_left32 = m_A % ( 4 * 8 ); \
    int              n_iter4  = n_left32 / ( 4 * 1 ); \
    int              n_left   = n_left32 % ( 4 * 1 ); \
    int              i; \
\
    const int        step_a1 = inc_a1 * 4; \
    const int        step_a2 = inc_a1 * 4; \
    const int        step_a3 = inc_a1 * 4; \
\
    float* restrict  alpha1 = a1; \
    float* restrict  alpha2 = a2; \
    float* restrict  alpha3 = a3; \
\
    v4sf_t           a1v, a2v, a3v; \
    v4sf_t           g12v, s12v; \
    v4sf_t           g23v, s23v; \
    v4sf_t           t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    for ( i = 0; i < n_iter32; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_iter4; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        float ga12 = *gamma12; \
        float si12 = *sigma12; \
        float ga23 = *gamma23; \
        float si23 = *sigma23; \
        float temp1; \
        float temp2; \
        float temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
\
        alpha1 += 1; \
        alpha2 += 1; \
        alpha3 += 1; \
    } \
}

#define MAC_Apply_G_mx3_asd( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
    int               n_iter16 = m_A / ( 2 * 8 ); \
    int               n_left16 = m_A % ( 2 * 8 ); \
    int               n_iter2  = n_left16 / ( 2 * 1 ); \
    int               n_left   = n_left16 % ( 2 * 1 ); \
    int               i; \
\
    const int         step_a1 = inc_a1 * 2; \
    const int         step_a2 = inc_a1 * 2; \
    const int         step_a3 = inc_a1 * 2; \
\
    double* restrict  alpha1 = a1; \
    double* restrict  alpha2 = a2; \
    double* restrict  alpha3 = a3; \
\
    v2df_t            a1v, a2v, a3v; \
    v2df_t            g12v, s12v; \
    v2df_t            g23v, s23v; \
    v2df_t            t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    if ( n_left == 1 ) \
    { \
        double ga12 = *gamma12; \
        double si12 = *sigma12; \
        double ga23 = *gamma23; \
        double si23 = *sigma23; \
        double temp1; \
        double temp2; \
        double temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        *alpha1 = temp1 * ga12 + temp2 * si12; \
        *alpha2 = temp2 * ga12 - temp1 * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        *alpha2 = temp2 * ga23 + temp3 * si23; \
        *alpha3 = temp3 * ga23 - temp2 * si23; \
    } \
}

#define MAC_Apply_G_mx3_asc( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{ \
    int                 n_iter16 = m_A / ( 2 * 8 ); \
    int                 n_left16 = m_A % ( 2 * 8 ); \
    int                 n_iter2  = n_left16 / ( 2 * 1 ); \
    int                 n_left   = n_left16 % ( 2 * 1 ); \
    int                 i; \
\
    const int           step_a1 = inc_a1 * 2; \
    const int           step_a2 = inc_a1 * 2; \
    const int           step_a3 = inc_a1 * 2; \
\
    scomplex* restrict  alpha1 = a1; \
    scomplex* restrict  alpha2 = a2; \
    scomplex* restrict  alpha3 = a3; \
\
    v4sf_t              a1v, a2v, a3v; \
    v4sf_t              g12v, s12v; \
    v4sf_t              g23v, s23v; \
    v4sf_t              t1v, t2v; \
\
    g12v.v = _mm_load1_ps( gamma12 ); \
    s12v.v = _mm_load1_ps( sigma12 ); \
    g23v.v = _mm_load1_ps( gamma23 ); \
    s23v.v = _mm_load1_ps( sigma23 ); \
\
    for ( i = 0; i < n_iter16; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_iter2; ++i ) \
    { \
\
        a1v.v = _mm_load_ps( ( float* )alpha1 ); \
        a2v.v = _mm_load_ps( ( float* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_ps( ( float* )alpha3 ); \
        _mm_store_ps( ( float* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_ps( ( float* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_ps( ( float* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
\
    if ( n_left == 1 ) \
    { \
        float    ga12 = *gamma12; \
        float    si12 = *sigma12; \
        float    ga23 = *gamma23; \
        float    si23 = *sigma23; \
        scomplex temp1; \
        scomplex temp2; \
        scomplex temp3; \
\
        temp1 = *alpha1; \
        temp2 = *alpha2; \
\
        alpha1->real = temp1.real * ga12 + temp2.real * si12; \
        alpha2->real = temp2.real * ga12 - temp1.real * si12; \
\
        alpha1->imag = temp1.imag * ga12 + temp2.imag * si12; \
        alpha2->imag = temp2.imag * ga12 - temp1.imag * si12; \
\
        temp2 = *alpha2; \
        temp3 = *alpha3; \
\
        alpha2->real = temp2.real * ga23 + temp3.real * si23; \
        alpha3->real = temp3.real * ga23 - temp2.real * si23; \
\
        alpha2->imag = temp2.imag * ga23 + temp3.imag * si23; \
        alpha3->imag = temp3.imag * ga23 - temp2.imag * si23; \
    } \
}

#define MAC_Apply_G_mx3_asz( m_A, \
                             gamma12, \
                             sigma12, \
                             gamma23, \
                             sigma23, \
                             a1, inc_a1, \
                             a2, inc_a2, \
                             a3, inc_a3 ) \
{\
    int                 n_iter = m_A / 8; \
    int                 n_left = m_A % 8; \
    int                 i; \
\
    const int           step_a1 = inc_a1 * 1; \
    const int           step_a2 = inc_a1 * 1; \
    const int           step_a3 = inc_a1 * 1; \
\
    dcomplex* restrict  alpha1 = a1; \
    dcomplex* restrict  alpha2 = a2; \
    dcomplex* restrict  alpha3 = a3; \
\
    v2df_t              a1v, a2v, a3v; \
    v2df_t              g12v, s12v; \
    v2df_t              g23v, s23v; \
    v2df_t              t1v, t2v; \
\
    g12v.v = _mm_loaddup_pd( gamma12 ); \
    s12v.v = _mm_loaddup_pd( sigma12 ); \
    g23v.v = _mm_loaddup_pd( gamma23 ); \
    s23v.v = _mm_loaddup_pd( sigma23 ); \
\
    for ( i = 0; i < n_iter; ++i ) \
    { \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
\
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha2 += step_a2; \
        alpha3 += step_a3; \
    } \
\
    for ( i = 0; i < n_left; ++i ) \
    { \
        a1v.v = _mm_load_pd( ( double* )alpha1 ); \
        a2v.v = _mm_load_pd( ( double* )alpha2 ); \
\
        t1v.v = a1v.v; \
        a1v.v = t1v.v * g12v.v + a2v.v * s12v.v; \
        a2v.v = a2v.v * g12v.v - t1v.v * s12v.v; \
\
        _mm_store_pd( ( double* )alpha1, a1v.v ); \
        alpha1 += step_a1; \
        a3v.v = _mm_load_pd( ( double* )alpha3 ); \
\
        t2v.v = a2v.v; \
        a2v.v = t2v.v * g23v.v + a3v.v * s23v.v; \
        a3v.v = a3v.v * g23v.v - t2v.v * s23v.v; \
\
        _mm_store_pd( ( double* )alpha2, a2v.v ); \
        alpha2 += step_a2; \
        _mm_store_pd( ( double* )alpha3, a3v.v ); \
        alpha3 += step_a3; \
    } \
}

#endif
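
All four macros implement the same update, specialized by datatype and vector width: for each row, the Givens rotation defined by ( gamma12, sigma12 ) is applied to the column pair ( a1, a2 ), and then the rotation defined by ( gamma23, sigma23 ) is applied to ( a2, a3 ), exactly as in the scalar cleanup loops above. The sketch below is an illustrative scalar reference for the real single-precision case only; it assumes unit column stride, passes the rotation parameters by value, and the name apply_g_mx3_ref is hypothetical, not part of this header.

#include <stddef.h>

/* Illustrative scalar reference for the update performed by MAC_Apply_G_mx3_ass
   (hypothetical helper, unit stride assumed): apply the rotation
   ( gamma12, sigma12 ) to columns ( a1, a2 ), then ( gamma23, sigma23 )
   to columns ( a2, a3 ), one row at a time. */
static void apply_g_mx3_ref( size_t m_A,
                             float  gamma12, float sigma12,
                             float  gamma23, float sigma23,
                             float* a1, float* a2, float* a3 )
{
    size_t i;

    for ( i = 0; i < m_A; ++i )
    {
        float temp1 = a1[ i ];
        float temp2 = a2[ i ];
        float temp3;

        /* First rotation: mixes columns 1 and 2. */
        a1[ i ] = temp1 * gamma12 + temp2 * sigma12;
        a2[ i ] = temp2 * gamma12 - temp1 * sigma12;

        /* Second rotation: mixes columns 2 and 3, using the updated column 2. */
        temp2 = a2[ i ];
        temp3 = a3[ i ];

        a2[ i ] = temp2 * gamma23 + temp3 * sigma23;
        a3[ i ] = temp3 * gamma23 - temp2 * sigma23;
    }
}

The vectorized macros perform exactly this arithmetic on 4-wide (float), 2-wide (double), 2-complex (scomplex), or 1-complex (dcomplex) SSE registers per iteration, with the main loops unrolled eight times and the remaining rows handled by the shorter loops at the end of each macro.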