/*

   Copyright (C) 2014, The University of Texas at Austin

   This file is part of libflame and is available under the 3-Clause
   BSD license, which can be found in the LICENSE file at the top-level
   directory, or at http://opensource.org/licenses/BSD-3-Clause

*/


#if FLA_VECTOR_INTRINSIC_TYPE == FLA_NO_INTRINSICS

#define MAC_Apply_G_mx4s_ass MAC_Apply_G_mx4s_ops
#define MAC_Apply_G_mx4s_asd MAC_Apply_G_mx4s_opd
#define MAC_Apply_G_mx4s_asc MAC_Apply_G_mx4s_opc
#define MAC_Apply_G_mx4s_asz MAC_Apply_G_mx4s_opz

#elif FLA_VECTOR_INTRINSIC_TYPE == FLA_SSE_INTRINSICS

#define MAC_Apply_G_mx4s_ass( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
  int n_iter32 = m_A / ( 4 * 8 ); \
  int n_left32 = m_A % ( 4 * 8 ); \
  int n_iter4  = n_left32 / ( 4 * 1 ); \
  int n_left   = n_left32 % ( 4 * 1 ); \
  int i; \
\
  const int step_a1 = inc_a1 * 4; \
  const int step_a2 = inc_a2 * 4; \
  const int step_a3 = inc_a3 * 4; \
  const int step_a4 = inc_a4 * 4; \
\
  float* restrict alpha1 = a1; \
  float* restrict alpha2 = a2; \
  float* restrict alpha3 = a3; \
  float* restrict alpha4 = a4; \
\
  v4sf_t a1v, a2v, a3v, a4v; \
  v4sf_t b1v, b2v, b3v, b4v; \
  v4sf_t g23_k1v, s23_k1v; \
  v4sf_t g34_k1v, s34_k1v; \
  v4sf_t g12_k2v, s12_k2v; \
  v4sf_t g23_k2v, s23_k2v; \
  v4sf_t t1v, t2v, t3v; \
\
  g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
  s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
  g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
  s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
  g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
  s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
  g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
  s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
\
  for ( i = 0; i < n_iter32; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
\
    /* ----------------------------------------------------------- */ \
  } \
\
  for ( i = 0; i < n_iter4; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
\
  for ( i = 0; i < n_left; ++i ) \
  { \
    float ga23_k1 = *gamma23_k1; \
    float si23_k1 = *sigma23_k1; \
    float ga34_k1 = *gamma34_k1; \
    float si34_k1 = *sigma34_k1; \
    float ga12_k2 = *gamma12_k2; \
    float si12_k2 = *sigma12_k2; \
    float ga23_k2 = *gamma23_k2; \
    float si23_k2 = *sigma23_k2; \
    float temp1; \
    float temp2; \
    float temp3; \
    float temp4; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
    *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
    temp3 = *alpha3; \
    temp4 = *alpha4; \
\
    *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
    *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
    *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
    *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
    alpha1 += 1; \
    alpha2 += 1; \
    alpha3 += 1; \
    alpha4 += 1; \
  } \
}

#define MAC_Apply_G_mx4s_asd( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2  = n_left16 / ( 2 * 1 ); \
  int n_left   = n_left16 % ( 2 * 1 ); \
  int i; \
\
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
  const int step_a4 = inc_a4 * 2; \
\
  double* restrict alpha1 = a1; \
  double* restrict alpha2 = a2; \
  double* restrict alpha3 = a3; \
  double* restrict alpha4 = a4; \
\
  v2df_t a1v, a2v, a3v, a4v; \
  v2df_t b1v, b2v, b3v, b4v; \
  v2df_t g23_k1v, s23_k1v; \
  v2df_t g34_k1v, s34_k1v; \
  v2df_t g12_k2v, s12_k2v; \
  v2df_t g23_k2v, s23_k2v; \
  v2df_t t1v, t2v, t3v; \
\
  g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
  s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
  g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
  s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
  g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
  s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
  g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
  s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
\
  for ( i = 0; i < n_iter16; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
\
    /* ----------------------------------------------------------- */ \
  } \
\
  for ( i = 0; i < n_iter2; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
\
  if ( n_left == 1 ) \
  { \
    double ga23_k1 = *gamma23_k1; \
    double si23_k1 = *sigma23_k1; \
    double ga34_k1 = *gamma34_k1; \
    double si34_k1 = *sigma34_k1; \
    double ga12_k2 = *gamma12_k2; \
    double si12_k2 = *sigma12_k2; \
    double ga23_k2 = *gamma23_k2; \
    double si23_k2 = *sigma23_k2; \
    double temp1; \
    double temp2; \
    double temp3; \
    double temp4; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k1 + temp3 * si23_k1; \
    *alpha3 = temp3 * ga23_k1 - temp2 * si23_k1; \
\
    temp3 = *alpha3; \
    temp4 = *alpha4; \
\
    *alpha3 = temp3 * ga34_k1 + temp4 * si34_k1; \
    *alpha4 = temp4 * ga34_k1 - temp3 * si34_k1; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    *alpha1 = temp1 * ga12_k2 + temp2 * si12_k2; \
    *alpha2 = temp2 * ga12_k2 - temp1 * si12_k2; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    *alpha2 = temp2 * ga23_k2 + temp3 * si23_k2; \
    *alpha3 = temp3 * ga23_k2 - temp2 * si23_k2; \
\
  } \
}

#define MAC_Apply_G_mx4s_asc( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
  int n_iter16 = m_A / ( 2 * 8 ); \
  int n_left16 = m_A % ( 2 * 8 ); \
  int n_iter2  = n_left16 / ( 2 * 1 ); \
  int n_left   = n_left16 % ( 2 * 1 ); \
  int i; \
\
  const int step_a1 = inc_a1 * 2; \
  const int step_a2 = inc_a2 * 2; \
  const int step_a3 = inc_a3 * 2; \
  const int step_a4 = inc_a4 * 2; \
\
  scomplex* restrict alpha1 = a1; \
  scomplex* restrict alpha2 = a2; \
  scomplex* restrict alpha3 = a3; \
  scomplex* restrict alpha4 = a4; \
\
  v4sf_t a1v, a2v, a3v, a4v; \
  v4sf_t b1v, b2v, b3v, b4v; \
  v4sf_t g23_k1v, s23_k1v; \
  v4sf_t g34_k1v, s34_k1v; \
  v4sf_t g12_k2v, s12_k2v; \
  v4sf_t g23_k2v, s23_k2v; \
  v4sf_t t1v, t2v, t3v; \
\
  g23_k1v.v = _mm_load1_ps( gamma23_k1 ); \
  s23_k1v.v = _mm_load1_ps( sigma23_k1 ); \
  g34_k1v.v = _mm_load1_ps( gamma34_k1 ); \
  s34_k1v.v = _mm_load1_ps( sigma34_k1 ); \
  g12_k2v.v = _mm_load1_ps( gamma12_k2 ); \
  s12_k2v.v = _mm_load1_ps( sigma12_k2 ); \
  g23_k2v.v = _mm_load1_ps( gamma23_k2 ); \
  s23_k2v.v = _mm_load1_ps( sigma23_k2 ); \
\
  for ( i = 0; i < n_iter16; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_ps( ( float* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_ps( ( float* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
\
    _mm_store_ps( ( float* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
\
    /* ----------------------------------------------------------- */ \
  } \
\
  for ( i = 0; i < n_iter2; ++i ) \
  { \
\
    a2v.v = _mm_load_ps( ( float* )alpha2 ); \
    a3v.v = _mm_load_ps( ( float* )alpha3 ); \
    a4v.v = _mm_load_ps( ( float* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_ps( ( float* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_ps( ( float* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_ps( ( float* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_ps( ( float* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_ps( ( float* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
\
  if ( n_left == 1 ) \
  { \
    float ga23_k1 = *gamma23_k1; \
    float si23_k1 = *sigma23_k1; \
    float ga34_k1 = *gamma34_k1; \
    float si34_k1 = *sigma34_k1; \
    float ga12_k2 = *gamma12_k2; \
    float si12_k2 = *sigma12_k2; \
    float ga23_k2 = *gamma23_k2; \
    float si23_k2 = *sigma23_k2; \
    scomplex temp1; \
    scomplex temp2; \
    scomplex temp3; \
    scomplex temp4; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    alpha2->real = temp2.real * ga23_k1 + temp3.real * si23_k1; \
    alpha3->real = temp3.real * ga23_k1 - temp2.real * si23_k1; \
\
    alpha2->imag = temp2.imag * ga23_k1 + temp3.imag * si23_k1; \
    alpha3->imag = temp3.imag * ga23_k1 - temp2.imag * si23_k1; \
\
    temp3 = *alpha3; \
    temp4 = *alpha4; \
\
    alpha3->real = temp3.real * ga34_k1 + temp4.real * si34_k1; \
    alpha4->real = temp4.real * ga34_k1 - temp3.real * si34_k1; \
\
    alpha3->imag = temp3.imag * ga34_k1 + temp4.imag * si34_k1; \
    alpha4->imag = temp4.imag * ga34_k1 - temp3.imag * si34_k1; \
\
    temp1 = *alpha1; \
    temp2 = *alpha2; \
\
    alpha1->real = temp1.real * ga12_k2 + temp2.real * si12_k2; \
    alpha2->real = temp2.real * ga12_k2 - temp1.real * si12_k2; \
\
    alpha1->imag = temp1.imag * ga12_k2 + temp2.imag * si12_k2; \
    alpha2->imag = temp2.imag * ga12_k2 - temp1.imag * si12_k2; \
\
    temp2 = *alpha2; \
    temp3 = *alpha3; \
\
    alpha2->real = temp2.real * ga23_k2 + temp3.real * si23_k2; \
    alpha3->real = temp3.real * ga23_k2 - temp2.real * si23_k2; \
\
    alpha2->imag = temp2.imag * ga23_k2 + temp3.imag * si23_k2; \
    alpha3->imag = temp3.imag * ga23_k2 - temp2.imag * si23_k2; \
\
  } \
}

#define MAC_Apply_G_mx4s_asz( m_A, \
                              gamma23_k1, \
                              sigma23_k1, \
                              gamma34_k1, \
                              sigma34_k1, \
                              gamma12_k2, \
                              sigma12_k2, \
                              gamma23_k2, \
                              sigma23_k2, \
                              a1, inc_a1, \
                              a2, inc_a2, \
                              a3, inc_a3, \
                              a4, inc_a4 ) \
{ \
  int n_iter = m_A / 8; \
  int n_left = m_A % 8; \
  int i; \
\
  const int step_a1 = inc_a1 * 1; \
  const int step_a2 = inc_a2 * 1; \
  const int step_a3 = inc_a3 * 1; \
  const int step_a4 = inc_a4 * 1; \
\
  dcomplex* restrict alpha1 = a1; \
  dcomplex* restrict alpha2 = a2; \
  dcomplex* restrict alpha3 = a3; \
  dcomplex* restrict alpha4 = a4; \
\
  v2df_t a1v, a2v, a3v, a4v; \
  v2df_t b1v, b2v, b3v, b4v; \
  v2df_t g23_k1v, s23_k1v; \
  v2df_t g34_k1v, s34_k1v; \
  v2df_t g12_k2v, s12_k2v; \
  v2df_t g23_k2v, s23_k2v; \
  v2df_t t1v, t2v, t3v; \
\
  g23_k1v.v = _mm_loaddup_pd( gamma23_k1 ); \
  s23_k1v.v = _mm_loaddup_pd( sigma23_k1 ); \
  g34_k1v.v = _mm_loaddup_pd( gamma34_k1 ); \
  s34_k1v.v = _mm_loaddup_pd( sigma34_k1 ); \
  g12_k2v.v = _mm_loaddup_pd( gamma12_k2 ); \
  s12_k2v.v = _mm_loaddup_pd( sigma12_k2 ); \
  g23_k2v.v = _mm_loaddup_pd( gamma23_k2 ); \
  s23_k2v.v = _mm_loaddup_pd( sigma23_k2 ); \
\
  for ( i = 0; i < n_iter; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
    a2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
    a3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
    b2v.v = _mm_load_pd( ( double* )(alpha2 + step_a2) ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    b3v.v = _mm_load_pd( ( double* )(alpha3 + step_a3) ); \
\
    /* ----------------------------------------------------------- */ \
\
    b4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k1v.v + b3v.v * s23_k1v.v; \
    b3v.v = b3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
    b1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = b3v.v; \
    b3v.v = t3v.v * g34_k1v.v + b4v.v * s34_k1v.v; \
    b4v.v = b4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, b4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = b1v.v; \
    b1v.v = t1v.v * g12_k2v.v + b2v.v * s12_k2v.v; \
    b2v.v = b2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, b1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = b2v.v; \
    b2v.v = t2v.v * g23_k2v.v + b3v.v * s23_k2v.v; \
    b3v.v = b3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, b2v.v ); \
    alpha2 += step_a2; \
\
    _mm_store_pd( ( double* )alpha3, b3v.v ); \
    alpha3 += step_a3; \
\
    /* ----------------------------------------------------------- */ \
  } \
\
  for ( i = 0; i < n_left; ++i ) \
  { \
\
    a2v.v = _mm_load_pd( ( double* )alpha2 ); \
    a3v.v = _mm_load_pd( ( double* )alpha3 ); \
    a4v.v = _mm_load_pd( ( double* )alpha4 ); \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k1v.v + a3v.v * s23_k1v.v; \
    a3v.v = a3v.v * g23_k1v.v - t2v.v * s23_k1v.v; \
\
    a1v.v = _mm_load_pd( ( double* )alpha1 ); \
\
    t3v.v = a3v.v; \
    a3v.v = t3v.v * g34_k1v.v + a4v.v * s34_k1v.v; \
    a4v.v = a4v.v * g34_k1v.v - t3v.v * s34_k1v.v; \
\
    _mm_store_pd( ( double* )alpha4, a4v.v ); \
    alpha4 += step_a4; \
\
    t1v.v = a1v.v; \
    a1v.v = t1v.v * g12_k2v.v + a2v.v * s12_k2v.v; \
    a2v.v = a2v.v * g12_k2v.v - t1v.v * s12_k2v.v; \
\
    _mm_store_pd( ( double* )alpha1, a1v.v ); \
    alpha1 += step_a1; \
\
    t2v.v = a2v.v; \
    a2v.v = t2v.v * g23_k2v.v + a3v.v * s23_k2v.v; \
    a3v.v = a3v.v * g23_k2v.v - t2v.v * s23_k2v.v; \
\
    _mm_store_pd( ( double* )alpha2, a2v.v ); \
    alpha2 += step_a2; \
    _mm_store_pd( ( double* )alpha3, a3v.v ); \
    alpha3 += step_a3; \
  } \
}

#endif
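
/*
   Usage note (illustrative sketch, not taken from the original file): each
   macro above applies two overlapping sets of real Givens rotations, the
   G23/G34 pair from one bulge-chasing step (k1) and the G12/G23 pair from
   the next step (k2), to four columns a1..a4 of length m_A. The gamma/sigma
   arguments are passed by address, and the SSE paths use aligned loads, so
   the columns are assumed to be 16-byte aligned. A hypothetical call to the
   double-precision variant, with unit element stride in every column, might
   look like this; the variable names below are placeholders, not part of
   the libflame interface:

     double  gamma23_k1, sigma23_k1, gamma34_k1, sigma34_k1;
     double  gamma12_k2, sigma12_k2, gamma23_k2, sigma23_k2;
     double* a1;
     double* a2;
     double* a3;
     double* a4;
     int     m_A;

     MAC_Apply_G_mx4s_asd( m_A,
                           &gamma23_k1, &sigma23_k1,
                           &gamma34_k1, &sigma34_k1,
                           &gamma12_k2, &sigma12_k2,
                           &gamma23_k2, &sigma23_k2,
                           a1, 1,
                           a2, 1,
                           a3, 1,
                           a4, 1 );
*/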