{
double* restrict chi1;
double* restrict chi2;
double* restrict chi3;
double* restrict psi1;
int i;
int n_pre;
int n_run;
int n_left;
v2df_t a1v, a2v, a3v;
v2df_t x11v, x12v, x13v;
v2df_t x21v, x22v, x23v;
v2df_t y1v;
v2df_t y2v;
if ( inc_x1 != 1 ||
inc_x2 != 1 ||
inc_x3 != 1 ||
inc_y != 1 ) bl1_abort();
n_pre = 0;
if ( ( unsigned long ) y % 16 != 0 )
{
if ( ( unsigned long ) x1 % 16 == 0 ||
( unsigned long ) x2 % 16 == 0 ||
( unsigned long ) x3 % 16 == 0 ) bl1_abort();
n_pre = 1;
}
n_run = ( n - n_pre ) / 4;
n_left = ( n - n_pre ) % 4;
chi1 = x1;
chi2 = x2;
chi3 = x3;
psi1 = y;
if ( n_pre == 1 )
{
double alpha1_c = *alpha1;
double alpha2_c = *alpha2;
double alpha3_c = *alpha3;
double chi11_c = *chi1;
double chi12_c = *chi2;
double chi13_c = *chi3;
*psi1 += alpha1_c * chi11_c + alpha2_c * chi12_c + alpha3_c * chi13_c;
chi1 += inc_x1;
chi2 += inc_x2;
chi3 += inc_x3;
psi1 += inc_y;
}
a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
a2v.v = _mm_loaddup_pd( ( double* )alpha2 );
a3v.v = _mm_loaddup_pd( ( double* )alpha3 );
for ( i = 0; i < n_run; ++i )
{
x11v.v = _mm_load_pd( ( double* )chi1 );
x12v.v = _mm_load_pd( ( double* )chi2 );
x13v.v = _mm_load_pd( ( double* )chi3 );
y1v.v = _mm_load_pd( ( double* )psi1 );
y1v.v += a1v.v * x11v.v + a2v.v * x12v.v + a3v.v * x13v.v;
_mm_store_pd( ( double* )psi1, y1v.v );
x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
x23v.v = _mm_load_pd( ( double* )(chi3 + 2) );
y2v.v = _mm_load_pd( ( double* )(psi1 + 2) );
y2v.v += a1v.v * x21v.v + a2v.v * x22v.v + a3v.v * x23v.v;
_mm_store_pd( ( double* )(psi1 + 2), y2v.v );
chi1 += 4;
chi2 += 4;
chi3 += 4;
psi1 += 4;
}
if ( n_left > 0 )
{
double alpha1_c = *alpha1;
double alpha2_c = *alpha2;
double alpha3_c = *alpha3;
for ( i = 0; i < n_left; ++i )
{
double chi11_c = *chi1;
double chi12_c = *chi2;
double chi13_c = *chi3;
*psi1 += alpha1_c * chi11_c + alpha2_c * chi12_c + alpha3_c * chi13_c;
chi1 += inc_x1;
chi2 += inc_x2;
chi3 += inc_x3;
psi1 += inc_y;
}
}
}