{
double* restrict x1;
double* restrict y1;
double* restrict w1;
double* restrict z1;
double rho1, rho2, rho3;
double x1c, y1c, w1c, z1c;
int i;
int n_pre;
int n_run;
int n_left;
v2df_t rho1v, rho2v, rho3v;
v2df_t x1v, y1v, w1v, z1v;
v2df_t x2v, y2v, w2v, z2v;
if ( inc_x != 1 ||
inc_y != 1 ||
inc_w != 1 ||
inc_z != 1 ) bl1_abort();
n_pre = 0;
if ( ( unsigned long ) z % 16 != 0 )
{
if ( ( unsigned long ) x % 16 == 0 ||
( unsigned long ) y % 16 == 0 ||
( unsigned long ) w % 16 == 0 ) bl1_abort();
n_pre = 1;
}
n_run = ( n - n_pre ) / 4;
n_left = ( n - n_pre ) % 4;
x1 = x;
y1 = y;
w1 = w;
z1 = z;
rho1 = 0.0;
rho2 = 0.0;
rho3 = 0.0;
if ( n_pre == 1 )
{
x1c = *x1;
y1c = *y1;
w1c = *w1;
z1c = *z1;
rho1 += x1c * z1c;
rho2 += y1c * z1c;
rho3 += w1c * z1c;
x1 += inc_x;
y1 += inc_y;
w1 += inc_w;
z1 += inc_z;
}
rho1v.v = _mm_setzero_pd();
rho2v.v = _mm_setzero_pd();
rho3v.v = _mm_setzero_pd();
for ( i = 0; i < n_run; ++i )
{
x1v.v = _mm_load_pd( ( double* )x1 );
y1v.v = _mm_load_pd( ( double* )y1 );
w1v.v = _mm_load_pd( ( double* )w1 );
z1v.v = _mm_load_pd( ( double* )z1 );
rho1v.v += x1v.v * z1v.v;
rho2v.v += y1v.v * z1v.v;
rho3v.v += w1v.v * z1v.v;
x2v.v = _mm_load_pd( ( double* )(x1 + 2) );
y2v.v = _mm_load_pd( ( double* )(y1 + 2) );
w2v.v = _mm_load_pd( ( double* )(w1 + 2) );
z2v.v = _mm_load_pd( ( double* )(z1 + 2) );
rho1v.v += x2v.v * z2v.v;
rho2v.v += y2v.v * z2v.v;
rho3v.v += w2v.v * z2v.v;
x1 += 4;
y1 += 4;
w1 += 4;
z1 += 4;
}
rho1 += rho1v.d[0] + rho1v.d[1];
rho2 += rho2v.d[0] + rho2v.d[1];
rho3 += rho3v.d[0] + rho3v.d[1];
if ( n_left > 0 )
{
for ( i = 0; i < n_left; ++i )
{
x1c = *x1;
y1c = *y1;
w1c = *w1;
z1c = *z1;
rho1 += x1c * z1c;
rho2 += y1c * z1c;
rho3 += w1c * z1c;
x1 += inc_x;
y1 += inc_y;
w1 += inc_w;
z1 += inc_z;
}
}
*rho_xz = *beta * *rho_xz + rho1;
*rho_yz = *beta * *rho_yz + rho2;
*rho_wz = *beta * *rho_wz + rho3;
}