libflame
revision_anchor
|
Functions | |
void | bl1_sdotsv2 (conj1_t conjxy, int n, float *x, int inc_x, float *y, int inc_y, float *z, int inc_z, float *beta, float *rho_xz, float *rho_yz) |
void | bl1_ddotsv2 (conj1_t conjxy, int n, double *x, int inc_x, double *y, int inc_y, double *z, int inc_z, double *beta, double *rho_xz, double *rho_yz) |
void bl1_ddotsv2 | ( | conj1_t | conjxy, |
int | n, | ||
double * | x, | ||
int | inc_x, | ||
double * | y, | ||
int | inc_y, | ||
double * | z, | ||
int | inc_z, | ||
double * | beta, | ||
double * | rho_xz, | ||
double * | rho_yz | ||
) |
References bl1_abort(), v2df_t::d, and v2df_t::v.
Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Uhu_Yhu_Zhu_opd_var1(), FLA_Fused_UYx_ZVx_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().
{
	/*
	 * Fused double-precision dot products that share the vector z:
	 *
	 *   *rho_xz = (*beta) * (*rho_xz) + sum_{i<n} x[i] * z[i]
	 *   *rho_yz = (*beta) * (*rho_yz) + sum_{i<n} y[i] * z[i]
	 *
	 * Only unit strides are handled: any inc_x, inc_y or inc_z other than
	 * 1 calls bl1_abort(). conjxy is not referenced by this body.
	 *
	 * The main loop consumes four elements per iteration using aligned
	 * 16-byte loads (_mm_load_pd). When z does not start on a 16-byte
	 * boundary, one leading element is handled in scalar code first so
	 * that the vector loop starts on a boundary; x and y must share z's
	 * alignment for this to work, otherwise bl1_abort() is called.
	 * Elements left over after the vector loop are handled in scalar code.
	 *
	 * For n <= 0 no element of x, y or z is read and the outputs are only
	 * scaled by *beta.
	 */
	double* restrict x1;
	double* restrict y1;
	double* restrict z1;
	double           rho1, rho2;
	double           x1c, y1c, z1c;
	int              i;
	int              n_pre;
	int              n_run;
	int              n_left;
	v2df_t           rho1v, rho2v;
	v2df_t           x1v, y1v, z1v;
	v2df_t           x2v, y2v, z2v;

	if ( inc_x != 1 || inc_y != 1 || inc_z != 1 )
		bl1_abort();

	/*
	 * Decide whether one leading element must be peeled off to reach a
	 * 16-byte boundary. Peel only when there is at least one element;
	 * otherwise the peel step below would read x[0], y[0] and z[0] from
	 * empty vectors.
	 */
	n_pre = 0;
	if ( n > 0 && ( unsigned long ) z % 16 != 0 )
	{
		/* z is misaligned, so x and y must be misaligned as well. */
		if ( ( unsigned long ) x % 16 == 0 ||
		     ( unsigned long ) y % 16 == 0 )
			bl1_abort();

		n_pre = 1;
	}

	n_run  = ( n - n_pre ) / 4;
	n_left = ( n - n_pre ) % 4;

	x1 = x;
	y1 = y;
	z1 = z;

	rho1 = 0.0;
	rho2 = 0.0;

	/* Scalar peel of the single leading element, if required. */
	if ( n_pre == 1 )
	{
		x1c = *x1;
		y1c = *y1;
		z1c = *z1;

		rho1 += x1c * z1c;
		rho2 += y1c * z1c;

		x1 += inc_x;
		y1 += inc_y;
		z1 += inc_z;
	}

	/*
	 * _mm_load_pd requires 16-byte aligned addresses. The check above only
	 * covers a misaligned z; this one also rejects an aligned z paired
	 * with a misaligned x or y (and any pointer that is still misaligned
	 * after peeling). It is applied only when the vector loop will
	 * actually execute, so short inputs handled entirely in scalar code
	 * are unaffected.
	 */
	if ( n_run > 0 &&
	     ( ( unsigned long ) x1 % 16 != 0 ||
	       ( unsigned long ) y1 % 16 != 0 ||
	       ( unsigned long ) z1 % 16 != 0 ) )
		bl1_abort();

	rho1v.v = _mm_setzero_pd();
	rho2v.v = _mm_setzero_pd();

	/* Vector loop: two 2-wide loads per operand, four elements per pass. */
	for ( i = 0; i < n_run; ++i )
	{
		x1v.v = _mm_load_pd( x1 );
		y1v.v = _mm_load_pd( y1 );
		z1v.v = _mm_load_pd( z1 );

		x2v.v = _mm_load_pd( x1 + 2 );
		y2v.v = _mm_load_pd( y1 + 2 );
		z2v.v = _mm_load_pd( z1 + 2 );

		rho1v.v += x1v.v * z1v.v;
		rho2v.v += y1v.v * z1v.v;

		rho1v.v += x2v.v * z2v.v;
		rho2v.v += y2v.v * z2v.v;

		x1 += 4;
		y1 += 4;
		z1 += 4;
	}

	/* Fold the two lanes of each vector accumulator into the scalars. */
	rho1 += rho1v.d[0] + rho1v.d[1];
	rho2 += rho2v.d[0] + rho2v.d[1];

	/* Scalar cleanup of the up-to-three trailing elements. */
	for ( i = 0; i < n_left; ++i )
	{
		x1c = *x1;
		y1c = *y1;
		z1c = *z1;

		rho1 += x1c * z1c;
		rho2 += y1c * z1c;

		x1 += inc_x;
		y1 += inc_y;
		z1 += inc_z;
	}

	/* Scale the incoming values by beta and add the new contributions. */
	*rho_xz = *beta * *rho_xz + rho1;
	*rho_yz = *beta * *rho_yz + rho2;
}
void bl1_sdotsv2 | ( | conj1_t | conjxy, |
int | n, | ||
float * | x, | ||
int | inc_x, | ||
float * | y, | ||
int | inc_y, | ||
float * | z, | ||
int | inc_z, | ||
float * | beta, | ||
float * | rho_xz, | ||
float * | rho_yz | ||
) |
References bl1_abort().
{
	/*
	 * The single-precision variant performs no computation here: it
	 * unconditionally calls bl1_abort() and touches none of its arguments.
	 */
	bl1_abort();
}