libflame
revision_anchor
|
Functions | |
void | bl1_saxpyv2b (int n, float *alpha1, float *alpha2, float *x1, int inc_x1, float *x2, int inc_x2, float *y, int inc_y) |
void | bl1_daxpyv2b (int n, double *alpha1, double *alpha2, double *x1, int inc_x1, double *x2, int inc_x2, double *y, int inc_y) |
void bl1_daxpyv2b | ( | int | n, |
double * | alpha1, | ||
double * | alpha2, | ||
double * | x1, | ||
int | inc_x1, | ||
double * | x2, | ||
int | inc_x2, | ||
double * | y, | ||
int | inc_y | ||
) |
References bl1_abort() and v2df_t::v.
Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().
{
	/*
	 * Fused double-precision update over n elements:
	 *
	 *     y[i] := y[i] + (*alpha1) * x1[i] + (*alpha2) * x2[i]
	 *
	 * Only unit strides are supported: bl1_abort() is called if inc_x1,
	 * inc_x2 or inc_y is not 1. The main loop uses aligned SSE loads and
	 * stores (_mm_load_pd / _mm_store_pd) on two-double vectors, handling
	 * four elements per iteration. If y does not start on a 16-byte
	 * boundary, one leading element is handled in scalar code first; any
	 * trailing (n - n_pre) % 4 elements are handled in scalar code last.
	 *
	 * A non-positive n is a no-op.
	 */
	double* restrict chi1;
	double* restrict chi2;
	double* restrict psi1;
	int              i;
	int              n_pre;
	int              n_run;
	int              n_left;

	v2df_t           a1v, a2v;
	v2df_t           x11v, x12v;
	v2df_t           x21v, x22v;
	v2df_t           y1v;
	v2df_t           y2v;

	if ( inc_x1 != 1 ||
	     inc_x2 != 1 ||
	     inc_y  != 1 ) bl1_abort();

	// Nothing to do for an empty vector. Without this guard the scalar
	// peel step below would read x1[0] / x2[0] and write y[0] whenever y
	// is not 16-byte aligned, even though n says there are no elements.
	if ( n <= 0 ) return;

	n_pre = 0;
	if ( ( unsigned long ) y % 16 != 0 )
	{
		// y is off the 16-byte boundary, so one scalar step is peeled off
		// to advance it. x1 and x2 are advanced by the same step, so they
		// must not already be aligned.
		// NOTE(review): only this direction is checked; the opposite case
		// (y aligned, x1 or x2 not aligned) is not rejected here although
		// the main loop uses aligned loads on all three -- confirm callers
		// always pass vectors with matching alignment.
		if ( ( unsigned long ) x1 % 16 == 0 ||
		     ( unsigned long ) x2 % 16 == 0 )
			bl1_abort();

		n_pre = 1;
	}

	n_run  = ( n - n_pre ) / 4;
	n_left = ( n - n_pre ) % 4;

	chi1 = x1;
	chi2 = x2;
	psi1 = y;

	if ( n_pre == 1 )
	{
		double alpha1_c = *alpha1;
		double alpha2_c = *alpha2;
		double chi11_c  = *chi1;
		double chi12_c  = *chi2;
		double temp1;

		// psi1 = psi1 + alpha1 * chi11 + alpha2 * chi12;
		temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
		*psi1 = *psi1 + temp1;

		chi1 += inc_x1;
		chi2 += inc_x2;
		psi1 += inc_y;
	}

	// Broadcast each scalar into both lanes of a vector.
	a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
	a2v.v = _mm_loaddup_pd( ( double* )alpha2 );

	for ( i = 0; i < n_run; ++i )
	{
		x11v.v = _mm_load_pd( ( double* )chi1 );
		x12v.v = _mm_load_pd( ( double* )chi2 );
		y1v.v  = _mm_load_pd( ( double* )psi1 );

		x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
		x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
		y2v.v  = _mm_load_pd( ( double* )(psi1 + 2) );

		y1v.v += a1v.v * x11v.v + a2v.v * x12v.v;
		y2v.v += a1v.v * x21v.v + a2v.v * x22v.v;

		_mm_store_pd( ( double* )psi1,       y1v.v );
		_mm_store_pd( ( double* )(psi1 + 2), y2v.v );

		// Two 2-wide vectors per iteration => four elements consumed.
		chi1 += 4;
		chi2 += 4;
		psi1 += 4;
	}

	if ( n_left > 0 )
	{
		double alpha1_c = *alpha1;
		double alpha2_c = *alpha2;

		for ( i = 0; i < n_left; ++i )
		{
			double chi11_c = *chi1;
			double chi12_c = *chi2;
			double psi1_c  = *psi1;
			double temp1;

			temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
			*psi1 = psi1_c + temp1;

			chi1 += inc_x1;
			chi2 += inc_x2;
			psi1 += inc_y;
		}
	}
}
void bl1_saxpyv2b | ( | int | n, |
float * | alpha1, | ||
float * | alpha2, | ||
float * | x1, | ||
int | inc_x1, | ||
float * | x2, | ||
int | inc_x2, | ||
float * | y, | ||
int | inc_y | ||
) |
References bl1_abort().
{
	/* Single-precision entry point: no computation is performed here;
	   every call goes straight to bl1_abort(). */
	bl1_abort();
}