libflame  revision_anchor
Functions
bl1_axpyv2bdotaxpy.c File Reference

(r)

Functions

void bl1_saxpyv2bdotaxpy (int n, float *beta, float *u, int inc_u, float *gamma, float *z, int inc_z, float *a, int inc_a, float *x, int inc_x, float *kappa, float *rho, float *w, int inc_w)
void bl1_daxpyv2bdotaxpy (int n, double *beta, double *u, int inc_u, double *gamma, double *z, int inc_z, double *a, int inc_a, double *x, int inc_x, double *kappa, double *rho, double *w, int inc_w)

Function Documentation

void bl1_daxpyv2bdotaxpy ( int  n,
double *  beta,
double *  u,
int  inc_u,
double *  gamma,
double *  z,
int  inc_z,
double *  a,
int  inc_a,
double *  x,
int  inc_x,
double *  kappa,
double *  rho,
double *  w,
int  inc_w 
)

References bl1_abort(), v2df_t::d, and v2df_t::v.

Referenced by FLA_Fused_Gerc2_Ahx_Ax_opd_var1(), and FLA_Fused_Her2_Ax_l_opd_var1().

{
    /*
     * Fused double-precision kernel. For each of the n elements it performs,
     * in this order:
     *
     *     a[i] := a[i] + (*beta) * u[i] + (*gamma) * z[i]
     *     rho  += a[i] * x[i]              (using the updated a[i])
     *     w[i] := w[i] + (*kappa) * a[i]   (using the updated a[i])
     *
     * and finally stores the accumulated dot product in *rho.
     *
     * Parameters:
     *   n                 number of elements to process
     *   beta, gamma       scalars applied to u and z when updating a
     *   u, z, x           input vectors (only read)
     *   a, w              vectors updated in place
     *   kappa             scalar applied to the updated a when updating w
     *   rho               output: overwritten with the sum of a[i] * x[i]
     *   inc_u ... inc_w   element strides of the corresponding vectors
     *
     * Execution paths:
     *   - n <= 0: nothing is touched except *rho, which is set to 0.0.
     *   - any stride != 1: every element is processed by the scalar loop,
     *     which honours the strides.
     *   - all strides == 1: at most one scalar "peel" element, then an SSE
     *     loop handling 4 elements per iteration with aligned loads/stores,
     *     then a scalar tail of up to 3 elements.
     *
     * On the unit-stride path, if a is not 16-byte aligned while any of
     * u, z, x, w is, bl1_abort() is called: peeling a single element cannot
     * bring all five operands to a 16-byte boundary at the same time.
     */
    double*   restrict upsilon1;
    double*   restrict zeta1;
    double*   restrict alpha1;
    double*   restrict chi1;
    double*   restrict omega1;
    double             rho_c;
    int                i;
    v2df_t             b1v, g1v, k1v;
    v2df_t             rhov;
    v2df_t             u1v, z1v, a1v;
    v2df_t             u2v, z2v, a2v;
    v2df_t             x1v, w1v;
    v2df_t             x2v, w2v;

    int       unit_stride;
    int       n_pre;
    int       n_run;
    int       n_left;

    /* Empty input: without this guard a misaligned a would still trigger the
       one-element peel below and touch memory outside the caller's vectors. */
    if ( n <= 0 )
    {
        *rho = 0.0;
        return;
    }

    /* The SSE loop reads and writes contiguous memory and advances every
       pointer by 4, so it is only valid when all five vectors are
       unit-stride. */
    unit_stride = ( inc_u == 1 &&
                    inc_z == 1 &&
                    inc_a == 1 &&
                    inc_x == 1 &&
                    inc_w == 1 );

    n_pre = 0;
    if ( unit_stride && ( unsigned long ) a % 16 != 0 )
    {
        /* a needs one peeled element to reach a 16-byte boundary; the other
           operands must need the same, otherwise they end up misaligned
           after the peel. */
        if ( ( unsigned long ) u % 16 == 0 ||
             ( unsigned long ) z % 16 == 0 ||
             ( unsigned long ) x % 16 == 0 ||
             ( unsigned long ) w % 16 == 0 ) bl1_abort();

        n_pre = 1;
    }
    /* NOTE(review): when a is already 16-byte aligned, the alignment of
       u, z, x and w is not verified before the aligned loads below;
       presumably callers guarantee it -- confirm. */

    if ( unit_stride )
    {
        n_run  = ( n - n_pre ) / 4;
        n_left = ( n - n_pre ) % 4;
    }
    else
    {
        /* Strided input: skip the vector loop and let the stride-aware
           scalar loop at the end process all n elements. */
        n_run  = 0;
        n_left = n;
    }

    upsilon1 = u;
    zeta1    = z;
    alpha1   = a;
    chi1     = x;
    omega1   = w;

    rho_c   = 0.0;

    /* Scalar peel: one element, so the vector loop starts on a 16-byte
       boundary. */
    if ( n_pre == 1 )
    {
        double   beta_c     = *beta;
        double   gamma_c    = *gamma;
        double   kappa_c    = *kappa;

        double   upsilon1_c = *upsilon1;
        double   zeta1_c    = *zeta1;
        double   alpha1_c   = *alpha1;
        double   chi1_c     = *chi1;
        double   omega1_c   = *omega1;

        alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
        rho_c += alpha1_c * chi1_c;
        omega1_c += kappa_c * alpha1_c;

        *alpha1 = alpha1_c;
        *omega1 = omega1_c;

        upsilon1 += inc_u;
        zeta1    += inc_z;
        alpha1   += inc_a;
        chi1     += inc_x;
        omega1   += inc_w;
    }

    /* Broadcast each scalar into both lanes of a vector register. */
    b1v.v = _mm_loaddup_pd( ( double* )beta );
    g1v.v = _mm_loaddup_pd( ( double* )gamma );
    k1v.v = _mm_loaddup_pd( ( double* )kappa );

    /* Two-lane partial sums of the dot product. */
    rhov.v = _mm_setzero_pd();

    /* Vector body: 4 elements per iteration, as two 2-double halves. */
    for ( i = 0; i < n_run; ++i )
    {
        u1v.v = _mm_load_pd( ( double* )upsilon1 );
        z1v.v = _mm_load_pd( ( double* )zeta1 );
        a1v.v = _mm_load_pd( ( double* )alpha1 );

        a1v.v += b1v.v * u1v.v + g1v.v * z1v.v;

        u2v.v = _mm_load_pd( ( double* )(upsilon1 + 2) );
        z2v.v = _mm_load_pd( ( double* )(zeta1 + 2) );
        a2v.v = _mm_load_pd( ( double* )(alpha1 + 2) );

        a2v.v += b1v.v * u2v.v + g1v.v * z2v.v;

        x1v.v = _mm_load_pd( ( double* )chi1 );
        x2v.v = _mm_load_pd( ( double* )(chi1 + 2) );

        w1v.v = _mm_load_pd( ( double* )omega1 );
        w2v.v = _mm_load_pd( ( double* )(omega1 + 2) );

        /* The dot product and the w update both use the updated a. */
        rhov.v += a1v.v * x1v.v;
        rhov.v += a2v.v * x2v.v;

        w1v.v += k1v.v * a1v.v;
        w2v.v += k1v.v * a2v.v;

        _mm_store_pd( ( double* )alpha1, a1v.v );
        _mm_store_pd( ( double* )(alpha1 + 2), a2v.v );

        _mm_store_pd( ( double* )omega1, w1v.v );
        _mm_store_pd( ( double* )(omega1 + 2), w2v.v );

        upsilon1 += 4;
        zeta1    += 4;
        alpha1   += 4;
        chi1     += 4;
        omega1   += 4;
    }

    /* Fold the two vector lanes into the scalar accumulator. */
    rho_c += rhov.d[0] + rhov.d[1];

    /* Scalar tail (unit stride) or the whole vector (strided input). */
    if ( n_left > 0 )
    {
        double beta_c  = *beta;
        double gamma_c = *gamma;
        double kappa_c = *kappa;

        for ( i = 0; i < n_left; ++i )
        {
            double   upsilon1_c = *upsilon1;
            double   zeta1_c    = *zeta1;
            double   alpha1_c   = *alpha1;
            double   chi1_c     = *chi1;
            double   omega1_c   = *omega1;

            alpha1_c += beta_c * upsilon1_c + gamma_c * zeta1_c;
            rho_c += alpha1_c * chi1_c;
            omega1_c += kappa_c * alpha1_c;

            *alpha1 = alpha1_c;
            *omega1 = omega1_c;

            upsilon1 += inc_u;
            zeta1    += inc_z;
            alpha1   += inc_a;
            chi1     += inc_x;
            omega1   += inc_w;
        }
    }

    *rho = rho_c;
}
void bl1_saxpyv2bdotaxpy ( int  n,
float *  beta,
float *  u,
int  inc_u,
float *  gamma,
float *  z,
int  inc_z,
float *  a,
int  inc_a,
float *  x,
int  inc_x,
float *  kappa,
float *  rho,
float *  w,
int  inc_w 
)

References bl1_abort().

{
    /* Single-precision variant: no computation is implemented here. Every
       call goes straight to bl1_abort() and none of the arguments are read. */
    bl1_abort();
}