escript
Revision_
|
00001 00002 /***************************************************************************** 00003 * 00004 * Copyright (c) 2003-2014 by University of Queensland 00005 * http://www.uq.edu.au 00006 * 00007 * Primary Business: Queensland, Australia 00008 * Licensed under the Open Software License version 3.0 00009 * http://www.opensource.org/licenses/osl-3.0.php 00010 * 00011 * Development until 2012 by Earth Systems Science Computational Center (ESSCC) 00012 * Development 2012-2013 by School of Earth Sciences 00013 * Development from 2014 by Centre for Geoscience Computing (GeoComp) 00014 * 00015 *****************************************************************************/ 00016 00017 #ifndef __PASO_BLOCKOPS_H__ 00018 #define __PASO_BLOCKOPS_H__ 00019 00020 #include "Paso.h" 00021 #include <cstring> // memcpy 00022 00023 #ifdef USE_LAPACK 00024 #ifdef MKL_LAPACK 00025 #include <mkl_lapack.h> 00026 #include <mkl_cblas.h> 00027 #else 00028 extern "C" { 00029 #include <clapack.h> 00030 #include <cblas.h> 00031 } 00032 #endif 00033 #endif 00034 00035 namespace paso { 00036 00037 inline void BlockOps_Cpy_N(dim_t N, double* R, const double* V) 00038 { 00039 memcpy((void*)R, (void*)V, N*sizeof(double)); 00040 } 00041 00043 inline void BlockOps_SMV_2(double* R, const double* mat, const double* V) 00044 { 00045 const double S1 = V[0]; 00046 const double S2 = V[1]; 00047 const double A11 = mat[0]; 00048 const double A12 = mat[2]; 00049 const double A21 = mat[1]; 00050 const double A22 = mat[3]; 00051 R[0] -= A11 * S1 + A12 * S2; 00052 R[1] -= A21 * S1 + A22 * S2; 00053 } 00054 00056 inline void BlockOps_SMV_3(double* R, const double* mat, const double* V) 00057 { 00058 const double S1 = V[0]; 00059 const double S2 = V[1]; 00060 const double S3 = V[2]; 00061 const double A11 = mat[0]; 00062 const double A21 = mat[1]; 00063 const double A31 = mat[2]; 00064 const double A12 = mat[3]; 00065 const double A22 = mat[4]; 00066 const double A32 = mat[5]; 00067 const double A13 = mat[6]; 00068 const double A23 = mat[7]; 00069 const double A33 = mat[8]; 00070 R[0] -= A11 * S1 + A12 * S2 + A13 * S3; 00071 R[1] -= A21 * S1 + A22 * S2 + A23 * S3; 00072 R[2] -= A31 * S1 + A32 * S2 + A33 * S3; 00073 } 00074 00075 #define PASO_MISSING_CLAPACK Esys_setError(TYPE_ERROR, "You need to install a LAPACK version to enable operations on block sizes > 3.") 00076 00078 inline void BlockOps_SMV_N(dim_t N, double* R, const double* mat, const double* V) 00079 { 00080 #ifdef USE_LAPACK 00081 cblas_dgemv(CblasColMajor,CblasNoTrans, N, N, -1., mat, N, V, 1, 1., R, 1); 00082 #else 00083 PASO_MISSING_CLAPACK; 00084 #endif 00085 } 00086 00087 inline void BlockOps_MV_N(dim_t N, double* R, const double* mat, const double* V) 00088 { 00089 #ifdef USE_LAPACK 00090 cblas_dgemv(CblasColMajor,CblasNoTrans, N, N, 1., mat, N, V, 1, 0., R, 1); 00091 #else 00092 PASO_MISSING_CLAPACK; 00093 #endif 00094 } 00095 00096 inline void BlockOps_invM_2(double* invA, const double* A, int* failed) 00097 { 00098 const double A11 = A[0]; 00099 const double A12 = A[2]; 00100 const double A21 = A[1]; 00101 const double A22 = A[3]; 00102 double D = A11*A22-A12*A21; 00103 if (std::abs(D) > 0) { 00104 D = 1./D; 00105 invA[0] = A22*D; 00106 invA[1] = -A21*D; 00107 invA[2] = -A12*D; 00108 invA[3] = A11*D; 00109 } else { 00110 *failed = 1; 00111 } 00112 } 00113 00114 inline void BlockOps_invM_3(double* invA, const double* A, int* failed) 00115 { 00116 const double A11 = A[0]; 00117 const double A21 = A[1]; 00118 const double A31 = A[2]; 00119 const double A12 = A[3]; 00120 const double A22 = A[4]; 00121 const double A32 = A[5]; 00122 const double A13 = A[6]; 00123 const double A23 = A[7]; 00124 const double A33 = A[8]; 00125 double D = A11*(A22*A33-A23*A32) + 00126 A12*(A31*A23-A21*A33) + 00127 A13*(A21*A32-A31*A22); 00128 if (std::abs(D) > 0) { 00129 D = 1./D; 00130 invA[0] = (A22*A33-A23*A32)*D; 00131 invA[1] = (A31*A23-A21*A33)*D; 00132 invA[2] = (A21*A32-A31*A22)*D; 00133 invA[3] = (A13*A32-A12*A33)*D; 00134 invA[4] = (A11*A33-A31*A13)*D; 00135 invA[5] = (A12*A31-A11*A32)*D; 00136 invA[6] = (A12*A23-A13*A22)*D; 00137 invA[7] = (A13*A21-A11*A23)*D; 00138 invA[8] = (A11*A22-A12*A21)*D; 00139 } else { 00140 *failed = 1; 00141 } 00142 } 00143 00145 inline void BlockOps_invM_N(dim_t N, double* mat, int* pivot, int* failed) 00146 { 00147 #ifdef USE_LAPACK 00148 #ifdef MKL_LAPACK 00149 int res = 0; 00150 dgetrf(&N, &N, mat, &N, pivot, &res); 00151 if (res != 0) 00152 *failed = 1; 00153 #else 00154 int res = clapack_dgetrf(CblasColMajor, N, N, mat, N, pivot); 00155 if (res != 0) 00156 *failed = 1; 00157 #endif // MKL_LAPACK 00158 #else 00159 PASO_MISSING_CLAPACK; 00160 #endif 00161 } 00162 00164 inline void BlockOps_solve_N(dim_t N, double* X, double* mat, int* pivot, int* failed) 00165 { 00166 #ifdef USE_LAPACK 00167 #ifdef MKL_LAPACK 00168 int res = 0; 00169 int ONE = 1; 00170 dgetrs("N", &N, &ONE, mat, &N, pivot, X, &N, &res); 00171 if (res != 0) 00172 *failed = 1; 00173 #else 00174 int res = clapack_dgetrs(CblasColMajor, CblasNoTrans, N, 1, mat, N, pivot, X, N); 00175 if (res != 0) 00176 *failed = 1; 00177 #endif // MKL_LAPACK 00178 #else 00179 PASO_MISSING_CLAPACK; 00180 #endif 00181 } 00182 00184 inline void BlockOps_MViP_2(const double* mat, double* V) 00185 { 00186 const double S1 = V[0]; 00187 const double S2 = V[1]; 00188 const double A11 = mat[0]; 00189 const double A12 = mat[2]; 00190 const double A21 = mat[1]; 00191 const double A22 = mat[3]; 00192 V[0] = A11 * S1 + A12 * S2; 00193 V[1] = A21 * S1 + A22 * S2; 00194 } 00195 00197 inline void BlockOps_MViP_3(const double* mat, double* V) 00198 { 00199 const double S1 = V[0]; 00200 const double S2 = V[1]; 00201 const double S3 = V[2]; 00202 const double A11 = mat[0]; 00203 const double A21 = mat[1]; 00204 const double A31 = mat[2]; 00205 const double A12 = mat[3]; 00206 const double A22 = mat[4]; 00207 const double A32 = mat[5]; 00208 const double A13 = mat[6]; 00209 const double A23 = mat[7]; 00210 const double A33 = mat[8]; 00211 V[0] = A11 * S1 + A12 * S2 + A13 * S3; 00212 V[1] = A21 * S1 + A22 * S2 + A23 * S3; 00213 V[2] = A31 * S1 + A32 * S2 + A33 * S3; 00214 } 00215 00216 inline void BlockOps_solveAll(dim_t n_block, dim_t n, double* D, 00217 index_t* pivot, double* x) 00218 { 00219 if (n_block == 1) { 00220 #pragma omp parallel for 00221 for (dim_t i=0; i<n; ++i) 00222 x[i] *= D[i]; 00223 } else if (n_block == 2) { 00224 #pragma omp parallel for 00225 for (dim_t i=0; i<n; ++i) 00226 BlockOps_MViP_2(&D[4*i], &x[2*i]); 00227 } else if (n_block == 3) { 00228 #pragma omp parallel for 00229 for (dim_t i=0; i<n; ++i) 00230 BlockOps_MViP_3(&D[9*i], &x[3*i]); 00231 } else { 00232 int failed = 0; 00233 #pragma omp parallel for 00234 for (dim_t i=0; i<n; ++i) { 00235 const dim_t block_size = n_block*n_block; 00236 BlockOps_solve_N(n_block, &x[n_block*i], &D[block_size*i], &pivot[n_block*i], &failed); 00237 } 00238 if (failed > 0) { 00239 Esys_setError(ZERO_DIVISION_ERROR, "BlockOps_solveAll: solution failed."); 00240 } 00241 } 00242 } 00243 00244 } // namespace paso 00245 00246 #endif // __PASO_BLOCKOPS_H__ 00247