SHOGUN
v3.2.0
|
00001 /* 00002 * Compute the local alignment kernel 00003 * 00004 * Largely based on LAkernel.c (version 0.3) 00005 * 00006 * Copyright 2003 Jean-Philippe Vert 00007 * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo 00008 * 00009 * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg 00010 * 00011 * Reference: 00012 * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology 00013 * detection using string alignment kernels", Bioinformatics, 00014 * vol.20, p.1682-1689, 2004. 00015 * 00016 * This program is free software; you can redistribute it and/or modify 00017 * it under the terms of the GNU General Public License as published by 00018 * the Free Software Foundation; either version 3 of the License, or 00019 * (at your option) any later version. 00020 */ 00021 00022 #include <stdlib.h> 00023 #include <stdio.h> 00024 #include <math.h> 00025 #include <ctype.h> 00026 #include <string.h> 00027 #include <shogun/kernel/string/LocalAlignmentStringKernel.h> 00028 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h> 00029 00030 using namespace shogun; 00031 00032 /****************/ 00033 /* The alphabet */ 00034 /****************/ 00035 00036 const int32_t NAA=20; /* Number of amino-acids */ 00037 const int32_t NLET=26; /* Number of letters in the alphabet */ 00038 const char* CLocalAlignmentStringKernel::aaList="ARNDCQEGHILKMFPSTWYV"; /* The list of amino acids */ 00039 00040 /*****************/ 00041 /* SW parameters */ 00042 /*****************/ 00043 00044 /* mutation matrix */ 00045 const int32_t CLocalAlignmentStringKernel::blosum[] = { 00046 6, 00047 -2, 8, 00048 -2, -1, 9, 00049 -3, -2, 2, 9, 00050 -1, -5, -4, -5, 13, 00051 -1, 1, 0, 0, -4, 8, 00052 -1, 0, 0, 2, -5, 3, 7, 00053 0, -3, -1, -2, -4, -3, -3, 8, 00054 -2, 0, 1, -2, -4, 1, 0, -3, 11, 00055 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6, 00056 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6, 00057 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7, 00058 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8, 00059 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9, 00060 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11, 00061 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6, 00062 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7, 00063 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16, 00064 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10, 00065 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6}; 00066 00067 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */ 00068 static int32_t BINDEX(int32_t i, int32_t j) 00069 { 00070 return (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2)); 00071 } 00072 00073 /********************* 00074 * Kernel parameters * 00075 *********************/ 00076 00077 const float64_t SCALING=0.1; /* Factor to scale all SW parameters */ 00078 00079 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */ 00080 /* If x=log(a) and y=log(b), compute log(a+b) : */ 00081 /* 00082 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y)))) 00083 */ 00084 00085 #define LOGP(x,y) LogSum(x,y) 00086 00087 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */ 00088 /* 00089 #define LOGP(x,y) (((x)>(y))?(x):(y)) 00090 */ 00091 00092 /* Usefule constants */ 00093 const float64_t LOG0=-10000; /* log(0) */ 00094 const float64_t INTSCALE=1000.0; /* critical for speed and precise computation*/ 00095 00096 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL]; 00097 00098 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size) 00099 : CStringKernel<char>(size) 00100 { 00101 init(); 00102 init_static_variables(); 00103 } 00104 00105 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel( 00106 CStringFeatures<char>* l, CStringFeatures<char>* r, 00107 float64_t opening, float64_t extension) 00108 : CStringKernel<char>() 00109 { 00110 init(); 00111 m_opening=opening; 00112 m_extension=extension; 00113 init_static_variables(); 00114 init(l, r); 00115 } 00116 00117 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel() 00118 { 00119 cleanup(); 00120 } 00121 00122 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r) 00123 { 00124 CStringKernel<char>::init(l, r); 00125 initialized=true; 00126 return init_normalizer(); 00127 } 00128 00129 void CLocalAlignmentStringKernel::cleanup() 00130 { 00131 SG_FREE(scaled_blosum); 00132 scaled_blosum=NULL; 00133 00134 SG_FREE(isAA); 00135 isAA=NULL; 00136 SG_FREE(aaIndex); 00137 aaIndex=NULL; 00138 00139 CKernel::cleanup(); 00140 } 00141 00142 /* LogSum - default log funciotion. fast, but not exact */ 00143 /* LogSum2 - precise, but slow. Note that these two functions need different figure types */ 00144 00145 void CLocalAlignmentStringKernel::init_logsum(){ 00146 int32_t i; 00147 for (i=0; i<LOGSUM_TBL; i++) 00148 logsum_lookup[i]=int32_t(INTSCALE* 00149 (log(1.+exp((float32_t)-i/INTSCALE)))); 00150 } 00151 00152 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2) 00153 { 00154 int32_t diff; 00155 static int32_t firsttime=1; 00156 00157 if (firsttime) 00158 { 00159 init_logsum(); 00160 firsttime =0; 00161 } 00162 00163 diff=p1-p2; 00164 if (diff>=LOGSUM_TBL) 00165 return p1; 00166 else if (diff<=-LOGSUM_TBL) 00167 return p2; 00168 else if (diff>0) 00169 return p1+logsum_lookup[diff]; 00170 else 00171 return p2+logsum_lookup[-diff]; 00172 } 00173 00174 00175 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2) 00176 { 00177 if (p1>p2) 00178 return (p1-p2>50.) ? p1 : p1+log(1.+exp(p2-p1)); 00179 else 00180 return (p2-p1>50.) ? p2 : p2+log(1.+exp(p1-p2)); 00181 } 00182 00183 00184 void CLocalAlignmentStringKernel::init_static_variables() 00185 /* Initialize all static variables. This function should be called once before computing the first pair HMM score */ 00186 { 00187 register int32_t i; 00188 00189 /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */ 00190 aaIndex = SG_CALLOC(int32_t, NLET); 00191 for (i=0;i<NAA;i++) 00192 aaIndex[aaList[i]-'A']=i; 00193 00194 /* Initialization of the array which indicates whether a char is an amino-acid */ 00195 isAA = SG_CALLOC(int32_t, 256); 00196 for (i=0;i<NAA;i++) 00197 isAA[(int32_t)aaList[i]]=1; 00198 00199 /* Scale the blossum matrix */ 00200 for (i=0 ; i<NAA*(NAA+1)/2; i++) 00201 scaled_blosum[i]=(int32_t)floor(blosum[i]*SCALING*INTSCALE); 00202 00203 00204 /* Scale of gap penalties */ 00205 m_opening=(int32_t)floor(m_opening*SCALING*INTSCALE); 00206 m_extension=(int32_t)floor(m_extension*SCALING*INTSCALE); 00207 } 00208 00209 00210 00211 /* Implementation of the 00212 * convolution kernel which generalizes the Smith-Waterman algorithm 00213 */ 00214 float64_t CLocalAlignmentStringKernel::LAkernelcompute( 00215 int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */ 00216 int32_t nX, int32_t nY /* the lengths of both sequences */) 00217 { 00218 register int32_t 00219 i,j, /* loop indexes */ 00220 cur, old, /* to indicate the array to use (0 or 1) */ 00221 curpos, frompos; /* position in an array */ 00222 00223 int32_t 00224 *logX, /* arrays to store the log-values of each state */ 00225 *logY, 00226 *logM, 00227 *logX2, 00228 *logY2; 00229 00230 int32_t aux, aux2;/* , aux3 , aux4 , aux5;*/ 00231 int32_t cl;/* length of a column for the dynamic programming */ 00232 00233 /* 00234 printf("now computing pairHMM between %d and %d:\n",nX,nY); 00235 for (i=0;i<nX;printf("%d ",aaX[i++])); 00236 printf("\n and \n"); 00237 for (i=0;i<nY;printf("%d ",aaY[i++])); 00238 printf("\n"); 00239 */ 00240 00241 /* Initialization of the arrays */ 00242 /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */ 00243 cl=nY+1; /* each column stores the positions in the aaY sequence, plus a position at zero */ 00244 00245 logM=SG_CALLOC(int32_t, 2*cl); 00246 logX=SG_CALLOC(int32_t, 2*cl); 00247 logY=SG_CALLOC(int32_t, 2*cl); 00248 logX2=SG_CALLOC(int32_t, 2*cl); 00249 logY2=SG_CALLOC(int32_t, 2*cl); 00250 00251 /************************************************/ 00252 /* First iteration : initialization of column 0 */ 00253 /************************************************/ 00254 /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */ 00255 00256 for (j=0; j<cl; j++) { 00257 logM[j]=LOG0; 00258 logX[j]=LOG0; 00259 logY[j]=LOG0; 00260 logX2[j]=LOG0; 00261 logY2[j]=LOG0; 00262 } 00263 00264 /* Update column order */ 00265 cur=1; /* Indexes [0..cl-1] are used to process the next column */ 00266 old=0; /* Indexes [cl..2*cl-1] were used for column 0 */ 00267 00268 00269 /************************************************/ 00270 /* Next iterations : processing columns 1 .. nX */ 00271 /************************************************/ 00272 00273 /* Main loop to vary the position in aaX : i=1..nX */ 00274 for (i=1; i<=nX; i++) { 00275 00276 /* Special update for positions (i=1..nX,j=0) */ 00277 curpos=cur*cl; /* index of the state (i,0) */ 00278 logM[curpos]=LOG0; 00279 logX[curpos]=LOG0; 00280 logY[curpos]=LOG0; 00281 logX2[curpos]=LOG0; 00282 logY2[curpos]=LOG0; 00283 00284 /* Secondary loop to vary the position in aaY : j=1..nY */ 00285 for (j=1; j<=nY; j++) { 00286 00287 curpos=cur*cl+j; /* index of the state (i,j) */ 00288 00289 /* Update for states which emit X only */ 00290 /***************************************/ 00291 00292 frompos=old*cl+j; /* index of the state (i-1,j) */ 00293 00294 /* State RX */ 00295 logX[curpos]=LOGP(-m_opening+logM[frompos], -m_extension+logX[frompos]); 00296 /* printf("%.5f\n",logX[curpos]);*/ 00297 /* printf("%.5f\n",logX_B[curpos]);*/ 00298 /* State RX2 */ 00299 logX2[curpos]=LOGP(logM[frompos], logX2[frompos]); 00300 00301 /* Update for states which emit Y only */ 00302 /***************************************/ 00303 00304 frompos=cur*cl+j-1; /* index of the state (i,j-1) */ 00305 00306 /* State RY */ 00307 aux=LOGP(-m_opening+logM[frompos], -m_extension+logY[frompos]); 00308 logY[curpos] = LOGP(aux, -m_opening+logX[frompos]); 00309 00310 /* State RY2 */ 00311 aux=LOGP(logM[frompos], logY2[frompos]); 00312 logY2[curpos]=LOGP(aux, logX2[frompos]); 00313 00314 /* Update for states which emit X and Y */ 00315 /****************************************/ 00316 00317 frompos=old*cl+j-1; /* index of the state (i-1,j-1) */ 00318 00319 aux=LOGP(logX[frompos], logY[frompos]); 00320 aux2=LOGP(0, logM[frompos]); 00321 logM[curpos]=LOGP(aux, aux2)+scaled_blosum[BINDEX(aaX[i-1], aaY[j-1])]; 00322 00323 /* 00324 printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]); 00325 */ 00326 00327 } /* end of j=1:nY loop */ 00328 00329 00330 /* Update the culumn order */ 00331 cur=1-cur; 00332 old=1-old; 00333 00334 } /* end of j=1:nX loop */ 00335 00336 00337 /* Termination */ 00338 /***************/ 00339 00340 curpos=old*cl+nY; /* index of the state (nX,nY) */ 00341 aux=LOGP(logX2[curpos], logY2[curpos]); 00342 aux2=LOGP(0, logM[curpos]); 00343 /* kernel_value = LOGP( aux , aux2 );*/ 00344 00345 /* Memory release */ 00346 SG_FREE(logM); 00347 SG_FREE(logX); 00348 SG_FREE(logY); 00349 SG_FREE(logX2); 00350 SG_FREE(logY2); 00351 00352 /* Return the logarithm of the kernel */ 00353 return (float32_t)LOGP(aux,aux2)/INTSCALE; 00354 } 00355 00356 /********************/ 00357 /* Public functions */ 00358 /********************/ 00359 00360 00361 /* Return the log-probability of two sequences x and y under a pair HMM model */ 00362 /* x and y are strings of aminoacid letters, e.g., "AABRS" */ 00363 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y) 00364 { 00365 int32_t *aax, *aay; /* to convert x and y into sequences of amino-acid indexes */ 00366 int32_t lx=0, ly=0; /* lengths of x and y */ 00367 int32_t i, j; 00368 00369 bool free_x, free_y; 00370 char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x); 00371 char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y); 00372 ASSERT(x && y) 00373 00374 if ( (lx<1) || (ly<1) ) 00375 SG_ERROR("empty chain") 00376 00377 /* Create aax and aay */ 00378 00379 aax = SG_CALLOC(int32_t, lx); 00380 aay = SG_CALLOC(int32_t, ly); 00381 00382 /* Extract the characters corresponding to aminoacids and keep their indexes */ 00383 00384 j=0; 00385 for (i=0; i<lx; i++) 00386 if (isAA[toupper(x[i])]) 00387 aax[j++]=aaIndex[toupper(x[i])-'A']; 00388 lx=j; 00389 j=0; 00390 for (i=0; i<ly; i++) 00391 if (isAA[toupper(y[i])]) 00392 aay[j++]=aaIndex[toupper(y[i])-'A']; 00393 ly=j; 00394 00395 00396 /* Compute the pair HMM score */ 00397 float64_t result=LAkernelcompute(aax, aay, lx, ly); 00398 00399 /* Release memory */ 00400 SG_FREE(aax); 00401 SG_FREE(aay); 00402 00403 ((CStringFeatures<char>*)lhs)->free_feature_vector(x, idx_x, free_x); 00404 ((CStringFeatures<char>*)rhs)->free_feature_vector(y, idx_y, free_y); 00405 00406 return result; 00407 } 00408 00409 void CLocalAlignmentStringKernel::init() 00410 { 00411 set_normalizer(new CSqrtDiagKernelNormalizer()); 00412 00413 initialized=false; 00414 isAA=NULL; 00415 aaIndex=NULL; 00416 00417 m_opening=10; 00418 m_extension=2; 00419 00420 scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum)); 00421 init_logsum(); 00422 00423 SG_ADD(&initialized, "initialized", "If kernel is initalized.", 00424 MS_NOT_AVAILABLE); 00425 SG_ADD(&m_opening, "opening", "Opening gap opening penalty.", MS_AVAILABLE); 00426 SG_ADD(&m_extension, "extension", "Extension gap extension penalty.", 00427 MS_AVAILABLE); 00428 }