SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
LocalAlignmentStringKernel.cpp
Go to the documentation of this file.
00001 /*
00002  * Compute the local alignment kernel
00003  *
00004  * Largely based on LAkernel.c (version 0.3)
00005  *
00006  * Copyright 2003 Jean-Philippe Vert
00007  * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo
00008  *
00009  * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg
00010  *
00011  * Reference:
00012  * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology
00013  * detection using string alignment kernels", Bioinformatics,
00014  * vol.20, p.1682-1689, 2004.
00015  *
00016  * This program is free software; you can redistribute it and/or modify
00017  * it under the terms of the GNU General Public License as published by
00018  * the Free Software Foundation; either version 3 of the License, or
00019  * (at your option) any later version.
00020  */
00021 
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include <shogun/kernel/string/LocalAlignmentStringKernel.h>
00028 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h>
00029 
00030 using namespace shogun;
00031 
00032 /****************/
00033 /* The alphabet */
00034 /****************/
00035 
00036 const int32_t NAA=20;                                  /* Number of amino-acids */
00037 const int32_t NLET=26;                                 /* Number of letters in the alphabet */
00038 const char* CLocalAlignmentStringKernel::aaList="ARNDCQEGHILKMFPSTWYV";    /* The list of amino acids */
00039 
00040 /*****************/
00041 /* SW parameters */
00042 /*****************/
00043 
00044 /* mutation matrix */
00045 const int32_t CLocalAlignmentStringKernel::blosum[] = {
00046   6,
00047  -2,   8,
00048  -2,  -1,   9,
00049   -3,  -2,   2,   9,
00050   -1,  -5,  -4,  -5,  13,
00051   -1,   1,   0,   0,  -4,   8,
00052   -1,   0,   0,   2,  -5,   3,   7,
00053   0,  -3,  -1,  -2,  -4,  -3,  -3,   8,
00054   -2,   0,   1,  -2,  -4,   1,   0,  -3,  11,
00055   -2,  -5,  -5,  -5,  -2,  -4,  -5,  -6,  -5,   6,
00056   -2,  -3,  -5,  -5,  -2,  -3,  -4,  -5,  -4,   2,   6,
00057   -1,   3,   0,  -1,  -5,   2,   1,  -2,  -1,  -4,  -4,   7,
00058   -1,  -2,  -3,  -5,  -2,  -1,  -3,  -4,  -2,   2,   3,  -2,   8,
00059   -3,  -4,  -5,  -5,  -4,  -5,  -5,  -5,  -2,   0,   1,  -5,   0,   9,
00060   -1,  -3,  -3,  -2,  -4,  -2,  -2,  -3,  -3,  -4,  -4,  -2,  -4,  -5,  11,
00061   2,  -1,   1,   0,  -1,   0,   0,   0,  -1,  -4,  -4,   0,  -2,  -4,  -1,   6,
00062   0,  -2,   0,  -2,  -1,  -1,  -1,  -2,  -3,  -1,  -2,  -1,  -1,  -3,  -2,   2,   7,
00063   -4,  -4,  -6,  -6,  -3,  -3,  -4,  -4,  -4,  -4,  -2,  -4,  -2,   1,  -6,  -4,  -4,  16,
00064   -3,  -3,  -3,  -5,  -4,  -2,  -3,  -5,   3,  -2,  -2,  -3,  -1,   4,  -4,  -3,  -2,   3,  10,
00065   0,  -4,  -4,  -5,  -1,  -3,  -4,  -5,  -5,   4,   1,  -3,   1,  -1,  -4,  -2,   0,  -4,  -2,   6};
00066 
00067 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */
00068 static int32_t BINDEX(int32_t i, int32_t j)
00069 {
00070     return (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2));
00071 }
00072 
00073 /*********************
00074  * Kernel parameters *
00075  *********************/
00076 
00077 const float64_t SCALING=0.1;           /* Factor to scale all SW parameters */
00078 
00079 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */
00080 /* If x=log(a) and y=log(b), compute log(a+b) : */
00081 /*
00082 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y))))
00083 */
00084 
00085 #define LOGP(x,y) LogSum(x,y)
00086 
00087 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */
00088 /*
00089 #define LOGP(x,y) (((x)>(y))?(x):(y))
00090 */
00091 
00092 /* Usefule constants */
00093 const float64_t LOG0=-10000;          /* log(0) */
00094 const float64_t INTSCALE=1000.0;      /* critical for speed and precise computation*/
00095 
00096 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00097 
00098 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size)
00099 : CStringKernel<char>(size)
00100 {
00101     init();
00102     init_static_variables();
00103 }
00104 
00105 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00106     CStringFeatures<char>* l, CStringFeatures<char>* r,
00107     float64_t opening, float64_t extension)
00108 : CStringKernel<char>()
00109 {
00110     init();
00111     m_opening=opening;
00112     m_extension=extension;
00113     init_static_variables();
00114     init(l, r);
00115 }
00116 
00117 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00118 {
00119     cleanup();
00120 }
00121 
00122 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00123 {
00124     CStringKernel<char>::init(l, r);
00125     initialized=true;
00126     return init_normalizer();
00127 }
00128 
00129 void CLocalAlignmentStringKernel::cleanup()
00130 {
00131     SG_FREE(scaled_blosum);
00132     scaled_blosum=NULL;
00133 
00134     SG_FREE(isAA);
00135     isAA=NULL;
00136     SG_FREE(aaIndex);
00137     aaIndex=NULL;
00138 
00139     CKernel::cleanup();
00140 }
00141 
00142 /* LogSum - default log funciotion. fast, but not exact */
00143 /* LogSum2 - precise, but slow. Note that these two functions need different figure types  */
00144 
00145 void CLocalAlignmentStringKernel::init_logsum(){
00146     int32_t i;
00147     for (i=0; i<LOGSUM_TBL; i++)
00148         logsum_lookup[i]=int32_t(INTSCALE*
00149                 (log(1.+exp((float32_t)-i/INTSCALE))));
00150 }
00151 
00152 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
00153 {
00154     int32_t diff;
00155     static int32_t firsttime=1;
00156 
00157     if (firsttime)
00158     {
00159         init_logsum();
00160         firsttime =0;
00161     }
00162 
00163     diff=p1-p2;
00164     if (diff>=LOGSUM_TBL)
00165         return p1;
00166     else if (diff<=-LOGSUM_TBL)
00167         return p2;
00168     else if (diff>0)
00169         return p1+logsum_lookup[diff];
00170     else
00171         return p2+logsum_lookup[-diff];
00172 }
00173 
00174 
00175 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
00176 {
00177     if (p1>p2)
00178         return (p1-p2>50.) ? p1 : p1+log(1.+exp(p2-p1));
00179     else
00180         return (p2-p1>50.) ? p2 : p2+log(1.+exp(p1-p2));
00181 }
00182 
00183 
00184 void CLocalAlignmentStringKernel::init_static_variables()
00185      /* Initialize all static variables. This function should be called once before computing the first pair HMM score */
00186 {
00187     register int32_t i;
00188 
00189     /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */
00190     aaIndex = SG_CALLOC(int32_t, NLET);
00191     for (i=0;i<NAA;i++)
00192         aaIndex[aaList[i]-'A']=i;
00193 
00194     /* Initialization of the array which indicates whether a char is an amino-acid */
00195     isAA = SG_CALLOC(int32_t, 256);
00196     for (i=0;i<NAA;i++)
00197         isAA[(int32_t)aaList[i]]=1;
00198 
00199     /* Scale the blossum matrix */
00200     for (i=0 ; i<NAA*(NAA+1)/2; i++)
00201         scaled_blosum[i]=(int32_t)floor(blosum[i]*SCALING*INTSCALE);
00202 
00203 
00204     /* Scale of gap penalties */
00205     m_opening=(int32_t)floor(m_opening*SCALING*INTSCALE);
00206     m_extension=(int32_t)floor(m_extension*SCALING*INTSCALE);
00207 }
00208 
00209 
00210 
00211 /* Implementation of the
00212  * convolution kernel which generalizes the Smith-Waterman algorithm
00213  */
00214 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
00215     int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */
00216     int32_t nX, int32_t nY /* the lengths of both sequences */)
00217 {
00218     register int32_t
00219     i,j,                /* loop indexes */
00220     cur, old,           /* to indicate the array to use (0 or 1) */
00221     curpos, frompos;    /* position in an array */
00222 
00223     int32_t
00224     *logX,           /* arrays to store the log-values of each state */
00225     *logY,
00226     *logM,
00227     *logX2,
00228     *logY2;
00229 
00230     int32_t aux, aux2;/* , aux3 , aux4 , aux5;*/
00231     int32_t cl;/* length of a column for the dynamic programming */
00232 
00233     /*
00234     printf("now computing pairHMM between %d and %d:\n",nX,nY);
00235     for (i=0;i<nX;printf("%d ",aaX[i++]));
00236     printf("\n and \n");
00237     for (i=0;i<nY;printf("%d ",aaY[i++]));
00238     printf("\n");
00239     */
00240 
00241     /* Initialization of the arrays */
00242     /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */
00243     cl=nY+1;           /* each column stores the positions in the aaY sequence, plus a position at zero */
00244 
00245     logM=SG_CALLOC(int32_t, 2*cl);
00246     logX=SG_CALLOC(int32_t, 2*cl);
00247     logY=SG_CALLOC(int32_t, 2*cl);
00248     logX2=SG_CALLOC(int32_t, 2*cl);
00249     logY2=SG_CALLOC(int32_t, 2*cl);
00250 
00251     /************************************************/
00252     /* First iteration : initialization of column 0 */
00253     /************************************************/
00254     /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */
00255 
00256     for (j=0; j<cl; j++) {
00257         logM[j]=LOG0;
00258         logX[j]=LOG0;
00259         logY[j]=LOG0;
00260         logX2[j]=LOG0;
00261         logY2[j]=LOG0;
00262     }
00263 
00264     /* Update column order */
00265     cur=1;      /* Indexes [0..cl-1] are used to process the next column */
00266     old=0;      /* Indexes [cl..2*cl-1] were used for column 0 */
00267 
00268 
00269     /************************************************/
00270     /* Next iterations : processing columns 1 .. nX */
00271     /************************************************/
00272 
00273     /* Main loop to vary the position in aaX : i=1..nX */
00274     for (i=1; i<=nX; i++) {
00275 
00276         /* Special update for positions (i=1..nX,j=0) */
00277         curpos=cur*cl;                  /* index of the state (i,0) */
00278         logM[curpos]=LOG0;
00279         logX[curpos]=LOG0;
00280         logY[curpos]=LOG0;
00281         logX2[curpos]=LOG0;
00282         logY2[curpos]=LOG0;
00283 
00284         /* Secondary loop to vary the position in aaY : j=1..nY */
00285         for (j=1; j<=nY; j++) {
00286 
00287             curpos=cur*cl+j;            /* index of the state (i,j) */
00288 
00289             /* Update for states which emit X only */
00290             /***************************************/
00291 
00292             frompos=old*cl+j;            /* index of the state (i-1,j) */
00293 
00294             /* State RX */
00295             logX[curpos]=LOGP(-m_opening+logM[frompos], -m_extension+logX[frompos]);
00296             /*      printf("%.5f\n",logX[curpos]);*/
00297             /*      printf("%.5f\n",logX_B[curpos]);*/
00298             /* State RX2 */
00299             logX2[curpos]=LOGP(logM[frompos], logX2[frompos]);
00300 
00301             /* Update for states which emit Y only */
00302             /***************************************/
00303 
00304             frompos=cur*cl+j-1;          /* index of the state (i,j-1) */
00305 
00306             /* State RY */
00307             aux=LOGP(-m_opening+logM[frompos], -m_extension+logY[frompos]);
00308             logY[curpos] = LOGP(aux, -m_opening+logX[frompos]);
00309 
00310             /* State RY2 */
00311             aux=LOGP(logM[frompos], logY2[frompos]);
00312             logY2[curpos]=LOGP(aux, logX2[frompos]);
00313 
00314             /* Update for states which emit X and Y */
00315             /****************************************/
00316 
00317             frompos=old*cl+j-1;          /* index of the state (i-1,j-1) */
00318 
00319             aux=LOGP(logX[frompos], logY[frompos]);
00320             aux2=LOGP(0, logM[frompos]);
00321             logM[curpos]=LOGP(aux, aux2)+scaled_blosum[BINDEX(aaX[i-1], aaY[j-1])];
00322 
00323             /*
00324             printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]);
00325             */
00326 
00327         }  /* end of j=1:nY loop */
00328 
00329 
00330         /* Update the culumn order */
00331         cur=1-cur;
00332         old=1-old;
00333 
00334     }  /* end of j=1:nX loop */
00335 
00336 
00337     /* Termination */
00338     /***************/
00339 
00340     curpos=old*cl+nY;                /* index of the state (nX,nY) */
00341     aux=LOGP(logX2[curpos], logY2[curpos]);
00342     aux2=LOGP(0, logM[curpos]);
00343     /*  kernel_value = LOGP( aux , aux2 );*/
00344 
00345     /* Memory release */
00346     SG_FREE(logM);
00347     SG_FREE(logX);
00348     SG_FREE(logY);
00349     SG_FREE(logX2);
00350     SG_FREE(logY2);
00351 
00352     /* Return the logarithm of the kernel */
00353     return (float32_t)LOGP(aux,aux2)/INTSCALE;
00354 }
00355 
00356 /********************/
00357 /* Public functions */
00358 /********************/
00359 
00360 
00361 /* Return the log-probability of two sequences x and y under a pair HMM model */
00362 /* x and y are strings of aminoacid letters, e.g., "AABRS" */
00363 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
00364 {
00365     int32_t *aax, *aay;  /* to convert x and y into sequences of amino-acid indexes */
00366     int32_t lx=0, ly=0;       /* lengths of x and y */
00367     int32_t i, j;
00368 
00369     bool free_x, free_y;
00370     char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
00371     char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
00372     ASSERT(x && y)
00373 
00374     if ( (lx<1) || (ly<1) )
00375         SG_ERROR("empty chain")
00376 
00377     /* Create aax and aay */
00378 
00379     aax = SG_CALLOC(int32_t, lx);
00380     aay = SG_CALLOC(int32_t, ly);
00381 
00382     /* Extract the characters corresponding to aminoacids and keep their indexes */
00383 
00384     j=0;
00385     for (i=0; i<lx; i++)
00386         if (isAA[toupper(x[i])])
00387             aax[j++]=aaIndex[toupper(x[i])-'A'];
00388     lx=j;
00389     j=0;
00390     for (i=0; i<ly; i++)
00391         if (isAA[toupper(y[i])])
00392             aay[j++]=aaIndex[toupper(y[i])-'A'];
00393     ly=j;
00394 
00395 
00396     /* Compute the pair HMM score */
00397     float64_t result=LAkernelcompute(aax, aay, lx, ly);
00398 
00399     /* Release memory */
00400     SG_FREE(aax);
00401     SG_FREE(aay);
00402 
00403     ((CStringFeatures<char>*)lhs)->free_feature_vector(x, idx_x, free_x);
00404     ((CStringFeatures<char>*)rhs)->free_feature_vector(y, idx_y, free_y);
00405 
00406     return result;
00407 }
00408 
00409 void CLocalAlignmentStringKernel::init()
00410 {
00411     set_normalizer(new CSqrtDiagKernelNormalizer());
00412 
00413     initialized=false;
00414     isAA=NULL;
00415     aaIndex=NULL;
00416 
00417     m_opening=10;
00418     m_extension=2;
00419 
00420     scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum));
00421     init_logsum();
00422 
00423     SG_ADD(&initialized, "initialized", "If kernel is initalized.",
00424         MS_NOT_AVAILABLE);
00425     SG_ADD(&m_opening, "opening", "Opening gap opening penalty.", MS_AVAILABLE);
00426     SG_ADD(&m_extension, "extension", "Extension gap extension penalty.",
00427         MS_AVAILABLE);
00428 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation