SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
RegulatoryModulesStringKernel.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Sebastian J. Schultheiss and Soeren Sonnenburg
00008  * Copyright (C) 2009 Max-Planck-Society
00009  */
00010 
00011 #include <shogun/lib/common.h>
00012 #include <shogun/kernel/string/RegulatoryModulesStringKernel.h>
00013 #include <shogun/features/Features.h>
00014 #include <shogun/io/SGIO.h>
00015 
00016 using namespace shogun;
00017 
00018 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel()
00019 : CStringKernel<char>(0)
00020 {
00021     init();
00022 }
00023 
00024 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(
00025         int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl)
00026 : CStringKernel<char>(size)
00027 {
00028     init();
00029 }
00030 
00031 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(CStringFeatures<char>* lstr, CStringFeatures<char>* rstr,
00032         CDenseFeatures<uint16_t>* lpos, CDenseFeatures<uint16_t>* rpos,
00033         float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size)
00034 : CStringKernel<char>(size)
00035 {
00036     init();
00037     set_motif_positions(lpos, rpos);
00038     init(lstr,rstr);
00039 }
00040 
00041 CRegulatoryModulesStringKernel::~CRegulatoryModulesStringKernel()
00042 {
00043     SG_UNREF(motif_positions_lhs);
00044     SG_UNREF(motif_positions_rhs);
00045 }
00046 
00047 void CRegulatoryModulesStringKernel::init()
00048 {
00049     width=0;
00050     degree=0;
00051     shift=0;
00052     window=0;
00053     motif_positions_lhs=NULL;
00054     motif_positions_rhs=NULL;
00055 
00056     SG_ADD(&width, "width", "the width of Gaussian kernel part", MS_AVAILABLE);
00057     SG_ADD(&degree, "degree", "the degree of weighted degree kernel part",
00058         MS_AVAILABLE);
00059     SG_ADD(&shift, "shift",
00060         "the shift of weighted degree with shifts kernel part", MS_AVAILABLE);
00061     SG_ADD(&window, "window", "the size of window around motifs", MS_AVAILABLE);
00062     SG_ADD((CSGObject**)&motif_positions_lhs, "motif_positions_lhs",
00063             "the matrix of motif positions from sequences left-hand side", MS_NOT_AVAILABLE);
00064     SG_ADD((CSGObject**)&motif_positions_rhs, "motif_positions_rhs",
00065             "the matrix of motif positions from sequences right-hand side", MS_NOT_AVAILABLE);
00066     SG_ADD(&position_weights, "position_weights", "scaling weights in window", MS_NOT_AVAILABLE);
00067     SG_ADD(&weights, "weights", "weights of WD kernel", MS_NOT_AVAILABLE);
00068 }
00069 
00070 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r)
00071 {
00072     ASSERT(motif_positions_lhs)
00073     ASSERT(motif_positions_rhs)
00074 
00075     if (l->get_num_vectors() != motif_positions_lhs->get_num_vectors())
00076         SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n",
00077                 l->get_num_vectors(),  motif_positions_lhs->get_num_vectors());
00078     if (r->get_num_vectors() != motif_positions_rhs->get_num_vectors())
00079         SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n",
00080                 r->get_num_vectors(), motif_positions_rhs->get_num_vectors());
00081 
00082     set_wd_weights();
00083     CStringKernel<char>::init(l, r);
00084     return init_normalizer();
00085 }
00086 
00087 void CRegulatoryModulesStringKernel::set_motif_positions(
00088         CDenseFeatures<uint16_t>* positions_lhs, CDenseFeatures<uint16_t>* positions_rhs)
00089 {
00090     ASSERT(positions_lhs)
00091     ASSERT(positions_rhs)
00092     SG_UNREF(motif_positions_lhs);
00093     SG_UNREF(motif_positions_rhs);
00094     if (positions_lhs->get_num_features() != positions_rhs->get_num_features())
00095         SG_ERROR("Number of dimensions does not agree.\n")
00096 
00097     motif_positions_lhs=positions_lhs;
00098     motif_positions_rhs=positions_rhs;
00099     SG_REF(positions_lhs);
00100     SG_REF(positions_rhs);
00101 }
00102 
00103 float64_t CRegulatoryModulesStringKernel::compute(int32_t idx_a, int32_t idx_b)
00104 {
00105     ASSERT(motif_positions_lhs)
00106     ASSERT(motif_positions_rhs)
00107 
00108     bool free_avec, free_bvec;
00109     int32_t alen=0;
00110     int32_t blen=0;
00111     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00112     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00113 
00114     int32_t alen_pos, blen_pos;
00115     bool afree_pos, bfree_pos;
00116     uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos);
00117     uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos);
00118     ASSERT(alen_pos==blen_pos)
00119     int32_t num_pos=alen_pos;
00120 
00121 
00122     float64_t result_rbf=0;
00123     float64_t result_wds=0;
00124 
00125     for (int32_t p=0; p<num_pos; p++)
00126     {
00127         result_rbf+=CMath::sq(positions_a[p]-positions_b[p]);
00128 
00129         for (int32_t p2=0; p2<num_pos; p2++) //p+1 and below * 2
00130             result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) );
00131 
00132         int32_t limit = window;
00133         if (window + positions_a[p] > alen)
00134             limit = alen - positions_a[p];
00135 
00136         if (window + positions_b[p] > blen)
00137             limit = CMath::min(limit, blen - positions_b[p]);
00138 
00139         result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]],
00140                 limit);
00141     }
00142 
00143     float64_t result=exp(-result_rbf/width)+result_wds;
00144 
00145     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00146     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00147     ((CDenseFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos);
00148     ((CDenseFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos);
00149 
00150     return result;
00151 }
00152 
00153 float64_t CRegulatoryModulesStringKernel::compute_wds(
00154     char* avec, char* bvec, int32_t len)
00155 {
00156     float64_t* max_shift_vec = SG_MALLOC(float64_t, shift);
00157     float64_t sum0=0 ;
00158     for (int32_t i=0; i<shift; i++)
00159         max_shift_vec[i]=0 ;
00160 
00161     // no shift
00162     for (int32_t i=0; i<len; i++)
00163     {
00164         if ((position_weights.vector!=NULL) && (position_weights[i]==0.0))
00165             continue ;
00166 
00167         float64_t sumi = 0.0 ;
00168         for (int32_t j=0; (j<degree) && (i+j<len); j++)
00169         {
00170             if (avec[i+j]!=bvec[i+j])
00171                 break ;
00172             sumi += weights[j];
00173         }
00174         if (position_weights.vector!=NULL)
00175             sum0 += position_weights[i]*sumi ;
00176         else
00177             sum0 += sumi ;
00178     } ;
00179 
00180     for (int32_t i=0; i<len; i++)
00181     {
00182         for (int32_t k=1; (k<=shift) && (i+k<len); k++)
00183         {
00184             if ((position_weights.vector!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00185                 continue ;
00186 
00187             float64_t sumi1 = 0.0 ;
00188             // shift in sequence a
00189             for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00190             {
00191                 if (avec[i+j+k]!=bvec[i+j])
00192                     break ;
00193                 sumi1 += weights[j];
00194             }
00195             float64_t sumi2 = 0.0 ;
00196             // shift in sequence b
00197             for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
00198             {
00199                 if (avec[i+j]!=bvec[i+j+k])
00200                     break ;
00201                 sumi2 += weights[j];
00202             }
00203             if (position_weights.vector!=NULL)
00204                 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00205             else
00206                 max_shift_vec[k-1] += sumi1 + sumi2 ;
00207         } ;
00208     }
00209 
00210     float64_t result = sum0 ;
00211     for (int32_t i=0; i<shift; i++)
00212         result += max_shift_vec[i]/(2*(i+1)) ;
00213 
00214     SG_FREE(max_shift_vec);
00215     return result ;
00216 }
00217 
00218 void CRegulatoryModulesStringKernel::set_wd_weights()
00219 {
00220     ASSERT(degree>0)
00221 
00222     weights=SGVector<float64_t>(degree);
00223 
00224     int32_t i;
00225     float64_t sum=0;
00226     for (i=0; i<degree; i++)
00227     {
00228         weights[i]=degree-i;
00229         sum+=weights[i];
00230     }
00231 
00232     for (i=0; i<degree; i++)
00233         weights[i]/=sum;
00234 }
00235 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation