SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Sebastian J. Schultheiss and Soeren Sonnenburg 00008 * Copyright (C) 2009 Max-Planck-Society 00009 */ 00010 00011 #include <shogun/lib/common.h> 00012 #include <shogun/kernel/string/RegulatoryModulesStringKernel.h> 00013 #include <shogun/features/Features.h> 00014 #include <shogun/io/SGIO.h> 00015 00016 using namespace shogun; 00017 00018 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel() 00019 : CStringKernel<char>(0) 00020 { 00021 init(); 00022 } 00023 00024 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel( 00025 int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl) 00026 : CStringKernel<char>(size) 00027 { 00028 init(); 00029 } 00030 00031 CRegulatoryModulesStringKernel::CRegulatoryModulesStringKernel(CStringFeatures<char>* lstr, CStringFeatures<char>* rstr, 00032 CDenseFeatures<uint16_t>* lpos, CDenseFeatures<uint16_t>* rpos, 00033 float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size) 00034 : CStringKernel<char>(size) 00035 { 00036 init(); 00037 set_motif_positions(lpos, rpos); 00038 init(lstr,rstr); 00039 } 00040 00041 CRegulatoryModulesStringKernel::~CRegulatoryModulesStringKernel() 00042 { 00043 SG_UNREF(motif_positions_lhs); 00044 SG_UNREF(motif_positions_rhs); 00045 } 00046 00047 void CRegulatoryModulesStringKernel::init() 00048 { 00049 width=0; 00050 degree=0; 00051 shift=0; 00052 window=0; 00053 motif_positions_lhs=NULL; 00054 motif_positions_rhs=NULL; 00055 00056 SG_ADD(&width, "width", "the width of Gaussian kernel part", MS_AVAILABLE); 00057 SG_ADD(°ree, "degree", "the degree of weighted degree kernel part", 00058 MS_AVAILABLE); 00059 SG_ADD(&shift, "shift", 00060 "the shift of weighted degree with shifts kernel part", MS_AVAILABLE); 00061 SG_ADD(&window, "window", "the size of window around motifs", MS_AVAILABLE); 00062 SG_ADD((CSGObject**)&motif_positions_lhs, "motif_positions_lhs", 00063 "the matrix of motif positions from sequences left-hand side", MS_NOT_AVAILABLE); 00064 SG_ADD((CSGObject**)&motif_positions_rhs, "motif_positions_rhs", 00065 "the matrix of motif positions from sequences right-hand side", MS_NOT_AVAILABLE); 00066 SG_ADD(&position_weights, "position_weights", "scaling weights in window", MS_NOT_AVAILABLE); 00067 SG_ADD(&weights, "weights", "weights of WD kernel", MS_NOT_AVAILABLE); 00068 } 00069 00070 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r) 00071 { 00072 ASSERT(motif_positions_lhs) 00073 ASSERT(motif_positions_rhs) 00074 00075 if (l->get_num_vectors() != motif_positions_lhs->get_num_vectors()) 00076 SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n", 00077 l->get_num_vectors(), motif_positions_lhs->get_num_vectors()); 00078 if (r->get_num_vectors() != motif_positions_rhs->get_num_vectors()) 00079 SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n", 00080 r->get_num_vectors(), motif_positions_rhs->get_num_vectors()); 00081 00082 set_wd_weights(); 00083 CStringKernel<char>::init(l, r); 00084 return init_normalizer(); 00085 } 00086 00087 void CRegulatoryModulesStringKernel::set_motif_positions( 00088 CDenseFeatures<uint16_t>* positions_lhs, CDenseFeatures<uint16_t>* positions_rhs) 00089 { 00090 ASSERT(positions_lhs) 00091 ASSERT(positions_rhs) 00092 SG_UNREF(motif_positions_lhs); 00093 SG_UNREF(motif_positions_rhs); 00094 if (positions_lhs->get_num_features() != positions_rhs->get_num_features()) 00095 SG_ERROR("Number of dimensions does not agree.\n") 00096 00097 motif_positions_lhs=positions_lhs; 00098 motif_positions_rhs=positions_rhs; 00099 SG_REF(positions_lhs); 00100 SG_REF(positions_rhs); 00101 } 00102 00103 float64_t CRegulatoryModulesStringKernel::compute(int32_t idx_a, int32_t idx_b) 00104 { 00105 ASSERT(motif_positions_lhs) 00106 ASSERT(motif_positions_rhs) 00107 00108 bool free_avec, free_bvec; 00109 int32_t alen=0; 00110 int32_t blen=0; 00111 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00112 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00113 00114 int32_t alen_pos, blen_pos; 00115 bool afree_pos, bfree_pos; 00116 uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos); 00117 uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos); 00118 ASSERT(alen_pos==blen_pos) 00119 int32_t num_pos=alen_pos; 00120 00121 00122 float64_t result_rbf=0; 00123 float64_t result_wds=0; 00124 00125 for (int32_t p=0; p<num_pos; p++) 00126 { 00127 result_rbf+=CMath::sq(positions_a[p]-positions_b[p]); 00128 00129 for (int32_t p2=0; p2<num_pos; p2++) //p+1 and below * 2 00130 result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) ); 00131 00132 int32_t limit = window; 00133 if (window + positions_a[p] > alen) 00134 limit = alen - positions_a[p]; 00135 00136 if (window + positions_b[p] > blen) 00137 limit = CMath::min(limit, blen - positions_b[p]); 00138 00139 result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]], 00140 limit); 00141 } 00142 00143 float64_t result=exp(-result_rbf/width)+result_wds; 00144 00145 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00146 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00147 ((CDenseFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos); 00148 ((CDenseFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos); 00149 00150 return result; 00151 } 00152 00153 float64_t CRegulatoryModulesStringKernel::compute_wds( 00154 char* avec, char* bvec, int32_t len) 00155 { 00156 float64_t* max_shift_vec = SG_MALLOC(float64_t, shift); 00157 float64_t sum0=0 ; 00158 for (int32_t i=0; i<shift; i++) 00159 max_shift_vec[i]=0 ; 00160 00161 // no shift 00162 for (int32_t i=0; i<len; i++) 00163 { 00164 if ((position_weights.vector!=NULL) && (position_weights[i]==0.0)) 00165 continue ; 00166 00167 float64_t sumi = 0.0 ; 00168 for (int32_t j=0; (j<degree) && (i+j<len); j++) 00169 { 00170 if (avec[i+j]!=bvec[i+j]) 00171 break ; 00172 sumi += weights[j]; 00173 } 00174 if (position_weights.vector!=NULL) 00175 sum0 += position_weights[i]*sumi ; 00176 else 00177 sum0 += sumi ; 00178 } ; 00179 00180 for (int32_t i=0; i<len; i++) 00181 { 00182 for (int32_t k=1; (k<=shift) && (i+k<len); k++) 00183 { 00184 if ((position_weights.vector!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0)) 00185 continue ; 00186 00187 float64_t sumi1 = 0.0 ; 00188 // shift in sequence a 00189 for (int32_t j=0; (j<degree) && (i+j+k<len); j++) 00190 { 00191 if (avec[i+j+k]!=bvec[i+j]) 00192 break ; 00193 sumi1 += weights[j]; 00194 } 00195 float64_t sumi2 = 0.0 ; 00196 // shift in sequence b 00197 for (int32_t j=0; (j<degree) && (i+j+k<len); j++) 00198 { 00199 if (avec[i+j]!=bvec[i+j+k]) 00200 break ; 00201 sumi2 += weights[j]; 00202 } 00203 if (position_weights.vector!=NULL) 00204 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ; 00205 else 00206 max_shift_vec[k-1] += sumi1 + sumi2 ; 00207 } ; 00208 } 00209 00210 float64_t result = sum0 ; 00211 for (int32_t i=0; i<shift; i++) 00212 result += max_shift_vec[i]/(2*(i+1)) ; 00213 00214 SG_FREE(max_shift_vec); 00215 return result ; 00216 } 00217 00218 void CRegulatoryModulesStringKernel::set_wd_weights() 00219 { 00220 ASSERT(degree>0) 00221 00222 weights=SGVector<float64_t>(degree); 00223 00224 int32_t i; 00225 float64_t sum=0; 00226 for (i=0; i<degree; i++) 00227 { 00228 weights[i]=degree-i; 00229 sum+=weights[i]; 00230 } 00231 00232 for (i=0; i<degree; i++) 00233 weights[i]/=sum; 00234 } 00235