SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2008 Christian Igel, Tobias Glasmachers 00008 * Copyright (C) 2008 Christian Igel, Tobias Glasmachers 00009 * 00010 * Shogun adjustments (W) 2008-2009,2013 Soeren Sonnenburg 00011 * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00012 * Copyright (C) 2013 Soeren Sonnenburg 00013 * 00014 */ 00015 #include <shogun/kernel/string/OligoStringKernel.h> 00016 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h> 00017 #include <shogun/features/StringFeatures.h> 00018 00019 #include <map> 00020 #include <vector> 00021 #include <algorithm> 00022 00023 using namespace shogun; 00024 00025 COligoStringKernel::COligoStringKernel() 00026 : CStringKernel<char>() 00027 { 00028 init(); 00029 } 00030 00031 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w) 00032 : CStringKernel<char>(cache_sz) 00033 { 00034 init(); 00035 00036 k=kmer_len; 00037 width=w; 00038 } 00039 00040 COligoStringKernel::COligoStringKernel( 00041 CStringFeatures<char>* l, CStringFeatures<char>* r, 00042 int32_t kmer_len, float64_t w) 00043 : CStringKernel<char>() 00044 { 00045 init(); 00046 00047 k=kmer_len; 00048 width=w; 00049 00050 init(l, r); 00051 } 00052 00053 COligoStringKernel::~COligoStringKernel() 00054 { 00055 cleanup(); 00056 } 00057 00058 void COligoStringKernel::cleanup() 00059 { 00060 gauss_table=SGVector<float64_t>(); 00061 CKernel::cleanup(); 00062 } 00063 00064 bool COligoStringKernel::init(CFeatures* l, CFeatures* r) 00065 { 00066 cleanup(); 00067 00068 CStringKernel<char>::init(l,r); 00069 int32_t max_len=CMath::max( 00070 ((CStringFeatures<char>*) l)->get_max_vector_length(), 00071 ((CStringFeatures<char>*) r)->get_max_vector_length() 00072 ); 00073 00074 REQUIRE(k>0, "k must be >0\n") 00075 REQUIRE(width>0, "width must be >0\n") 00076 00077 getExpFunctionCache(max_len); 00078 return init_normalizer(); 00079 } 00080 00081 void COligoStringKernel::encodeOligo( 00082 const std::string& sequence, uint32_t k_mer_length, 00083 const std::string& allowed_characters, 00084 std::vector< std::pair<int32_t, float64_t> >& values) 00085 { 00086 float64_t oligo_value = 0.; 00087 float64_t factor = 1.; 00088 std::map<std::string::value_type, uint32_t> residue_values; 00089 uint32_t counter = 0; 00090 uint32_t number_of_residues = allowed_characters.size(); 00091 uint32_t sequence_length = sequence.size(); 00092 bool sequence_ok = true; 00093 00094 // checking if sequence contains illegal characters 00095 for (uint32_t i = 0; i < sequence.size(); ++i) 00096 { 00097 if (allowed_characters.find(sequence.at(i)) == std::string::npos) 00098 sequence_ok = false; 00099 } 00100 00101 if (sequence_ok && k_mer_length <= sequence_length) 00102 { 00103 values.resize(sequence_length - k_mer_length + 1, 00104 std::pair<int32_t, float64_t>()); 00105 for (uint32_t i = 0; i < number_of_residues; ++i) 00106 { 00107 residue_values.insert(std::make_pair(allowed_characters[i], counter)); 00108 ++counter; 00109 } 00110 for (int32_t k = k_mer_length - 1; k >= 0; k--) 00111 { 00112 oligo_value += factor * residue_values[sequence[k]]; 00113 factor *= number_of_residues; 00114 } 00115 factor /= number_of_residues; 00116 counter = 0; 00117 values[counter].first = 1; 00118 values[counter].second = oligo_value; 00119 ++counter; 00120 00121 for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++) 00122 { 00123 oligo_value -= factor * residue_values[sequence[j - 1]]; 00124 oligo_value = oligo_value * number_of_residues + 00125 residue_values[sequence[j + k_mer_length - 1]]; 00126 00127 values[counter].first = j + 1; 00128 values[counter].second = oligo_value ; 00129 ++counter; 00130 } 00131 stable_sort(values.begin(), values.end(), cmpOligos_); 00132 } 00133 else 00134 { 00135 values.clear(); 00136 } 00137 } 00138 00139 void COligoStringKernel::getSequences( 00140 const std::vector<std::string>& sequences, uint32_t k_mer_length, 00141 const std::string& allowed_characters, 00142 std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences) 00143 { 00144 std::vector< std::pair<int32_t, float64_t> > temp_vector; 00145 encoded_sequences.resize(sequences.size(), 00146 std::vector< std::pair<int32_t, float64_t> >()); 00147 00148 for (uint32_t i = 0; i < sequences.size(); ++i) 00149 { 00150 encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector); 00151 encoded_sequences[i] = temp_vector; 00152 } 00153 } 00154 00155 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length) 00156 { 00157 gauss_table=SGVector<float64_t>(sequence_length); 00158 00159 gauss_table[0] = 1; 00160 for (uint32_t i = 1; i < sequence_length; i++) 00161 gauss_table[i] = exp(-CMath::sq((float64_t) i) / width); 00162 } 00163 00164 float64_t COligoStringKernel::kernelOligoFast( 00165 const std::vector< std::pair<int32_t, float64_t> >& x, 00166 const std::vector< std::pair<int32_t, float64_t> >& y, 00167 int32_t max_distance) 00168 { 00169 float64_t result = 0; 00170 int32_t i1 = 0; 00171 int32_t i2 = 0; 00172 int32_t c1 = 0; 00173 uint32_t x_size = x.size(); 00174 uint32_t y_size = y.size(); 00175 00176 while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size) 00177 { 00178 if (x[i1].second == y[i2].second) 00179 { 00180 if (max_distance < 0 00181 || (abs(x[i1].first - y[i2].first)) <= max_distance) 00182 { 00183 result += gauss_table[abs((x[i1].first - y[i2].first))]; 00184 if (x[i1].second == x[i1 + 1].second) 00185 { 00186 i1++; 00187 c1++; 00188 } 00189 else if (y[i2].second == y[i2 + 1].second) 00190 { 00191 i2++; 00192 i1 -= c1; 00193 c1 = 0; 00194 } 00195 else 00196 { 00197 i1++; 00198 i2++; 00199 } 00200 } 00201 else 00202 { 00203 if (x[i1].first < y[i2].first) 00204 { 00205 if (x[i1].second == x[i1 + 1].second) 00206 { 00207 i1++; 00208 } 00209 else if (y[i2].second == y[i2 + 1].second) 00210 { 00211 while (y[i2].second == y[i2].second) 00212 i2++; 00213 ++i1; 00214 c1 = 0; 00215 } 00216 else 00217 { 00218 i1++; 00219 i2++; 00220 c1 = 0; 00221 } 00222 } 00223 else 00224 { 00225 i2++; 00226 i1 -= c1; 00227 c1 = 0; 00228 } 00229 } 00230 } 00231 else 00232 { 00233 if (x[i1].second < y[i2].second) 00234 i1++; 00235 else 00236 i2++; 00237 c1 = 0; 00238 } 00239 } 00240 return result; 00241 } 00242 00243 float64_t COligoStringKernel::kernelOligo( 00244 const std::vector< std::pair<int32_t, float64_t> >& x, 00245 const std::vector< std::pair<int32_t, float64_t> >& y) 00246 { 00247 float64_t result = 0; 00248 int32_t i1 = 0; 00249 int32_t i2 = 0; 00250 int32_t c1 = 0; 00251 uint32_t x_size = x.size(); 00252 uint32_t y_size = y.size(); 00253 00254 while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size) 00255 { 00256 if (x[i1].second == y[i2].second) 00257 { 00258 result += exp(-CMath::sq(x[i1].first - y[i2].first) / width); 00259 00260 if (((uint32_t) i1+1) < x_size && x[i1].second == x[i1 + 1].second) 00261 { 00262 i1++; 00263 c1++; 00264 } 00265 else if (((uint32_t) i2+1) <y_size && y[i2].second == y[i2 + 1].second) 00266 { 00267 i2++; 00268 i1 -= c1; 00269 c1 = 0; 00270 } 00271 else 00272 { 00273 i1++; 00274 i2++; 00275 } 00276 } 00277 else 00278 { 00279 if (x[i1].second < y[i2].second) 00280 i1++; 00281 else 00282 i2++; 00283 00284 c1 = 0; 00285 } 00286 } 00287 return result; 00288 } 00289 00290 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b) 00291 { 00292 int32_t alen, blen; 00293 bool free_a, free_b; 00294 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a); 00295 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b); 00296 std::vector< std::pair<int32_t, float64_t> > aenc; 00297 std::vector< std::pair<int32_t, float64_t> > benc; 00298 encodeOligo(std::string(avec, alen), k, "ACGT", aenc); 00299 encodeOligo(std::string(bvec, alen), k, "ACGT", benc); 00300 //float64_t result=kernelOligo(aenc, benc); 00301 float64_t result=kernelOligoFast(aenc, benc); 00302 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a); 00303 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b); 00304 return result; 00305 } 00306 00307 void COligoStringKernel::init() 00308 { 00309 k=0; 00310 width=0.0; 00311 00312 set_normalizer(new CSqrtDiagKernelNormalizer()); 00313 00314 SG_ADD(&k, "k", "K-mer length.", MS_AVAILABLE); 00315 SG_ADD(&width, "width", "Width of Gaussian.", MS_AVAILABLE); 00316 SG_ADD(&gauss_table, "gauss_table", "Gauss Cache Table.", MS_NOT_AVAILABLE); 00317 }