SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
OligoStringKernel.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2008 Christian Igel, Tobias Glasmachers
00008  * Copyright (C) 2008 Christian Igel, Tobias Glasmachers
00009  *
00010  * Shogun adjustments (W) 2008-2009,2013 Soeren Sonnenburg
00011  * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00012  * Copyright (C) 2013 Soeren Sonnenburg
00013  *
00014  */
00015 #include <shogun/kernel/string/OligoStringKernel.h>
00016 #include <shogun/kernel/normalizer/SqrtDiagKernelNormalizer.h>
00017 #include <shogun/features/StringFeatures.h>
00018 
00019 #include <map>
00020 #include <vector>
00021 #include <algorithm>
00022 
00023 using namespace shogun;
00024 
00025 COligoStringKernel::COligoStringKernel()
00026   : CStringKernel<char>()
00027 {
00028     init();
00029 }
00030 
00031 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
00032 : CStringKernel<char>(cache_sz)
00033 {
00034     init();
00035 
00036     k=kmer_len;
00037     width=w;
00038 }
00039 
00040 COligoStringKernel::COligoStringKernel(
00041         CStringFeatures<char>* l, CStringFeatures<char>* r,
00042         int32_t kmer_len, float64_t w)
00043 : CStringKernel<char>()
00044 {
00045     init();
00046 
00047     k=kmer_len;
00048     width=w;
00049 
00050     init(l, r);
00051 }
00052 
00053 COligoStringKernel::~COligoStringKernel()
00054 {
00055     cleanup();
00056 }
00057 
00058 void COligoStringKernel::cleanup()
00059 {
00060     gauss_table=SGVector<float64_t>();
00061     CKernel::cleanup();
00062 }
00063 
00064 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
00065 {
00066     cleanup();
00067 
00068     CStringKernel<char>::init(l,r);
00069     int32_t max_len=CMath::max(
00070             ((CStringFeatures<char>*) l)->get_max_vector_length(),
00071             ((CStringFeatures<char>*) r)->get_max_vector_length()
00072             );
00073 
00074     REQUIRE(k>0, "k must be >0\n")
00075     REQUIRE(width>0, "width must be >0\n")
00076 
00077     getExpFunctionCache(max_len);
00078     return init_normalizer();
00079 }
00080 
00081 void COligoStringKernel::encodeOligo(
00082     const std::string& sequence, uint32_t k_mer_length,
00083     const std::string& allowed_characters,
00084     std::vector< std::pair<int32_t, float64_t> >& values)
00085 {
00086     float64_t oligo_value = 0.;
00087     float64_t factor      = 1.;
00088     std::map<std::string::value_type, uint32_t> residue_values;
00089     uint32_t counter = 0;
00090     uint32_t number_of_residues = allowed_characters.size();
00091     uint32_t sequence_length = sequence.size();
00092     bool sequence_ok = true;
00093 
00094     // checking if sequence contains illegal characters
00095     for (uint32_t i = 0; i < sequence.size(); ++i)
00096     {
00097         if (allowed_characters.find(sequence.at(i)) == std::string::npos)
00098             sequence_ok = false;
00099     }
00100 
00101     if (sequence_ok && k_mer_length <= sequence_length)
00102     {
00103         values.resize(sequence_length - k_mer_length + 1,
00104             std::pair<int32_t, float64_t>());
00105         for (uint32_t i = 0; i < number_of_residues; ++i)
00106         {
00107             residue_values.insert(std::make_pair(allowed_characters[i], counter));
00108             ++counter;
00109         }
00110         for (int32_t k = k_mer_length - 1; k >= 0; k--)
00111         {
00112             oligo_value += factor * residue_values[sequence[k]];
00113             factor *= number_of_residues;
00114         }
00115         factor /= number_of_residues;
00116         counter = 0;
00117         values[counter].first = 1;
00118         values[counter].second = oligo_value;
00119         ++counter;
00120 
00121         for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
00122         {
00123             oligo_value -= factor * residue_values[sequence[j - 1]];
00124             oligo_value = oligo_value * number_of_residues +
00125                 residue_values[sequence[j + k_mer_length - 1]];
00126 
00127             values[counter].first = j + 1;
00128             values[counter].second = oligo_value ;
00129             ++counter;
00130         }
00131         stable_sort(values.begin(), values.end(), cmpOligos_);
00132     }
00133     else
00134     {
00135         values.clear();
00136     }
00137 }
00138 
00139 void COligoStringKernel::getSequences(
00140     const std::vector<std::string>& sequences, uint32_t k_mer_length,
00141     const std::string& allowed_characters,
00142     std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
00143 {
00144     std::vector< std::pair<int32_t, float64_t> > temp_vector;
00145     encoded_sequences.resize(sequences.size(),
00146         std::vector< std::pair<int32_t, float64_t> >());
00147 
00148     for (uint32_t i = 0; i < sequences.size(); ++i)
00149     {
00150         encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
00151         encoded_sequences[i] = temp_vector;
00152     }
00153 }
00154 
00155 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
00156 {
00157     gauss_table=SGVector<float64_t>(sequence_length);
00158 
00159     gauss_table[0] = 1;
00160     for (uint32_t i = 1; i < sequence_length; i++)
00161         gauss_table[i] = exp(-CMath::sq((float64_t) i) / width);
00162 }
00163 
00164 float64_t COligoStringKernel::kernelOligoFast(
00165     const std::vector< std::pair<int32_t, float64_t> >& x,
00166     const std::vector< std::pair<int32_t, float64_t> >& y,
00167     int32_t max_distance)
00168 {
00169     float64_t result = 0;
00170     int32_t  i1     = 0;
00171     int32_t  i2     = 0;
00172     int32_t  c1     = 0;
00173     uint32_t x_size = x.size();
00174     uint32_t y_size = y.size();
00175 
00176     while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
00177     {
00178         if (x[i1].second == y[i2].second)
00179         {
00180             if (max_distance < 0
00181                     || (abs(x[i1].first - y[i2].first)) <= max_distance)
00182             {
00183                 result += gauss_table[abs((x[i1].first - y[i2].first))];
00184                 if (x[i1].second == x[i1 + 1].second)
00185                 {
00186                     i1++;
00187                     c1++;
00188                 }
00189                 else if (y[i2].second == y[i2 + 1].second)
00190                 {
00191                     i2++;
00192                     i1 -= c1;
00193                     c1 = 0;
00194                 }
00195                 else
00196                 {
00197                     i1++;
00198                     i2++;
00199                 }
00200             }
00201             else
00202             {
00203                 if (x[i1].first < y[i2].first)
00204                 {
00205                     if (x[i1].second == x[i1 + 1].second)
00206                     {
00207                         i1++;
00208                     }
00209                     else if (y[i2].second == y[i2 + 1].second)
00210                     {
00211                         while (y[i2].second == y[i2].second)
00212                             i2++;
00213                         ++i1;
00214                         c1 = 0;
00215                     }
00216                     else
00217                     {
00218                         i1++;
00219                         i2++;
00220                         c1 = 0;
00221                     }
00222                 }
00223                 else
00224                 {
00225                     i2++;
00226                     i1 -= c1;
00227                     c1 = 0;
00228                 }
00229             }
00230         }
00231         else
00232         {
00233             if (x[i1].second < y[i2].second)
00234                 i1++;
00235             else
00236                 i2++;
00237             c1 = 0;
00238         }
00239     }
00240     return result;
00241 }
00242 
00243 float64_t COligoStringKernel::kernelOligo(
00244         const std::vector< std::pair<int32_t, float64_t> >&    x,
00245         const std::vector< std::pair<int32_t, float64_t> >&    y)
00246 {
00247     float64_t result = 0;
00248     int32_t    i1     = 0;
00249     int32_t    i2     = 0;
00250     int32_t    c1     = 0;
00251     uint32_t x_size = x.size();
00252     uint32_t y_size = y.size();
00253 
00254     while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
00255     {
00256         if (x[i1].second == y[i2].second)
00257         {
00258             result += exp(-CMath::sq(x[i1].first - y[i2].first) / width);
00259 
00260             if (((uint32_t) i1+1) < x_size && x[i1].second == x[i1 + 1].second)
00261             {
00262                 i1++;
00263                 c1++;
00264             }
00265             else if (((uint32_t) i2+1) <y_size && y[i2].second == y[i2 + 1].second)
00266             {
00267                 i2++;
00268                 i1 -= c1;
00269                 c1 = 0;
00270             }
00271             else
00272             {
00273                 i1++;
00274                 i2++;
00275             }
00276         }
00277         else
00278         {
00279             if (x[i1].second < y[i2].second)
00280                 i1++;
00281             else
00282                 i2++;
00283 
00284             c1 = 0;
00285         }
00286     }
00287     return result;
00288 }
00289 
00290 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
00291 {
00292     int32_t alen, blen;
00293     bool free_a, free_b;
00294     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a);
00295     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b);
00296     std::vector< std::pair<int32_t, float64_t> > aenc;
00297     std::vector< std::pair<int32_t, float64_t> > benc;
00298     encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
00299     encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
00300     //float64_t result=kernelOligo(aenc, benc);
00301     float64_t result=kernelOligoFast(aenc, benc);
00302     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a);
00303     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b);
00304     return result;
00305 }
00306 
00307 void COligoStringKernel::init()
00308 {
00309     k=0;
00310     width=0.0;
00311 
00312     set_normalizer(new CSqrtDiagKernelNormalizer());
00313 
00314     SG_ADD(&k, "k", "K-mer length.", MS_AVAILABLE);
00315     SG_ADD(&width, "width", "Width of Gaussian.", MS_AVAILABLE);
00316     SG_ADD(&gauss_table, "gauss_table", "Gauss Cache Table.", MS_NOT_AVAILABLE);
00317 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation