SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
SNPFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/SNPFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/lib/memory.h>
00015 
00016 using namespace shogun;
00017 
00018 CSNPFeatures::CSNPFeatures()
00019 {
00020     SG_UNSTABLE("CSNPFeatures::CSNPFeatures()", "\n")
00021 
00022     strings = NULL;
00023 
00024     string_length = 0;
00025     num_strings = 0;
00026     w_dim = 0;
00027 
00028     normalization_const = 0.0;
00029 
00030     m_str_min = NULL;
00031     m_str_maj = NULL;
00032 }
00033 
00034 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(),
00035     m_str_min(NULL), m_str_maj(NULL)
00036 {
00037     ASSERT(str)
00038     ASSERT(str->have_same_length())
00039     SG_REF(str);
00040 
00041     strings=str;
00042     string_length=str->get_max_vector_length();
00043     ASSERT((string_length & 1) == 0) // length divisible by 2
00044     w_dim=3*string_length/2;
00045     num_strings=str->get_num_vectors();
00046     CAlphabet* alpha=str->get_alphabet();
00047     ASSERT(alpha->get_alphabet()==SNP)
00048     SG_UNREF(alpha);
00049 
00050     obtain_base_strings();
00051     set_normalization_const();
00052 
00053 }
00054 
00055 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig)
00056     : CDotFeatures(orig), strings(orig.strings),
00057     normalization_const(orig.normalization_const),
00058     m_str_min(NULL), m_str_maj(NULL)
00059 {
00060     SG_REF(strings);
00061 
00062     if (strings)
00063     {
00064         string_length=strings->get_max_vector_length();
00065         ASSERT((string_length & 1) == 0) // length divisible by 2
00066             w_dim=3*string_length;
00067         num_strings=strings->get_num_vectors();
00068     }
00069     else
00070     {
00071         string_length = 0;
00072         w_dim = 0;
00073         num_strings = 0;
00074     }
00075 
00076     obtain_base_strings();
00077 }
00078 
00079 CSNPFeatures::~CSNPFeatures()
00080 {
00081     SG_UNREF(strings);
00082 }
00083 
00084 int32_t CSNPFeatures::get_dim_feature_space() const
00085 {
00086     return w_dim;
00087 }
00088 
00089 int32_t CSNPFeatures::get_nnz_features_for_vector(int32_t num)
00090 {
00091     return w_dim/3;
00092 }
00093 
00094 EFeatureType CSNPFeatures::get_feature_type() const
00095 {
00096     return F_UNKNOWN;
00097 }
00098 
00099 EFeatureClass CSNPFeatures::get_feature_class() const
00100 {
00101     return C_WD;
00102 }
00103 
00104 int32_t CSNPFeatures::get_num_vectors() const
00105 {
00106     return num_strings;
00107 }
00108 
00109 float64_t CSNPFeatures::get_normalization_const()
00110 {
00111     return normalization_const;
00112 }
00113 
00114 void CSNPFeatures::set_minor_base_string(const char* str)
00115 {
00116     m_str_min=(uint8_t*) get_strdup(str);
00117 }
00118 
00119 void CSNPFeatures::set_major_base_string(const char* str)
00120 {
00121     m_str_maj=(uint8_t*) get_strdup(str);
00122 }
00123 
00124 char* CSNPFeatures::get_minor_base_string()
00125 {
00126     return (char*) m_str_min;
00127 }
00128 
00129 char* CSNPFeatures::get_major_base_string()
00130 {
00131     return (char*) m_str_maj;
00132 }
00133 
00134 float64_t CSNPFeatures::dot(int32_t idx_a, CDotFeatures* df, int32_t idx_b)
00135 {
00136     ASSERT(df)
00137     ASSERT(df->get_feature_type() == get_feature_type())
00138     ASSERT(df->get_feature_class() == get_feature_class())
00139     CSNPFeatures* sf=(CSNPFeatures*) df;
00140 
00141     int32_t alen, blen;
00142     bool free_avec, free_bvec;
00143 
00144     uint8_t* avec = strings->get_feature_vector(idx_a, alen, free_avec);
00145     uint8_t* bvec = sf->strings->get_feature_vector(idx_b, blen, free_bvec);
00146 
00147     ASSERT(alen==blen)
00148     if (alen!=string_length)
00149         SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length)
00150     ASSERT(m_str_min)
00151     ASSERT(m_str_maj)
00152 
00153     float64_t total=0;
00154 
00155     for (int32_t i = 0; i<alen-1; i+=2)
00156     {
00157         int32_t sumaa=0;
00158         int32_t sumbb=0;
00159         int32_t sumab=0;
00160 
00161         uint8_t a1=avec[i];
00162         uint8_t a2=avec[i+1];
00163         uint8_t b1=bvec[i];
00164         uint8_t b2=bvec[i+1];
00165 
00166         if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0'))
00167             sumab++;
00168         else if (a1==a2 && b1==b2)
00169         {
00170             if (a1!=b1)
00171                 continue;
00172 
00173             if (a1==m_str_min[i])
00174                 sumaa++;
00175             else if (a1==m_str_maj[i])
00176                 sumbb++;
00177             else
00178             {
00179                 SG_ERROR("The impossible happened i=%d a1=%c "
00180                         "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]);
00181             }
00182 
00183         }
00184         total+=sumaa+sumbb+sumab;
00185     }
00186 
00187     strings->free_feature_vector(avec, idx_a, free_avec);
00188     sf->strings->free_feature_vector(bvec, idx_b, free_bvec);
00189     return total;
00190 }
00191 
00192 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len)
00193 {
00194     if (vec2_len != w_dim)
00195         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
00196 
00197     float64_t sum=0;
00198     int32_t len;
00199     bool free_vec1;
00200     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00201     int32_t offs=0;
00202 
00203     for (int32_t i=0; i<len; i+=2)
00204     {
00205         int32_t dim=0;
00206 
00207         char a1=vec[i];
00208         char a2=vec[i+1];
00209 
00210         if (a1==a2 && a1!='0' && a2!='0')
00211         {
00212             if (a1==m_str_min[i])
00213                 dim=1;
00214             else if (a1==m_str_maj[i])
00215                 dim=2;
00216             else
00217             {
00218                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00219                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00220             }
00221         }
00222 
00223         sum+=vec2[offs+dim];
00224         offs+=3;
00225     }
00226     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00227 
00228     return sum/normalization_const;
00229 }
00230 
00231 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00232 {
00233     if (vec2_len != w_dim)
00234         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
00235 
00236     int32_t len;
00237     bool free_vec1;
00238     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00239     int32_t offs=0;
00240 
00241     if (abs_val)
00242         alpha=CMath::abs(alpha);
00243 
00244     for (int32_t i=0; i<len; i+=2)
00245     {
00246         int32_t dim=0;
00247 
00248         char a1=vec[i];
00249         char a2=vec[i+1];
00250 
00251         if (a1==a2 && a1!='0' && a2!='0')
00252         {
00253             if (a1==m_str_min[i])
00254                 dim=1;
00255             else if (a1==m_str_maj[i])
00256                 dim=2;
00257             else
00258             {
00259                 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n",
00260                         i, a1,a2, m_str_min[i], m_str_maj[i]);
00261             }
00262         }
00263 
00264         vec2[offs+dim]+=alpha;
00265         offs+=3;
00266     }
00267     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00268 }
00269 
00270 void CSNPFeatures::find_minor_major_strings(uint8_t* minor, uint8_t* major)
00271 {
00272     for (int32_t i=0; i<num_strings; i++)
00273     {
00274         int32_t len;
00275         bool free_vec;
00276         uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec);
00277         ASSERT(string_length==len)
00278 
00279         for (int32_t j=0; j<len; j++)
00280         {
00281             // skip sequencing errors
00282             if (vec[j]=='0')
00283                 continue;
00284 
00285             if (minor[j]==0)
00286                 minor[j]=vec[j];
00287             else if (major[j]==0 && vec[j]!=minor[j])
00288                 major[j]=vec[j];
00289         }
00290 
00291         ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec);
00292     }
00293 }
00294 
00295 void CSNPFeatures::obtain_base_strings(CSNPFeatures* snp)
00296 {
00297     SG_FREE(m_str_min);
00298     SG_FREE(m_str_maj);
00299     size_t tlen=(string_length+1)*sizeof(uint8_t);
00300 
00301     m_str_min=SG_CALLOC(uint8_t, tlen);
00302     m_str_maj=SG_CALLOC(uint8_t, tlen);
00303 
00304     find_minor_major_strings(m_str_min, m_str_maj);
00305 
00306     if (snp)
00307         snp->find_minor_major_strings(m_str_min, m_str_maj);
00308 
00309     for (int32_t j=0; j<string_length; j++)
00310     {
00311         // if only one symbol occurs use 0
00312         if (m_str_min[j]==0)
00313             m_str_min[j]='0';
00314         if (m_str_maj[j]==0)
00315             m_str_maj[j]='0';
00316 
00317         if (m_str_min[j]>m_str_maj[j])
00318             CMath::swap(m_str_min[j], m_str_maj[j]);
00319     }
00320 }
00321 
00322 void CSNPFeatures::set_normalization_const(float64_t n)
00323 {
00324     if (n==0)
00325     {
00326         normalization_const=string_length;
00327         normalization_const=CMath::sqrt(normalization_const);
00328     }
00329     else
00330         normalization_const=n;
00331 
00332     SG_DEBUG("normalization_const:%f\n", normalization_const)
00333 }
00334 
00335 void* CSNPFeatures::get_feature_iterator(int32_t vector_index)
00336 {
00337     return NULL;
00338 }
00339 
00340 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00341 {
00342     return false;
00343 }
00344 
00345 void CSNPFeatures::free_feature_iterator(void* iterator)
00346 {
00347 }
00348 
00349 CFeatures* CSNPFeatures::duplicate() const
00350 {
00351     return new CSNPFeatures(*this);
00352 }
00353 
00354 SGMatrix<float64_t> CSNPFeatures::get_histogram(bool normalize)
00355 {
00356     int32_t nsym=3;
00357     float64_t* h= SG_CALLOC(float64_t, size_t(nsym)*string_length/2);
00358 
00359     float64_t* h_normalizer=SG_MALLOC(float64_t, string_length/2);
00360     memset(h_normalizer, 0, string_length/2*sizeof(float64_t));
00361     int32_t num_str=get_num_vectors();
00362     for (int32_t i=0; i<num_str; i++)
00363     {
00364         int32_t len;
00365         bool free_vec;
00366         uint8_t* vec = strings->get_feature_vector(i, len, free_vec);
00367 
00368         for (int32_t j=0; j<len; j+=2)
00369         {
00370             int32_t dim=0;
00371 
00372             char a1=vec[j];
00373             char a2=vec[j+1];
00374 
00375             if (a1==a2 && a1!='0' && a2!='0')
00376             {
00377                 if (a1==m_str_min[j])
00378                     dim=1;
00379                 else if (a1==m_str_maj[j])
00380                     dim=2;
00381                 else
00382                 {
00383                     SG_ERROR("The impossible happened j=%d a1=%c a2=%c min=%c maj=%c\n",
00384                             j, a1,a2, m_str_min[j], m_str_maj[j]);
00385                 }
00386             }
00387 
00388             h[int64_t(j/2)*nsym+dim]++;
00389             h_normalizer[j/2]++;
00390         }
00391 
00392         strings->free_feature_vector(vec, i, free_vec);
00393     }
00394 
00395     if (normalize)
00396     {
00397         for (int32_t i=0; i<string_length/2; i++)
00398         {
00399             for (int32_t j=0; j<nsym; j++)
00400             {
00401                 if (h_normalizer && h_normalizer[i])
00402                     h[int64_t(i)*nsym+j]/=h_normalizer[i];
00403             }
00404         }
00405     }
00406     SG_FREE(h_normalizer);
00407 
00408     return SGMatrix<float64_t>(h, nsym, string_length/2);
00409 }
00410 
00411 SGMatrix<float64_t> CSNPFeatures::get_2x3_table(CSNPFeatures* pos, CSNPFeatures* neg)
00412 {
00413 
00414     ASSERT(pos->strings->get_max_vector_length() == neg->strings->get_max_vector_length())
00415     int32_t len=pos->strings->get_max_vector_length();
00416 
00417     float64_t* table=SG_MALLOC(float64_t, 3*2*len/2);
00418 
00419     SGMatrix<float64_t> p_hist=pos->get_histogram(false);
00420     SGMatrix<float64_t> n_hist=neg->get_histogram(false);
00421 
00422 
00423     for (int32_t i=0; i<3*len/2; i++)
00424     {
00425         table[2*i]=p_hist.matrix[i];
00426         table[2*i+1]=n_hist.matrix[i];
00427     }
00428     return SGMatrix<float64_t>(table, 2,3*len/2);
00429 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation