SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
WDFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <shogun/features/WDFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 
00014 using namespace shogun;
00015 
00016 CWDFeatures::CWDFeatures() :CDotFeatures()
00017 {
00018     SG_UNSTABLE("CWDFeatures::CWDFeatures() :CDotFeatures()",
00019                 "\n");
00020 
00021     strings = NULL;
00022 
00023     degree = 0;
00024     from_degree = 0;
00025     string_length = 0;
00026     num_strings = 0;
00027     alphabet_size = 0;
00028     w_dim = 0;
00029     wd_weights = NULL;
00030     normalization_const = 0.0;
00031 }
00032 
00033 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00034         int32_t order, int32_t from_order) : CDotFeatures()
00035 {
00036     ASSERT(str)
00037     ASSERT(str->have_same_length())
00038     SG_REF(str);
00039 
00040     strings=str;
00041     string_length=str->get_max_vector_length();
00042     num_strings=str->get_num_vectors();
00043     CAlphabet* alpha=str->get_alphabet();
00044     alphabet_size=alpha->get_num_symbols();
00045     SG_UNREF(alpha);
00046 
00047     degree=order;
00048     from_degree=from_order;
00049     wd_weights=NULL;
00050     set_wd_weights();
00051     set_normalization_const();
00052 
00053 }
00054 
00055 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00056     : CDotFeatures(orig), strings(orig.strings),
00057     degree(orig.degree), from_degree(orig.from_degree),
00058     normalization_const(orig.normalization_const)
00059 {
00060     SG_REF(strings);
00061 
00062     if (strings)
00063     {
00064         string_length=strings->get_max_vector_length();
00065         num_strings=strings->get_num_vectors();
00066         CAlphabet* alpha=strings->get_alphabet();
00067         alphabet_size=alpha->get_num_symbols();
00068         SG_UNREF(alpha);
00069     }
00070     else
00071     {
00072         string_length = 0;
00073         num_strings = 0;
00074         alphabet_size = 0;
00075     }
00076 
00077     wd_weights=NULL;
00078     if (degree>0)
00079         set_wd_weights();
00080 }
00081 
00082 CWDFeatures::~CWDFeatures()
00083 {
00084     SG_UNREF(strings);
00085     SG_FREE(wd_weights);
00086 }
00087 
00088 float64_t CWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00089 {
00090     ASSERT(df)
00091     ASSERT(df->get_feature_type() == get_feature_type())
00092     ASSERT(df->get_feature_class() == get_feature_class())
00093     CWDFeatures* wdf = (CWDFeatures*) df;
00094 
00095     int32_t len1, len2;
00096     bool free_vec1, free_vec2;
00097 
00098     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00099     uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00100 
00101     ASSERT(len1==len2)
00102 
00103     float64_t sum=0.0;
00104 
00105     for (int32_t i=0; i<len1; i++)
00106     {
00107         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00108         {
00109             if (vec1[i+j]!=vec2[i+j])
00110                 break ;
00111             sum += wd_weights[j]*wd_weights[j];
00112         }
00113     }
00114     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00115     wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00116     return sum/CMath::sq(normalization_const);
00117 }
00118 
00119 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len)
00120 {
00121     if (vec2_len != w_dim)
00122         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
00123 
00124     float64_t sum=0;
00125     int32_t lim=CMath::min(degree, string_length);
00126     int32_t len;
00127     bool free_vec1;
00128     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00129     int32_t* val=SG_MALLOC(int32_t, len);
00130     SGVector<int32_t>::fill_vector(val, len, 0);
00131 
00132     int32_t asize=alphabet_size;
00133     int32_t asizem1=1;
00134     int32_t offs=0;
00135 
00136     for (int32_t k=0; k<lim; k++)
00137     {
00138         float64_t wd = wd_weights[k];
00139 
00140         int32_t o=offs;
00141         for (int32_t i=0; i+k < len; i++)
00142         {
00143             val[i]+=asizem1*vec[i+k];
00144             sum+=vec2[val[i]+o]*wd;
00145             o+=asize;
00146         }
00147         offs+=asize*len;
00148         asize*=alphabet_size;
00149         asizem1*=alphabet_size;
00150     }
00151     SG_FREE(val);
00152     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00153 
00154     return sum/normalization_const;
00155 }
00156 
00157 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00158 {
00159     if (vec2_len != w_dim)
00160         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
00161 
00162     int32_t lim=CMath::min(degree, string_length);
00163     int32_t len;
00164     bool free_vec1;
00165     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00166     int32_t* val=SG_MALLOC(int32_t, len);
00167     SGVector<int32_t>::fill_vector(val, len, 0);
00168 
00169     int32_t asize=alphabet_size;
00170     int32_t asizem1=1;
00171     int32_t offs=0;
00172 
00173     for (int32_t k=0; k<lim; k++)
00174     {
00175         float64_t wd = alpha*wd_weights[k]/normalization_const;
00176 
00177         if (abs_val)
00178             wd=CMath::abs(wd);
00179 
00180         int32_t o=offs;
00181         for (int32_t i=0; i+k < len; i++)
00182         {
00183             val[i]+=asizem1*vec[i+k];
00184             vec2[val[i]+o]+=wd;
00185             o+=asize;
00186         }
00187         offs+=asize*len;
00188         asize*=alphabet_size;
00189         asizem1*=alphabet_size;
00190     }
00191     SG_FREE(val);
00192 
00193     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00194 }
00195 
00196 void CWDFeatures::set_wd_weights()
00197 {
00198     ASSERT(degree>0 && degree<=8)
00199     SG_FREE(wd_weights);
00200     wd_weights=SG_MALLOC(float64_t, degree);
00201     w_dim=0;
00202 
00203     for (int32_t i=0; i<degree; i++)
00204     {
00205         w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00206         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00207     }
00208     SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length)
00209 }
00210 
00211 
00212 void CWDFeatures::set_normalization_const(float64_t n)
00213 {
00214     if (n==0)
00215     {
00216         normalization_const=0;
00217         for (int32_t i=0; i<degree; i++)
00218             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00219 
00220         normalization_const=CMath::sqrt(normalization_const);
00221     }
00222     else
00223         normalization_const=n;
00224 
00225     SG_DEBUG("normalization_const:%f\n", normalization_const)
00226 }
00227 
00228 void* CWDFeatures::get_feature_iterator(int32_t vector_index)
00229 {
00230     if (vector_index>=num_strings)
00231     {
00232         SG_ERROR("Index out of bounds (number of strings %d, you "
00233                 "requested %d)\n", num_strings, vector_index);
00234     }
00235 
00236     wd_feature_iterator* it=SG_MALLOC(wd_feature_iterator, 1);
00237 
00238     it->lim=CMath::min(degree, string_length);
00239     it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00240     it->vidx=vector_index;
00241 
00242     it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree);
00243     it->val=SG_MALLOC(int32_t, it->vlen);
00244     SGVector<int32_t>::fill_vector(it->val, it->vlen, 0);
00245 
00246     it->asize=alphabet_size;
00247     it->asizem1=1;
00248     it->offs=0;
00249     it->k=0;
00250     it->i=0;
00251     it->o=0;
00252 
00253     return it;
00254 }
00255 
00256 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00257 {
00258     wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00259 
00260     if (it->i + it->k >= it->vlen)
00261     {
00262         if (it->k < it->lim-1)
00263         {
00264             it->offs+=it->asize*it->vlen;
00265             it->asize*=alphabet_size;
00266             it->asizem1*=alphabet_size;
00267             it->k++;
00268             it->i=0;
00269             it->o=it->offs;
00270         }
00271         else
00272             return false;
00273     }
00274 
00275     int32_t i=it->i;
00276     int32_t k=it->k;
00277 #ifdef DEBUG_WDFEATURES
00278     SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1)
00279 #endif
00280 
00281     it->val[i]+=it->asizem1*it->vec[i+k];
00282     value=wd_weights[k]/normalization_const;
00283     index=it->val[i]+it->o;
00284 #ifdef DEBUG_WDFEATURES
00285     SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen)
00286 #endif
00287 
00288     it->o+=it->asize;
00289     it->i=i+1;
00290 
00291     return true;
00292 }
00293 
00294 void CWDFeatures::free_feature_iterator(void* iterator)
00295 {
00296     ASSERT(iterator)
00297     wd_feature_iterator* it=(wd_feature_iterator*) iterator;
00298     strings->free_feature_vector(it->vec, it->vidx, it->vfree);
00299     SG_FREE(it->val);
00300     SG_FREE(it);
00301 }
00302 
00303 CFeatures* CWDFeatures::duplicate() const
00304 {
00305     return new CWDFeatures(*this);
00306 }
00307 
00308 int32_t CWDFeatures::get_dim_feature_space() const
00309 {
00310     return w_dim;
00311 }
00312 
00313 int32_t CWDFeatures::get_nnz_features_for_vector(int32_t num)
00314 {
00315     int32_t vlen=-1;
00316     bool free_vec;
00317     uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
00318     strings->free_feature_vector(vec, num, free_vec);
00319     return degree*vlen;
00320 }
00321 
00322 EFeatureType CWDFeatures::get_feature_type() const
00323 {
00324     return F_UNKNOWN;
00325 }
00326 
00327 EFeatureClass CWDFeatures::get_feature_class() const
00328 {
00329     return C_WD;
00330 }
00331 
00332 int32_t CWDFeatures::get_num_vectors() const
00333 {
00334     return num_strings;
00335 }
00336 
00337 float64_t CWDFeatures::get_normalization_const()
00338 {
00339     return normalization_const;
00340 }
00341 
00342 void CWDFeatures::set_wd_weights(SGVector<float64_t> weights)
00343 {
00344     ASSERT(weights.vlen==degree)
00345 
00346     for (int32_t i=0; i<degree; i++)
00347         wd_weights[i]=weights.vector[i];
00348 }
00349 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation