SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
HashedWDFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 
00011 #include <shogun/features/HashedWDFeatures.h>
00012 #include <shogun/io/SGIO.h>
00013 
00014 using namespace shogun;
00015 
00016 CHashedWDFeatures::CHashedWDFeatures() :CDotFeatures()
00017 {
00018     SG_UNSTABLE("CHashedWDFeatures::CHashedWDFeatures()", "\n")
00019 
00020     strings = NULL;
00021 
00022     degree = 0;
00023     start_degree = 0;
00024     from_degree = 0;
00025     string_length = 0;
00026     num_strings = 0;
00027     alphabet_size = 0;
00028     w_dim = 0;
00029     partial_w_dim = 0;
00030     wd_weights = NULL;
00031     mask = 0;
00032     m_hash_bits = 0;
00033 
00034     normalization_const = 0.0;
00035 }
00036 
00037 CHashedWDFeatures::CHashedWDFeatures(CStringFeatures<uint8_t>* str,
00038         int32_t start_order, int32_t order, int32_t from_order,
00039         int32_t hash_bits) : CDotFeatures()
00040 {
00041     ASSERT(start_order>=0)
00042     ASSERT(start_order<order)
00043     ASSERT(order<=from_order)
00044     ASSERT(hash_bits>0)
00045     ASSERT(str)
00046     ASSERT(str->have_same_length())
00047     SG_REF(str);
00048 
00049     strings=str;
00050     string_length=str->get_max_vector_length();
00051     num_strings=str->get_num_vectors();
00052     CAlphabet* alpha=str->get_alphabet();
00053     alphabet_size=alpha->get_num_symbols();
00054     SG_UNREF(alpha);
00055 
00056     degree=order;
00057     start_degree=start_order;
00058     from_degree=from_order;
00059     m_hash_bits=hash_bits;
00060     set_wd_weights();
00061     set_normalization_const();
00062 }
00063 
00064 CHashedWDFeatures::CHashedWDFeatures(const CHashedWDFeatures& orig)
00065     : CDotFeatures(orig), strings(orig.strings),
00066     degree(orig.degree), start_degree(orig.start_degree),
00067     from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
00068     normalization_const(orig.normalization_const)
00069 {
00070 
00071 
00072     SG_REF(strings);
00073     if (strings)
00074     {
00075         string_length=strings->get_max_vector_length();
00076         num_strings=strings->get_num_vectors();
00077         CAlphabet* alpha=strings->get_alphabet();
00078         alphabet_size=alpha->get_num_symbols();
00079         SG_UNREF(alpha);
00080     }
00081     else
00082     {
00083         string_length = 0;
00084         num_strings = 0;
00085         alphabet_size = 0;
00086     }
00087 
00088     if (degree>0)
00089         set_wd_weights();
00090 }
00091 
00092 CHashedWDFeatures::~CHashedWDFeatures()
00093 {
00094     SG_UNREF(strings);
00095     SG_FREE(wd_weights);
00096 }
00097 
00098 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00099 {
00100     ASSERT(df)
00101     ASSERT(df->get_feature_type() == get_feature_type())
00102     ASSERT(df->get_feature_class() == get_feature_class())
00103     CHashedWDFeatures* wdf = (CHashedWDFeatures*) df;
00104 
00105     int32_t len1, len2;
00106     bool free_vec1, free_vec2;
00107 
00108     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00109     uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
00110 
00111     ASSERT(len1==len2)
00112 
00113     float64_t sum=0.0;
00114 
00115     for (int32_t i=0; i<len1; i++)
00116     {
00117         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00118         {
00119             if (vec1[i+j]!=vec2[i+j])
00120                 break;
00121             if (j>=start_degree)
00122                 sum += wd_weights[j]*wd_weights[j];
00123         }
00124     }
00125     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00126     wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00127     return sum/CMath::sq(normalization_const);
00128 }
00129 
00130 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len)
00131 {
00132     if (vec2_len != w_dim)
00133         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
00134 
00135     float64_t sum=0;
00136     int32_t lim=CMath::min(degree, string_length);
00137     int32_t len;
00138     bool free_vec1;
00139     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00140     uint32_t* val=SG_MALLOC(uint32_t, len);
00141 
00142     uint32_t offs=0;
00143 
00144     if (start_degree>0)
00145     {
00146         // compute hash for strings of length start_degree-1
00147         for (int32_t i=0; i+start_degree < len; i++)
00148             val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
00149     }
00150     else
00151         SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00152 
00153     for (int32_t k=start_degree; k<lim; k++)
00154     {
00155         float64_t wd = wd_weights[k];
00156 
00157         uint32_t o=offs;
00158         uint32_t carry = 0;
00159         uint32_t chunk = 0;
00160 
00161         for (int32_t i=0; i+k < len; i++)
00162         {
00163             chunk++;
00164             CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00165             uint32_t h =
00166                     CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00167 #ifdef DEBUG_HASHEDWD
00168             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
00169 #endif
00170             sum+=vec2[o+(h & mask)]*wd;
00171             val[i] = h;
00172             o+=partial_w_dim;
00173         }
00174         val[len-k-1] =
00175                 CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
00176         offs+=partial_w_dim*len;
00177     }
00178     SG_FREE(val);
00179     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00180 
00181     return sum/normalization_const;
00182 }
00183 
00184 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00185 {
00186     if (vec2_len != w_dim)
00187         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim)
00188 
00189     int32_t lim=CMath::min(degree, string_length);
00190     int32_t len;
00191     bool free_vec1;
00192     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00193     uint32_t* val=SG_MALLOC(uint32_t, len);
00194 
00195     uint32_t offs=0;
00196 
00197     if (start_degree>0)
00198     {
00199         // compute hash for strings of length start_degree-1
00200         for (int32_t i=0; i+start_degree < len; i++)
00201             val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF);
00202     }
00203     else
00204         SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF);
00205 
00206     for (int32_t k=start_degree; k<lim; k++)
00207     {
00208         float64_t wd = alpha*wd_weights[k]/normalization_const;
00209 
00210         if (abs_val)
00211             wd=CMath::abs(wd);
00212 
00213         uint32_t o=offs;
00214         uint32_t carry = 0;
00215         uint32_t chunk = 0;
00216 
00217         for (int32_t i=0; i+k < len; i++)
00218         {
00219             chunk++;
00220             CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1);
00221             uint32_t h = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk);
00222 
00223 #ifdef DEBUG_HASHEDWD
00224             SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h)
00225             SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o)
00226 #endif
00227             vec2[o+(h & mask)]+=wd;
00228             val[i] = h;
00229             o+=partial_w_dim;
00230         }
00231         val[len-k-1] =
00232                 CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk);
00233 
00234         offs+=partial_w_dim*len;
00235     }
00236 
00237     SG_FREE(val);
00238     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00239 }
00240 
00241 void CHashedWDFeatures::set_wd_weights()
00242 {
00243     ASSERT(degree>0)
00244 
00245     mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00246     partial_w_dim=1<<m_hash_bits;
00247     w_dim=partial_w_dim*string_length*(degree-start_degree);
00248 
00249     wd_weights=SG_MALLOC(float64_t, degree);
00250 
00251     for (int32_t i=0; i<degree; i++)
00252         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00253 
00254     SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, "
00255             "dim=%d partial_dim=%d num=%d, len=%d\n",
00256             degree, from_degree, alphabet_size,
00257             w_dim, partial_w_dim, num_strings, string_length);
00258 }
00259 
00260 
00261 void CHashedWDFeatures::set_normalization_const(float64_t n)
00262 {
00263     if (n==0)
00264     {
00265         normalization_const=0;
00266         for (int32_t i=0; i<degree; i++)
00267             normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00268 
00269         normalization_const=CMath::sqrt(normalization_const);
00270     }
00271     else
00272         normalization_const=n;
00273 
00274     SG_DEBUG("normalization_const:%f\n", normalization_const)
00275 }
00276 
00277 CFeatures* CHashedWDFeatures::duplicate() const
00278 {
00279     return new CHashedWDFeatures(*this);
00280 }
00281 
00282 
00283 int32_t CHashedWDFeatures::get_nnz_features_for_vector(int32_t num)
00284 {
00285     int32_t vlen=-1;
00286     bool free_vec;
00287     uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
00288     strings->free_feature_vector(vec, num, free_vec);
00289     return degree*vlen;
00290 }
00291 
00292 void* CHashedWDFeatures::get_feature_iterator(int32_t vector_index)
00293 {
00294     SG_NOTIMPLEMENTED
00295     return NULL;
00296 }
00297 
00298 bool CHashedWDFeatures::get_next_feature(int32_t& index, float64_t& value,
00299         void* iterator)
00300 {
00301     SG_NOTIMPLEMENTED
00302     return false;
00303 }
00304 
00305 void CHashedWDFeatures::free_feature_iterator(void* iterator)
00306 {
00307     SG_NOTIMPLEMENTED
00308 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation