SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
HashedSparseFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #include <shogun/features/HashedSparseFeatures.h>
00012 #include <shogun/features/HashedDenseFeatures.h>
00013 #include <shogun/base/Parameter.h>
00014 #include <shogun/lib/Hash.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/lib/DynamicArray.h>
00017 #include <shogun/lib/SGSparseVector.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <string.h>
00020 #include <iostream>
00021 
00022 namespace shogun {
00023 
00024 template <class ST>
00025 CHashedSparseFeatures<ST>::CHashedSparseFeatures(int32_t size, bool use_quadr,
00026     bool keep_lin_terms) : CDotFeatures(size)
00027 {
00028     init(NULL, 0, use_quadr, keep_lin_terms);
00029 }
00030 
00031 template <class ST>
00032 CHashedSparseFeatures<ST>::CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d,
00033     bool use_quadr, bool keep_lin_terms) : CDotFeatures()
00034 {
00035     init(feats, d, use_quadr, keep_lin_terms);
00036 }
00037 
00038 template <class ST>
00039 CHashedSparseFeatures<ST>::CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d,
00040     bool use_quadr, bool keep_lin_terms) : CDotFeatures()
00041 {
00042     CSparseFeatures<ST>* feats = new CSparseFeatures<ST>(matrix);
00043     init(feats, d, use_quadr, keep_lin_terms);
00044 }
00045 
00046 template <class ST>
00047 CHashedSparseFeatures<ST>::CHashedSparseFeatures(CFile* loader, int32_t d, bool use_quadr,
00048     bool keep_lin_terms) : CDotFeatures(loader)
00049 {
00050     CSparseFeatures<ST>* feats = new CSparseFeatures<ST>();
00051     feats->load(loader);
00052     init(feats, d, use_quadr, keep_lin_terms);
00053 }
00054 
00055 template <class ST>
00056 void CHashedSparseFeatures<ST>::init(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr,
00057     bool keep_lin_terms)
00058 {
00059     dim = d;
00060     use_quadratic = use_quadr;
00061     keep_linear_terms = keep_lin_terms;
00062     sparse_feats = feats;
00063     SG_REF(sparse_feats);
00064 
00065     SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
00066         MS_NOT_AVAILABLE);
00067     SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
00068         MS_NOT_AVAILABLE);
00069     SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
00070     SG_ADD((CSGObject** ) &sparse_feats, "sparse_feats", "Sparse features to work on",
00071         MS_NOT_AVAILABLE);
00072 
00073     set_generic<ST>();
00074 }
00075 
00076 template <class ST>
00077 CHashedSparseFeatures<ST>::CHashedSparseFeatures(const CHashedSparseFeatures& orig)
00078 : CDotFeatures(orig)
00079 {
00080     init(orig.sparse_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
00081 }
00082 
00083 template <class ST>
00084 CHashedSparseFeatures<ST>::~CHashedSparseFeatures()
00085 {
00086     SG_UNREF(sparse_feats);
00087 }
00088 
00089 template <class ST>
00090 CFeatures* CHashedSparseFeatures<ST>::duplicate() const
00091 {
00092     return new CHashedSparseFeatures<ST>(*this);
00093 }
00094 
00095 template <class ST>
00096 int32_t CHashedSparseFeatures<ST>::get_dim_feature_space() const
00097 {
00098     return dim;
00099 }
00100 
00101 template <class ST>
00102 SGSparseVector<ST> CHashedSparseFeatures<ST>::get_hashed_feature_vector(
00103     int32_t vec_idx) const
00104 {
00105     return CHashedSparseFeatures<ST>::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx),
00106         dim, use_quadratic, keep_linear_terms);
00107 }
00108 
00109 template <class ST>
00110 SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim,
00111     bool use_quadratic, bool keep_linear_terms)
00112 {
00113     return CHashedDenseFeatures<ST>::hash_vector(vec, dim, use_quadratic, keep_linear_terms);
00114 }
00115 
00116 template <class ST>
00117 SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGSparseVector<ST> vec, int32_t dim,
00118     bool use_quadratic, bool keep_linear_terms)
00119 {
00120     SGVector<ST> h_vec(dim);
00121     SGVector<ST>::fill_vector(h_vec, dim, 0);
00122 
00123     int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
00124     SGVector<uint32_t> hash_cache(hash_cache_size);
00125 
00126     for (index_t i=0; i<vec.num_feat_entries; i++)
00127     {
00128         uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
00129                         vec.features[i].feat_index);
00130 
00131         if (use_quadratic)
00132             hash_cache[i] = hash;
00133 
00134         if ( (!use_quadratic) || keep_linear_terms )
00135             h_vec[hash % dim] += vec.features[i].entry;
00136     }
00137 
00138     if (use_quadratic)
00139     {
00140         for (index_t i=0; i<vec.num_feat_entries; i++)
00141         {
00142             index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
00143             index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof(index_t),
00144                     vec.features[i].feat_index) % dim;
00145 
00146             h_vec[idx] += vec.features[i].entry * vec.features[i].entry;
00147 
00148             for (index_t j=i+1; j<vec.num_feat_entries; j++)
00149             {
00150                 idx = (hash_cache[i] ^ hash_cache[j]) % dim;
00151                 h_vec[idx] += vec.features[i].entry * vec.features[j].entry;
00152             }
00153         }
00154     }
00155 
00156     int32_t num_nnz_features = 0;
00157     for (index_t i=0; i<dim; i++)
00158     {
00159         if (h_vec[i]!=0)
00160             num_nnz_features++;
00161     }
00162 
00163     SGSparseVector<ST> sv(num_nnz_features);
00164 
00165     int32_t sparse_index = 0;
00166     for (index_t i=0; i<dim; i++)
00167     {
00168         if (h_vec[i]!=0)
00169         {
00170             sv.features[sparse_index].entry = h_vec[i];
00171             sv.features[sparse_index++].feat_index = i;
00172         }
00173     }
00174 
00175     return sv;
00176 }
00177 
00178 template <class ST>
00179 float64_t CHashedSparseFeatures<ST>::dot(int32_t vec_idx1, CDotFeatures* df,
00180     int32_t vec_idx2)
00181 {
00182     ASSERT(df)
00183     ASSERT(df->get_feature_type() == get_feature_type())
00184     ASSERT(df->get_feature_class() == get_feature_class())
00185     ASSERT(strcmp(df->get_name(), get_name())==0)
00186 
00187     CHashedSparseFeatures<ST>* feats = (CHashedSparseFeatures<ST>* ) df;
00188     SGSparseVector<ST> vec_1 = get_hashed_feature_vector(vec_idx1);
00189     SGSparseVector<ST> vec_2 = feats->get_hashed_feature_vector(vec_idx2);
00190 
00191     float64_t result = vec_1.sparse_dot(vec_2);
00192     return result;
00193 }
00194 
00195 template <class ST>
00196 float64_t CHashedSparseFeatures<ST>::dense_dot(int32_t vec_idx1, float64_t* vec2,
00197     int32_t vec2_len)
00198 {
00199     ASSERT(vec2_len == dim)
00200 
00201     SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);
00202 
00203     int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
00204     SGVector<uint32_t> hash_cache(hash_cache_size);
00205 
00206     float64_t result = 0;
00207     for (index_t i=0; i<vec.num_feat_entries; i++)
00208     {
00209         uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
00210                        vec.features[i].feat_index);
00211 
00212         if (use_quadratic)
00213             hash_cache[i] = hash;
00214 
00215         if ( (!use_quadratic) || keep_linear_terms)
00216             result += vec2[hash % dim] * vec.features[i].entry;
00217     }
00218 
00219     if (use_quadratic)
00220     {
00221         for (index_t i=0; i<vec.num_feat_entries; i++)
00222         {
00223             index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
00224             index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
00225                         vec.features[i].feat_index) % dim;
00226 
00227             result += vec2[idx] * vec.features[i].entry * vec.features[i].entry;
00228 
00229             for (index_t j=i+1; j<vec.num_feat_entries; j++)
00230             {
00231                 idx = (hash_cache[i] ^ hash_cache[j]) % dim;
00232                 result += vec2[idx] * vec.features[i].entry * vec.features[j].entry;
00233             }
00234         }
00235     }
00236 
00237     sparse_feats ->free_feature_vector(vec_idx1);
00238     return result;
00239 }
00240 
00241 template <class ST>
00242 void CHashedSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t vec_idx1,
00243     float64_t* vec2, int32_t vec2_len, bool abs_val)
00244 {
00245     float64_t val = abs_val ? CMath::abs(alpha) : alpha;
00246     ASSERT(vec2_len == dim)
00247 
00248     SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1);
00249 
00250     int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0;
00251     SGVector<uint32_t> hash_cache(hash_cache_size);
00252 
00253     for (index_t i=0; i<vec.num_feat_entries; i++)
00254     {
00255         uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t),
00256                        vec.features[i].feat_index);
00257         if (use_quadratic)
00258             hash_cache[i] = hash;
00259 
00260         if ( (!use_quadratic) || keep_linear_terms)
00261             vec2[hash % dim] += val * vec.features[i].entry;
00262     }
00263 
00264     if (use_quadratic)
00265     {
00266         for (index_t i=0; i<vec.num_feat_entries; i++)
00267         {
00268             index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index;
00269             index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t),
00270                         vec.features[i].feat_index) % dim;
00271 
00272             vec2[idx] += val * vec.features[i].entry * vec.features[i].entry;
00273 
00274             for (index_t j=i+1; j<vec.num_feat_entries; j++)
00275             {
00276                 idx = (hash_cache[i] ^ hash_cache[j]) % dim;
00277                 vec2[idx] += val * vec.features[i].entry * vec.features[j].entry;
00278             }
00279         }
00280     }
00281     sparse_feats ->free_feature_vector(vec_idx1);
00282 }
00283 
00284 template <class ST>
00285 int32_t CHashedSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
00286 {
00287     return dim;
00288 }
00289 
00290 template <class ST>
00291 void* CHashedSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
00292 {
00293     SG_NOTIMPLEMENTED;
00294     return NULL;
00295 }
00296 template <class ST>
00297 bool CHashedSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value,
00298     void* iterator)
00299 {
00300     SG_NOTIMPLEMENTED;
00301     return false;
00302 }
00303 template <class ST>
00304 void CHashedSparseFeatures<ST>::free_feature_iterator(void* iterator)
00305 {
00306     SG_NOTIMPLEMENTED;
00307 }
00308 
00309 template <class ST>
00310 const char* CHashedSparseFeatures<ST>::get_name() const
00311 {
00312     return "HashedSparseFeatures";
00313 }
00314 
00315 template <class ST>
00316 EFeatureType CHashedSparseFeatures<ST>::get_feature_type() const
00317 {
00318     return F_UINT;
00319 }
00320 
00321 template <class ST>
00322 EFeatureClass CHashedSparseFeatures<ST>::get_feature_class() const
00323 {
00324     return C_SPARSE;
00325 }
00326 
00327 template <class ST>
00328 int32_t CHashedSparseFeatures<ST>::get_num_vectors() const
00329 {
00330     return sparse_feats ->get_num_vectors();
00331 }
00332 
00333 template class CHashedSparseFeatures <bool>;
00334 template class CHashedSparseFeatures <char>;
00335 template class CHashedSparseFeatures <int8_t>;
00336 template class CHashedSparseFeatures <uint8_t>;
00337 template class CHashedSparseFeatures <int16_t>;
00338 template class CHashedSparseFeatures <uint16_t>;
00339 template class CHashedSparseFeatures <int32_t>;
00340 template class CHashedSparseFeatures <uint32_t>;
00341 template class CHashedSparseFeatures <int64_t>;
00342 template class CHashedSparseFeatures <uint64_t>;
00343 template class CHashedSparseFeatures <float32_t>;
00344 template class CHashedSparseFeatures <float64_t>;
00345 template class CHashedSparseFeatures <floatmax_t>;
00346 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation