SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
HashedDenseFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #include <shogun/features/HashedDenseFeatures.h>
00012 #include <shogun/base/Parameter.h>
00013 #include <shogun/lib/Hash.h>
00014 #include <shogun/lib/DynamicArray.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/mathematics/Math.h>
00017 
00018 #include <string.h>
00019 
00020 namespace shogun {
00021 template <class ST>
00022 CHashedDenseFeatures<ST>::CHashedDenseFeatures(int32_t size, bool use_quadr, bool keep_lin_terms)
00023 : CDotFeatures(size)
00024 {
00025     init(NULL, 0, use_quadr, keep_lin_terms);
00026 }
00027 
00028 template <class ST>
00029 CHashedDenseFeatures<ST>::CHashedDenseFeatures(CDenseFeatures<ST>* feats, int32_t d,
00030     bool use_quadr, bool keep_lin_terms) : CDotFeatures()
00031 {
00032     init(feats, d, use_quadr, keep_lin_terms);
00033 }
00034 
00035 template <class ST>
00036 CHashedDenseFeatures<ST>::CHashedDenseFeatures(SGMatrix<ST> matrix, int32_t d, bool use_quadr,
00037     bool keep_lin_terms) : CDotFeatures()
00038 {
00039     CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(matrix);
00040     init(feats, d, use_quadr, keep_lin_terms);
00041 }
00042 
00043 template <class ST>
00044 CHashedDenseFeatures<ST>::CHashedDenseFeatures(ST* src, int32_t num_feat, int32_t num_vec,
00045     int32_t d, bool use_quadr, bool keep_lin_terms) : CDotFeatures()
00046 {
00047     CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(src, num_feat, num_vec);
00048     init(feats, d, use_quadr, keep_lin_terms);
00049 }
00050 
00051 template <class ST>
00052 CHashedDenseFeatures<ST>::CHashedDenseFeatures(CFile* loader, int32_t d, bool use_quadr,
00053     bool keep_lin_terms) : CDotFeatures(loader)
00054 {
00055     CDenseFeatures<ST>* feats = new CDenseFeatures<ST>();
00056     feats->load(loader);
00057     init(feats, d, use_quadr, keep_lin_terms);
00058 }
00059 
00060 template <class ST>
00061 void CHashedDenseFeatures<ST>::init(CDenseFeatures<ST>* feats, int32_t d, bool use_quadr,
00062     bool keep_lin_terms)
00063 {
00064     dim = d;
00065     dense_feats = feats;
00066     SG_REF(dense_feats);
00067     use_quadratic = use_quadr;
00068     keep_linear_terms = keep_lin_terms;
00069 
00070     SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features",
00071         MS_NOT_AVAILABLE);
00072     SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not",
00073         MS_NOT_AVAILABLE);
00074     SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE);
00075     SG_ADD((CSGObject** ) &dense_feats, "dense_feats", "Dense features to work on",
00076         MS_NOT_AVAILABLE);
00077 
00078     set_generic<ST>();
00079 }
00080 
00081 template <class ST>
00082 CHashedDenseFeatures<ST>::CHashedDenseFeatures(const CHashedDenseFeatures& orig)
00083 : CDotFeatures(orig)
00084 {
00085     init(orig.dense_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms);
00086 }
00087 
00088 template <class ST>
00089 CHashedDenseFeatures<ST>::~CHashedDenseFeatures()
00090 {
00091     SG_UNREF(dense_feats);
00092 }
00093 
00094 template <class ST>
00095 CFeatures* CHashedDenseFeatures<ST>::duplicate() const
00096 {
00097     return new CHashedDenseFeatures<ST>(*this);
00098 }
00099 
00100 template <class ST>
00101 int32_t CHashedDenseFeatures<ST>::get_dim_feature_space() const
00102 {
00103     return dim;
00104 }
00105 
00106 template <class ST>
00107 float64_t CHashedDenseFeatures<ST>::dot(int32_t vec_idx1, CDotFeatures* df,
00108     int32_t vec_idx2)
00109 {
00110     ASSERT(df)
00111     ASSERT(df->get_feature_type() == get_feature_type())
00112     ASSERT(df->get_feature_class() == get_feature_class())
00113     ASSERT(strcmp(df->get_name(), get_name())==0)
00114 
00115     CHashedDenseFeatures<ST>* feats = (CHashedDenseFeatures<ST>* ) df;
00116     ASSERT(feats->get_dim_feature_space() == get_dim_feature_space())
00117 
00118     SGSparseVector<ST> vec_1 = get_hashed_feature_vector(vec_idx1);
00119 
00120     bool same_vec = (df == this) && (vec_idx1 == vec_idx2);
00121     SGSparseVector<ST> vec_2 = same_vec ? vec_1 : feats->get_hashed_feature_vector(vec_idx2);
00122     float64_t result = vec_1.sparse_dot(vec_2);
00123 
00124     return result;
00125 }
00126 
00127 template <class ST>
00128 float64_t CHashedDenseFeatures<ST>::dense_dot(int32_t vec_idx1, float64_t* vec2,
00129     int32_t vec2_len)
00130 {
00131     ASSERT(vec2_len == dim)
00132 
00133     SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
00134 
00135     float64_t result = 0;
00136 
00137     int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
00138     SGVector<uint32_t> hash_cache(hash_cache_size);
00139 
00140     for (index_t i=0; i<vec.vlen; i++)
00141     {
00142         uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
00143         if (use_quadratic)
00144             hash_cache[i] = h_idx;
00145 
00146         if ( (!use_quadratic) || keep_linear_terms)
00147             result += vec2[h_idx % dim] * vec[i];
00148     }
00149 
00150     if (use_quadratic)
00151     {
00152         for (index_t i=0; i<vec.size(); i++)
00153         {
00154             int32_t n_idx = i * vec.size() + i;
00155             uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
00156             result += vec2[idx] * vec[i] * vec[i];
00157 
00158             for (index_t j=i+1; j<vec.size(); j++)
00159             {
00160                 idx = (hash_cache[i] ^ hash_cache[j]) % dim;
00161                 result += vec2[idx] * vec[i] * vec[j];
00162             }
00163         }
00164     }
00165 
00166     dense_feats->free_feature_vector(vec, vec_idx1);
00167     return result;
00168 }
00169 
00170 template <class ST>
00171 void CHashedDenseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t vec_idx1,
00172     float64_t* vec2, int32_t vec2_len, bool abs_val)
00173 {
00174     float64_t val = abs_val ? CMath::abs(alpha) : alpha;
00175     ASSERT(vec2_len == dim)
00176 
00177     SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1);
00178 
00179     int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
00180     SGVector<uint32_t> hash_cache(hash_cache_size);
00181 
00182     for (index_t i=0; i<vec.vlen; i++)
00183     {
00184         uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
00185 
00186         if (use_quadratic)
00187             hash_cache[i] = h_idx;
00188 
00189         if ( (!use_quadratic) || keep_linear_terms)
00190             vec2[h_idx % dim] += val * vec[i];
00191     }
00192 
00193     if (use_quadratic)
00194     {
00195         for (index_t i=0; i<vec.size(); i++)
00196         {
00197             int32_t n_idx = i * vec.size() + i;
00198             uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
00199             vec2[idx] += val * vec[i] * vec[i];
00200 
00201             for (index_t j=i+1; j<vec.size(); j++)
00202             {
00203                 idx = (hash_cache[i] ^ hash_cache[j]) % dim;
00204                 vec2[idx] += val * vec[i] * vec[j];
00205             }
00206         }
00207     }
00208     dense_feats->free_feature_vector(vec, vec_idx1);
00209 }
00210 
00211 template <class ST>
00212 int32_t CHashedDenseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
00213 {
00214     return dim;
00215 }
00216 
00217 template <class ST>
00218 void* CHashedDenseFeatures<ST>::get_feature_iterator(int32_t vector_index)
00219 {
00220     SG_NOTIMPLEMENTED;
00221     return NULL;
00222 }
00223 template <class ST>
00224 bool CHashedDenseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value,
00225     void* iterator)
00226 {
00227     SG_NOTIMPLEMENTED;
00228     return false;
00229 }
00230 template <class ST>
00231 void CHashedDenseFeatures<ST>::free_feature_iterator(void* iterator)
00232 {
00233     SG_NOTIMPLEMENTED;
00234 }
00235 
00236 template <class ST>
00237 const char* CHashedDenseFeatures<ST>::get_name() const
00238 {
00239     return "HashedDenseFeatures";
00240 }
00241 
00242 template <class ST>
00243 EFeatureType CHashedDenseFeatures<ST>::get_feature_type() const
00244 {
00245     return F_UINT;
00246 }
00247 
00248 template <class ST>
00249 EFeatureClass CHashedDenseFeatures<ST>::get_feature_class() const
00250 {
00251     return C_SPARSE;
00252 }
00253 
00254 template <class ST>
00255 int32_t CHashedDenseFeatures<ST>::get_num_vectors() const
00256 {
00257     return dense_feats->get_num_vectors();
00258 }
00259 
00260 template <class ST>
00261 SGSparseVector<ST> CHashedDenseFeatures<ST>::get_hashed_feature_vector(int32_t vec_idx)
00262 {
00263     SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx);
00264     SGSparseVector<ST> hashed_vec = CHashedDenseFeatures<ST>::hash_vector(
00265             vec, dim, use_quadratic, keep_linear_terms);
00266     dense_feats->free_feature_vector(vec, vec_idx);
00267     return hashed_vec;
00268 }
00269 
00270 template <class ST>
00271 SGSparseVector<ST> CHashedDenseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim,
00272     bool use_quadratic, bool keep_linear_terms)
00273 {
00274     SGVector<ST> h_vec(dim);
00275     SGVector<ST>::fill_vector(h_vec.vector, dim, 0);
00276 
00277     int32_t hash_cache_size = use_quadratic ? vec.vlen : 0;
00278     SGVector<uint32_t> hash_cache(hash_cache_size);
00279 
00280     for (index_t i=0; i<vec.size(); i++)
00281     {
00282         uint32_t hash = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i);
00283         if (use_quadratic)
00284             hash_cache[i] = hash;
00285 
00286         if ( (!use_quadratic) || keep_linear_terms)
00287             h_vec[hash % dim] += vec[i];
00288     }
00289 
00290     if (use_quadratic)
00291     {
00292         for (index_t i=0; i<vec.size(); i++)
00293         {
00294             index_t n_idx = i * vec.size() + i;
00295             uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim;
00296             h_vec[idx] += vec[i] * vec[i];
00297 
00298             for (index_t j=i+1; j<vec.size(); j++)
00299             {
00300                 idx = (hash_cache[i] ^ hash_cache[j]) % dim;
00301                 h_vec[idx] += vec[i] * vec[j];
00302             }
00303         }
00304     }
00305 
00306     int32_t num_nnz_feats = 0;
00307     for(index_t i=0; i<dim; i++)
00308     {
00309         if (h_vec[i]!=0)
00310             num_nnz_feats++;
00311     }
00312 
00313     SGSparseVector<ST> hashed_vector(num_nnz_feats);
00314 
00315     int32_t sparse_feat_index = 0;
00316     for (index_t i=0; i<dim; i++)
00317     {
00318         if (h_vec[i]!=0)
00319         {
00320             hashed_vector.features[sparse_feat_index].feat_index = i;
00321             hashed_vector.features[sparse_feat_index++].entry = h_vec[i];
00322         }
00323     }
00324 
00325     return hashed_vector;
00326 }
00327 
00328 template class CHashedDenseFeatures<bool>;
00329 template class CHashedDenseFeatures<char>;
00330 template class CHashedDenseFeatures<int8_t>;
00331 template class CHashedDenseFeatures<uint8_t>;
00332 template class CHashedDenseFeatures<int16_t>;
00333 template class CHashedDenseFeatures<uint16_t>;
00334 template class CHashedDenseFeatures<int32_t>;
00335 template class CHashedDenseFeatures<uint32_t>;
00336 template class CHashedDenseFeatures<int64_t>;
00337 template class CHashedDenseFeatures<uint64_t>;
00338 template class CHashedDenseFeatures<float32_t>;
00339 template class CHashedDenseFeatures<float64_t>;
00340 template class CHashedDenseFeatures<floatmax_t>;
00341 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation