SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #include <shogun/features/HashedDenseFeatures.h> 00012 #include <shogun/base/Parameter.h> 00013 #include <shogun/lib/Hash.h> 00014 #include <shogun/lib/DynamicArray.h> 00015 #include <shogun/io/SGIO.h> 00016 #include <shogun/mathematics/Math.h> 00017 00018 #include <string.h> 00019 00020 namespace shogun { 00021 template <class ST> 00022 CHashedDenseFeatures<ST>::CHashedDenseFeatures(int32_t size, bool use_quadr, bool keep_lin_terms) 00023 : CDotFeatures(size) 00024 { 00025 init(NULL, 0, use_quadr, keep_lin_terms); 00026 } 00027 00028 template <class ST> 00029 CHashedDenseFeatures<ST>::CHashedDenseFeatures(CDenseFeatures<ST>* feats, int32_t d, 00030 bool use_quadr, bool keep_lin_terms) : CDotFeatures() 00031 { 00032 init(feats, d, use_quadr, keep_lin_terms); 00033 } 00034 00035 template <class ST> 00036 CHashedDenseFeatures<ST>::CHashedDenseFeatures(SGMatrix<ST> matrix, int32_t d, bool use_quadr, 00037 bool keep_lin_terms) : CDotFeatures() 00038 { 00039 CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(matrix); 00040 init(feats, d, use_quadr, keep_lin_terms); 00041 } 00042 00043 template <class ST> 00044 CHashedDenseFeatures<ST>::CHashedDenseFeatures(ST* src, int32_t num_feat, int32_t num_vec, 00045 int32_t d, bool use_quadr, bool keep_lin_terms) : CDotFeatures() 00046 { 00047 CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(src, num_feat, num_vec); 00048 init(feats, d, use_quadr, keep_lin_terms); 00049 } 00050 00051 template <class ST> 00052 CHashedDenseFeatures<ST>::CHashedDenseFeatures(CFile* loader, int32_t d, bool use_quadr, 00053 bool keep_lin_terms) : CDotFeatures(loader) 00054 { 00055 CDenseFeatures<ST>* feats = new CDenseFeatures<ST>(); 00056 feats->load(loader); 00057 init(feats, d, use_quadr, keep_lin_terms); 00058 } 00059 00060 template <class ST> 00061 void CHashedDenseFeatures<ST>::init(CDenseFeatures<ST>* feats, int32_t d, bool use_quadr, 00062 bool keep_lin_terms) 00063 { 00064 dim = d; 00065 dense_feats = feats; 00066 SG_REF(dense_feats); 00067 use_quadratic = use_quadr; 00068 keep_linear_terms = keep_lin_terms; 00069 00070 SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features", 00071 MS_NOT_AVAILABLE); 00072 SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not", 00073 MS_NOT_AVAILABLE); 00074 SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE); 00075 SG_ADD((CSGObject** ) &dense_feats, "dense_feats", "Dense features to work on", 00076 MS_NOT_AVAILABLE); 00077 00078 set_generic<ST>(); 00079 } 00080 00081 template <class ST> 00082 CHashedDenseFeatures<ST>::CHashedDenseFeatures(const CHashedDenseFeatures& orig) 00083 : CDotFeatures(orig) 00084 { 00085 init(orig.dense_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms); 00086 } 00087 00088 template <class ST> 00089 CHashedDenseFeatures<ST>::~CHashedDenseFeatures() 00090 { 00091 SG_UNREF(dense_feats); 00092 } 00093 00094 template <class ST> 00095 CFeatures* CHashedDenseFeatures<ST>::duplicate() const 00096 { 00097 return new CHashedDenseFeatures<ST>(*this); 00098 } 00099 00100 template <class ST> 00101 int32_t CHashedDenseFeatures<ST>::get_dim_feature_space() const 00102 { 00103 return dim; 00104 } 00105 00106 template <class ST> 00107 float64_t CHashedDenseFeatures<ST>::dot(int32_t vec_idx1, CDotFeatures* df, 00108 int32_t vec_idx2) 00109 { 00110 ASSERT(df) 00111 ASSERT(df->get_feature_type() == get_feature_type()) 00112 ASSERT(df->get_feature_class() == get_feature_class()) 00113 ASSERT(strcmp(df->get_name(), get_name())==0) 00114 00115 CHashedDenseFeatures<ST>* feats = (CHashedDenseFeatures<ST>* ) df; 00116 ASSERT(feats->get_dim_feature_space() == get_dim_feature_space()) 00117 00118 SGSparseVector<ST> vec_1 = get_hashed_feature_vector(vec_idx1); 00119 00120 bool same_vec = (df == this) && (vec_idx1 == vec_idx2); 00121 SGSparseVector<ST> vec_2 = same_vec ? vec_1 : feats->get_hashed_feature_vector(vec_idx2); 00122 float64_t result = vec_1.sparse_dot(vec_2); 00123 00124 return result; 00125 } 00126 00127 template <class ST> 00128 float64_t CHashedDenseFeatures<ST>::dense_dot(int32_t vec_idx1, float64_t* vec2, 00129 int32_t vec2_len) 00130 { 00131 ASSERT(vec2_len == dim) 00132 00133 SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1); 00134 00135 float64_t result = 0; 00136 00137 int32_t hash_cache_size = use_quadratic ? vec.vlen : 0; 00138 SGVector<uint32_t> hash_cache(hash_cache_size); 00139 00140 for (index_t i=0; i<vec.vlen; i++) 00141 { 00142 uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i); 00143 if (use_quadratic) 00144 hash_cache[i] = h_idx; 00145 00146 if ( (!use_quadratic) || keep_linear_terms) 00147 result += vec2[h_idx % dim] * vec[i]; 00148 } 00149 00150 if (use_quadratic) 00151 { 00152 for (index_t i=0; i<vec.size(); i++) 00153 { 00154 int32_t n_idx = i * vec.size() + i; 00155 uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim; 00156 result += vec2[idx] * vec[i] * vec[i]; 00157 00158 for (index_t j=i+1; j<vec.size(); j++) 00159 { 00160 idx = (hash_cache[i] ^ hash_cache[j]) % dim; 00161 result += vec2[idx] * vec[i] * vec[j]; 00162 } 00163 } 00164 } 00165 00166 dense_feats->free_feature_vector(vec, vec_idx1); 00167 return result; 00168 } 00169 00170 template <class ST> 00171 void CHashedDenseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, 00172 float64_t* vec2, int32_t vec2_len, bool abs_val) 00173 { 00174 float64_t val = abs_val ? CMath::abs(alpha) : alpha; 00175 ASSERT(vec2_len == dim) 00176 00177 SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx1); 00178 00179 int32_t hash_cache_size = use_quadratic ? vec.vlen : 0; 00180 SGVector<uint32_t> hash_cache(hash_cache_size); 00181 00182 for (index_t i=0; i<vec.vlen; i++) 00183 { 00184 uint32_t h_idx = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i); 00185 00186 if (use_quadratic) 00187 hash_cache[i] = h_idx; 00188 00189 if ( (!use_quadratic) || keep_linear_terms) 00190 vec2[h_idx % dim] += val * vec[i]; 00191 } 00192 00193 if (use_quadratic) 00194 { 00195 for (index_t i=0; i<vec.size(); i++) 00196 { 00197 int32_t n_idx = i * vec.size() + i; 00198 uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim; 00199 vec2[idx] += val * vec[i] * vec[i]; 00200 00201 for (index_t j=i+1; j<vec.size(); j++) 00202 { 00203 idx = (hash_cache[i] ^ hash_cache[j]) % dim; 00204 vec2[idx] += val * vec[i] * vec[j]; 00205 } 00206 } 00207 } 00208 dense_feats->free_feature_vector(vec, vec_idx1); 00209 } 00210 00211 template <class ST> 00212 int32_t CHashedDenseFeatures<ST>::get_nnz_features_for_vector(int32_t num) 00213 { 00214 return dim; 00215 } 00216 00217 template <class ST> 00218 void* CHashedDenseFeatures<ST>::get_feature_iterator(int32_t vector_index) 00219 { 00220 SG_NOTIMPLEMENTED; 00221 return NULL; 00222 } 00223 template <class ST> 00224 bool CHashedDenseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, 00225 void* iterator) 00226 { 00227 SG_NOTIMPLEMENTED; 00228 return false; 00229 } 00230 template <class ST> 00231 void CHashedDenseFeatures<ST>::free_feature_iterator(void* iterator) 00232 { 00233 SG_NOTIMPLEMENTED; 00234 } 00235 00236 template <class ST> 00237 const char* CHashedDenseFeatures<ST>::get_name() const 00238 { 00239 return "HashedDenseFeatures"; 00240 } 00241 00242 template <class ST> 00243 EFeatureType CHashedDenseFeatures<ST>::get_feature_type() const 00244 { 00245 return F_UINT; 00246 } 00247 00248 template <class ST> 00249 EFeatureClass CHashedDenseFeatures<ST>::get_feature_class() const 00250 { 00251 return C_SPARSE; 00252 } 00253 00254 template <class ST> 00255 int32_t CHashedDenseFeatures<ST>::get_num_vectors() const 00256 { 00257 return dense_feats->get_num_vectors(); 00258 } 00259 00260 template <class ST> 00261 SGSparseVector<ST> CHashedDenseFeatures<ST>::get_hashed_feature_vector(int32_t vec_idx) 00262 { 00263 SGVector<ST> vec = dense_feats->get_feature_vector(vec_idx); 00264 SGSparseVector<ST> hashed_vec = CHashedDenseFeatures<ST>::hash_vector( 00265 vec, dim, use_quadratic, keep_linear_terms); 00266 dense_feats->free_feature_vector(vec, vec_idx); 00267 return hashed_vec; 00268 } 00269 00270 template <class ST> 00271 SGSparseVector<ST> CHashedDenseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim, 00272 bool use_quadratic, bool keep_linear_terms) 00273 { 00274 SGVector<ST> h_vec(dim); 00275 SGVector<ST>::fill_vector(h_vec.vector, dim, 0); 00276 00277 int32_t hash_cache_size = use_quadratic ? vec.vlen : 0; 00278 SGVector<uint32_t> hash_cache(hash_cache_size); 00279 00280 for (index_t i=0; i<vec.size(); i++) 00281 { 00282 uint32_t hash = CHash::MurmurHash3((uint8_t* ) &i, sizeof (index_t), i); 00283 if (use_quadratic) 00284 hash_cache[i] = hash; 00285 00286 if ( (!use_quadratic) || keep_linear_terms) 00287 h_vec[hash % dim] += vec[i]; 00288 } 00289 00290 if (use_quadratic) 00291 { 00292 for (index_t i=0; i<vec.size(); i++) 00293 { 00294 index_t n_idx = i * vec.size() + i; 00295 uint32_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), n_idx) % dim; 00296 h_vec[idx] += vec[i] * vec[i]; 00297 00298 for (index_t j=i+1; j<vec.size(); j++) 00299 { 00300 idx = (hash_cache[i] ^ hash_cache[j]) % dim; 00301 h_vec[idx] += vec[i] * vec[j]; 00302 } 00303 } 00304 } 00305 00306 int32_t num_nnz_feats = 0; 00307 for(index_t i=0; i<dim; i++) 00308 { 00309 if (h_vec[i]!=0) 00310 num_nnz_feats++; 00311 } 00312 00313 SGSparseVector<ST> hashed_vector(num_nnz_feats); 00314 00315 int32_t sparse_feat_index = 0; 00316 for (index_t i=0; i<dim; i++) 00317 { 00318 if (h_vec[i]!=0) 00319 { 00320 hashed_vector.features[sparse_feat_index].feat_index = i; 00321 hashed_vector.features[sparse_feat_index++].entry = h_vec[i]; 00322 } 00323 } 00324 00325 return hashed_vector; 00326 } 00327 00328 template class CHashedDenseFeatures<bool>; 00329 template class CHashedDenseFeatures<char>; 00330 template class CHashedDenseFeatures<int8_t>; 00331 template class CHashedDenseFeatures<uint8_t>; 00332 template class CHashedDenseFeatures<int16_t>; 00333 template class CHashedDenseFeatures<uint16_t>; 00334 template class CHashedDenseFeatures<int32_t>; 00335 template class CHashedDenseFeatures<uint32_t>; 00336 template class CHashedDenseFeatures<int64_t>; 00337 template class CHashedDenseFeatures<uint64_t>; 00338 template class CHashedDenseFeatures<float32_t>; 00339 template class CHashedDenseFeatures<float64_t>; 00340 template class CHashedDenseFeatures<floatmax_t>; 00341 }