SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #include <shogun/features/HashedSparseFeatures.h> 00012 #include <shogun/features/HashedDenseFeatures.h> 00013 #include <shogun/base/Parameter.h> 00014 #include <shogun/lib/Hash.h> 00015 #include <shogun/io/SGIO.h> 00016 #include <shogun/lib/DynamicArray.h> 00017 #include <shogun/lib/SGSparseVector.h> 00018 #include <shogun/mathematics/Math.h> 00019 #include <string.h> 00020 #include <iostream> 00021 00022 namespace shogun { 00023 00024 template <class ST> 00025 CHashedSparseFeatures<ST>::CHashedSparseFeatures(int32_t size, bool use_quadr, 00026 bool keep_lin_terms) : CDotFeatures(size) 00027 { 00028 init(NULL, 0, use_quadr, keep_lin_terms); 00029 } 00030 00031 template <class ST> 00032 CHashedSparseFeatures<ST>::CHashedSparseFeatures(CSparseFeatures<ST>* feats, int32_t d, 00033 bool use_quadr, bool keep_lin_terms) : CDotFeatures() 00034 { 00035 init(feats, d, use_quadr, keep_lin_terms); 00036 } 00037 00038 template <class ST> 00039 CHashedSparseFeatures<ST>::CHashedSparseFeatures(SGSparseMatrix<ST> matrix, int32_t d, 00040 bool use_quadr, bool keep_lin_terms) : CDotFeatures() 00041 { 00042 CSparseFeatures<ST>* feats = new CSparseFeatures<ST>(matrix); 00043 init(feats, d, use_quadr, keep_lin_terms); 00044 } 00045 00046 template <class ST> 00047 CHashedSparseFeatures<ST>::CHashedSparseFeatures(CFile* loader, int32_t d, bool use_quadr, 00048 bool keep_lin_terms) : CDotFeatures(loader) 00049 { 00050 CSparseFeatures<ST>* feats = new CSparseFeatures<ST>(); 00051 feats->load(loader); 00052 init(feats, d, use_quadr, keep_lin_terms); 00053 } 00054 00055 template <class ST> 00056 void CHashedSparseFeatures<ST>::init(CSparseFeatures<ST>* feats, int32_t d, bool use_quadr, 00057 bool keep_lin_terms) 00058 { 00059 dim = d; 00060 use_quadratic = use_quadr; 00061 keep_linear_terms = keep_lin_terms; 00062 sparse_feats = feats; 00063 SG_REF(sparse_feats); 00064 00065 SG_ADD(&use_quadratic, "use_quadratic", "Whether to use quadratic features", 00066 MS_NOT_AVAILABLE); 00067 SG_ADD(&keep_linear_terms, "keep_linear_terms", "Whether to keep the linear terms or not", 00068 MS_NOT_AVAILABLE); 00069 SG_ADD(&dim, "dim", "Dimension of new feature space", MS_NOT_AVAILABLE); 00070 SG_ADD((CSGObject** ) &sparse_feats, "sparse_feats", "Sparse features to work on", 00071 MS_NOT_AVAILABLE); 00072 00073 set_generic<ST>(); 00074 } 00075 00076 template <class ST> 00077 CHashedSparseFeatures<ST>::CHashedSparseFeatures(const CHashedSparseFeatures& orig) 00078 : CDotFeatures(orig) 00079 { 00080 init(orig.sparse_feats, orig.dim, orig.use_quadratic, orig.keep_linear_terms); 00081 } 00082 00083 template <class ST> 00084 CHashedSparseFeatures<ST>::~CHashedSparseFeatures() 00085 { 00086 SG_UNREF(sparse_feats); 00087 } 00088 00089 template <class ST> 00090 CFeatures* CHashedSparseFeatures<ST>::duplicate() const 00091 { 00092 return new CHashedSparseFeatures<ST>(*this); 00093 } 00094 00095 template <class ST> 00096 int32_t CHashedSparseFeatures<ST>::get_dim_feature_space() const 00097 { 00098 return dim; 00099 } 00100 00101 template <class ST> 00102 SGSparseVector<ST> CHashedSparseFeatures<ST>::get_hashed_feature_vector( 00103 int32_t vec_idx) const 00104 { 00105 return CHashedSparseFeatures<ST>::hash_vector(sparse_feats->get_sparse_feature_vector(vec_idx), 00106 dim, use_quadratic, keep_linear_terms); 00107 } 00108 00109 template <class ST> 00110 SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGVector<ST> vec, int32_t dim, 00111 bool use_quadratic, bool keep_linear_terms) 00112 { 00113 return CHashedDenseFeatures<ST>::hash_vector(vec, dim, use_quadratic, keep_linear_terms); 00114 } 00115 00116 template <class ST> 00117 SGSparseVector<ST> CHashedSparseFeatures<ST>::hash_vector(SGSparseVector<ST> vec, int32_t dim, 00118 bool use_quadratic, bool keep_linear_terms) 00119 { 00120 SGVector<ST> h_vec(dim); 00121 SGVector<ST>::fill_vector(h_vec, dim, 0); 00122 00123 int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0; 00124 SGVector<uint32_t> hash_cache(hash_cache_size); 00125 00126 for (index_t i=0; i<vec.num_feat_entries; i++) 00127 { 00128 uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t), 00129 vec.features[i].feat_index); 00130 00131 if (use_quadratic) 00132 hash_cache[i] = hash; 00133 00134 if ( (!use_quadratic) || keep_linear_terms ) 00135 h_vec[hash % dim] += vec.features[i].entry; 00136 } 00137 00138 if (use_quadratic) 00139 { 00140 for (index_t i=0; i<vec.num_feat_entries; i++) 00141 { 00142 index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index; 00143 index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof(index_t), 00144 vec.features[i].feat_index) % dim; 00145 00146 h_vec[idx] += vec.features[i].entry * vec.features[i].entry; 00147 00148 for (index_t j=i+1; j<vec.num_feat_entries; j++) 00149 { 00150 idx = (hash_cache[i] ^ hash_cache[j]) % dim; 00151 h_vec[idx] += vec.features[i].entry * vec.features[j].entry; 00152 } 00153 } 00154 } 00155 00156 int32_t num_nnz_features = 0; 00157 for (index_t i=0; i<dim; i++) 00158 { 00159 if (h_vec[i]!=0) 00160 num_nnz_features++; 00161 } 00162 00163 SGSparseVector<ST> sv(num_nnz_features); 00164 00165 int32_t sparse_index = 0; 00166 for (index_t i=0; i<dim; i++) 00167 { 00168 if (h_vec[i]!=0) 00169 { 00170 sv.features[sparse_index].entry = h_vec[i]; 00171 sv.features[sparse_index++].feat_index = i; 00172 } 00173 } 00174 00175 return sv; 00176 } 00177 00178 template <class ST> 00179 float64_t CHashedSparseFeatures<ST>::dot(int32_t vec_idx1, CDotFeatures* df, 00180 int32_t vec_idx2) 00181 { 00182 ASSERT(df) 00183 ASSERT(df->get_feature_type() == get_feature_type()) 00184 ASSERT(df->get_feature_class() == get_feature_class()) 00185 ASSERT(strcmp(df->get_name(), get_name())==0) 00186 00187 CHashedSparseFeatures<ST>* feats = (CHashedSparseFeatures<ST>* ) df; 00188 SGSparseVector<ST> vec_1 = get_hashed_feature_vector(vec_idx1); 00189 SGSparseVector<ST> vec_2 = feats->get_hashed_feature_vector(vec_idx2); 00190 00191 float64_t result = vec_1.sparse_dot(vec_2); 00192 return result; 00193 } 00194 00195 template <class ST> 00196 float64_t CHashedSparseFeatures<ST>::dense_dot(int32_t vec_idx1, float64_t* vec2, 00197 int32_t vec2_len) 00198 { 00199 ASSERT(vec2_len == dim) 00200 00201 SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1); 00202 00203 int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0; 00204 SGVector<uint32_t> hash_cache(hash_cache_size); 00205 00206 float64_t result = 0; 00207 for (index_t i=0; i<vec.num_feat_entries; i++) 00208 { 00209 uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t), 00210 vec.features[i].feat_index); 00211 00212 if (use_quadratic) 00213 hash_cache[i] = hash; 00214 00215 if ( (!use_quadratic) || keep_linear_terms) 00216 result += vec2[hash % dim] * vec.features[i].entry; 00217 } 00218 00219 if (use_quadratic) 00220 { 00221 for (index_t i=0; i<vec.num_feat_entries; i++) 00222 { 00223 index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index; 00224 index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), 00225 vec.features[i].feat_index) % dim; 00226 00227 result += vec2[idx] * vec.features[i].entry * vec.features[i].entry; 00228 00229 for (index_t j=i+1; j<vec.num_feat_entries; j++) 00230 { 00231 idx = (hash_cache[i] ^ hash_cache[j]) % dim; 00232 result += vec2[idx] * vec.features[i].entry * vec.features[j].entry; 00233 } 00234 } 00235 } 00236 00237 sparse_feats ->free_feature_vector(vec_idx1); 00238 return result; 00239 } 00240 00241 template <class ST> 00242 void CHashedSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, 00243 float64_t* vec2, int32_t vec2_len, bool abs_val) 00244 { 00245 float64_t val = abs_val ? CMath::abs(alpha) : alpha; 00246 ASSERT(vec2_len == dim) 00247 00248 SGSparseVector<ST> vec = sparse_feats->get_sparse_feature_vector(vec_idx1); 00249 00250 int32_t hash_cache_size = use_quadratic ? vec.num_feat_entries : 0; 00251 SGVector<uint32_t> hash_cache(hash_cache_size); 00252 00253 for (index_t i=0; i<vec.num_feat_entries; i++) 00254 { 00255 uint32_t hash = CHash::MurmurHash3((uint8_t* ) &vec.features[i].feat_index, sizeof (index_t), 00256 vec.features[i].feat_index); 00257 if (use_quadratic) 00258 hash_cache[i] = hash; 00259 00260 if ( (!use_quadratic) || keep_linear_terms) 00261 vec2[hash % dim] += val * vec.features[i].entry; 00262 } 00263 00264 if (use_quadratic) 00265 { 00266 for (index_t i=0; i<vec.num_feat_entries; i++) 00267 { 00268 index_t n_idx = vec.features[i].feat_index + vec.features[i].feat_index; 00269 index_t idx = CHash::MurmurHash3((uint8_t* ) &n_idx, sizeof (index_t), 00270 vec.features[i].feat_index) % dim; 00271 00272 vec2[idx] += val * vec.features[i].entry * vec.features[i].entry; 00273 00274 for (index_t j=i+1; j<vec.num_feat_entries; j++) 00275 { 00276 idx = (hash_cache[i] ^ hash_cache[j]) % dim; 00277 vec2[idx] += val * vec.features[i].entry * vec.features[j].entry; 00278 } 00279 } 00280 } 00281 sparse_feats ->free_feature_vector(vec_idx1); 00282 } 00283 00284 template <class ST> 00285 int32_t CHashedSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num) 00286 { 00287 return dim; 00288 } 00289 00290 template <class ST> 00291 void* CHashedSparseFeatures<ST>::get_feature_iterator(int32_t vector_index) 00292 { 00293 SG_NOTIMPLEMENTED; 00294 return NULL; 00295 } 00296 template <class ST> 00297 bool CHashedSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, 00298 void* iterator) 00299 { 00300 SG_NOTIMPLEMENTED; 00301 return false; 00302 } 00303 template <class ST> 00304 void CHashedSparseFeatures<ST>::free_feature_iterator(void* iterator) 00305 { 00306 SG_NOTIMPLEMENTED; 00307 } 00308 00309 template <class ST> 00310 const char* CHashedSparseFeatures<ST>::get_name() const 00311 { 00312 return "HashedSparseFeatures"; 00313 } 00314 00315 template <class ST> 00316 EFeatureType CHashedSparseFeatures<ST>::get_feature_type() const 00317 { 00318 return F_UINT; 00319 } 00320 00321 template <class ST> 00322 EFeatureClass CHashedSparseFeatures<ST>::get_feature_class() const 00323 { 00324 return C_SPARSE; 00325 } 00326 00327 template <class ST> 00328 int32_t CHashedSparseFeatures<ST>::get_num_vectors() const 00329 { 00330 return sparse_feats ->get_num_vectors(); 00331 } 00332 00333 template class CHashedSparseFeatures <bool>; 00334 template class CHashedSparseFeatures <char>; 00335 template class CHashedSparseFeatures <int8_t>; 00336 template class CHashedSparseFeatures <uint8_t>; 00337 template class CHashedSparseFeatures <int16_t>; 00338 template class CHashedSparseFeatures <uint16_t>; 00339 template class CHashedSparseFeatures <int32_t>; 00340 template class CHashedSparseFeatures <uint32_t>; 00341 template class CHashedSparseFeatures <int64_t>; 00342 template class CHashedSparseFeatures <uint64_t>; 00343 template class CHashedSparseFeatures <float32_t>; 00344 template class CHashedSparseFeatures <float64_t>; 00345 template class CHashedSparseFeatures <floatmax_t>; 00346 }