SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2011 Shashwat Lal Das 00008 * Modifications (W) 2013 Thoralf Klein 00009 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society 00010 */ 00011 #include <shogun/features/streaming/StreamingSparseFeatures.h> 00012 namespace shogun 00013 { 00014 00015 template <class T> 00016 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures() 00017 { 00018 set_read_functions(); 00019 init(); 00020 } 00021 00022 template <class T> 00023 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file, 00024 bool is_labelled, 00025 int32_t size) 00026 : CStreamingDotFeatures() 00027 { 00028 set_read_functions(); 00029 init(file, is_labelled, size); 00030 } 00031 00032 template <class T> 00033 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures() 00034 { 00035 if (parser.is_running()) 00036 parser.end_parser(); 00037 } 00038 00039 template <class T> 00040 T CStreamingSparseFeatures<T>::get_feature(int32_t index) 00041 { 00042 ASSERT(index>=0 && index<current_num_features) 00043 return current_sgvector.get_feature(index); 00044 } 00045 00046 template <class T> 00047 void CStreamingSparseFeatures<T>::reset_stream() 00048 { 00049 SG_NOTIMPLEMENTED 00050 } 00051 00052 template <class T> 00053 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num) 00054 { 00055 int32_t n=current_num_features; 00056 ASSERT(n<=num) 00057 current_num_features=num; 00058 return n; 00059 } 00060 00061 template <class T> 00062 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen) 00063 { 00064 T result=0; 00065 00066 //result remains zero when one of the vectors is non existent 00067 if (avec && bvec) 00068 { 00069 SGSparseVector<T> asv(avec, alen, false); 00070 SGSparseVector<T> bsv(bvec, blen, false); 00071 00072 result=alpha*SGSparseVector<T>::sparse_dot(asv, bsv); 00073 } 00074 00075 return result; 00076 } 00077 00078 template <class T> 00079 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b) 00080 { 00081 ASSERT(vec) 00082 ASSERT(dim>=current_num_features) 00083 00084 return current_sgvector.dense_dot(alpha, vec, dim, b); 00085 } 00086 00087 template <class T> 00088 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len) 00089 { 00090 ASSERT(vec2) 00091 00092 int32_t current_length = current_sgvector.num_feat_entries; 00093 SGSparseVectorEntry<T>* current_vector = current_sgvector.features; 00094 00095 float64_t result=0; 00096 if (current_vector) 00097 { 00098 for (int32_t i=0; i<current_length; i++) { 00099 if (current_vector[i].feat_index < vec2_len) { 00100 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry; 00101 } 00102 } 00103 } 00104 00105 return result; 00106 } 00107 00108 template <class T> 00109 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len) 00110 { 00111 ASSERT(vec2) 00112 00113 int32_t current_length = current_sgvector.num_feat_entries; 00114 SGSparseVectorEntry<T>* current_vector = current_sgvector.features; 00115 00116 float32_t result=0; 00117 if (current_vector) 00118 { 00119 for (int32_t i=0; i<current_length; i++) { 00120 if (current_vector[i].feat_index < vec2_len) { 00121 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry; 00122 } 00123 } 00124 } 00125 00126 return result; 00127 } 00128 00129 template <class T> 00130 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val) 00131 { 00132 ASSERT(vec2) 00133 if (vec2_len < current_num_features) 00134 { 00135 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00136 vec2_len, current_num_features); 00137 } 00138 00139 SGSparseVectorEntry<T>* sv=current_sgvector.features; 00140 int32_t num_feat=current_sgvector.num_feat_entries; 00141 00142 if (sv) 00143 { 00144 if (abs_val) 00145 { 00146 for (int32_t i=0; i<num_feat; i++) 00147 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00148 } 00149 else 00150 { 00151 for (int32_t i=0; i<num_feat; i++) 00152 vec2[sv[i].feat_index]+= alpha*sv[i].entry; 00153 } 00154 } 00155 } 00156 00157 template <class T> 00158 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val) 00159 { 00160 ASSERT(vec2) 00161 if (vec2_len < current_num_features) 00162 { 00163 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00164 vec2_len, current_num_features); 00165 } 00166 00167 SGSparseVectorEntry<T>* sv=current_sgvector.features; 00168 int32_t num_feat=current_sgvector.num_feat_entries; 00169 00170 if (sv) 00171 { 00172 if (abs_val) 00173 { 00174 for (int32_t i=0; i<num_feat; i++) 00175 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00176 } 00177 else 00178 { 00179 for (int32_t i=0; i<num_feat; i++) 00180 vec2[sv[i].feat_index]+= alpha*sv[i].entry; 00181 } 00182 } 00183 } 00184 00185 template <class T> 00186 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries() 00187 { 00188 return current_sgvector.num_feat_entries; 00189 } 00190 00191 template <class T> 00192 float32_t CStreamingSparseFeatures<T>::compute_squared() 00193 { 00194 int32_t current_length = current_sgvector.num_feat_entries; 00195 SGSparseVectorEntry<T>* current_vector = current_sgvector.features; 00196 00197 ASSERT(current_vector) 00198 00199 float32_t sq=0; 00200 00201 for (int32_t i=0; i<current_length; i++) 00202 sq += current_vector[i].entry * current_vector[i].entry; 00203 00204 return sq; 00205 } 00206 00207 template <class T> 00208 void CStreamingSparseFeatures<T>::sort_features() 00209 { 00210 SGSparseVectorEntry<T>* old_ptr = current_sgvector.features; 00211 00212 // setting false to disallow reallocation 00213 // and guarantee stable get_vector().features pointer 00214 get_vector().sort_features(true); 00215 00216 ASSERT(old_ptr == current_sgvector.features); 00217 } 00218 00219 template <class T> 00220 CFeatures* CStreamingSparseFeatures<T>::duplicate() const 00221 { 00222 return new CStreamingSparseFeatures<T>(*this); 00223 } 00224 00225 template <class T> 00226 int32_t CStreamingSparseFeatures<T>::get_num_vectors() const 00227 { 00228 if (current_sgvector.features) 00229 return 1; 00230 return 0; 00231 } 00232 00233 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader() 00234 { 00235 parser.set_read_vector(&CStreamingFile::get_sparse_vector); 00236 } 00237 00238 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader() 00239 { 00240 parser.set_read_vector_and_label 00241 (&CStreamingFile::get_sparse_vector_and_label); 00242 } 00243 00244 #define GET_FEATURE_TYPE(f_type, sg_type) \ 00245 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const \ 00246 { \ 00247 return f_type; \ 00248 } 00249 00250 GET_FEATURE_TYPE(F_BOOL, bool) 00251 GET_FEATURE_TYPE(F_CHAR, char) 00252 GET_FEATURE_TYPE(F_BYTE, uint8_t) 00253 GET_FEATURE_TYPE(F_BYTE, int8_t) 00254 GET_FEATURE_TYPE(F_SHORT, int16_t) 00255 GET_FEATURE_TYPE(F_WORD, uint16_t) 00256 GET_FEATURE_TYPE(F_INT, int32_t) 00257 GET_FEATURE_TYPE(F_UINT, uint32_t) 00258 GET_FEATURE_TYPE(F_LONG, int64_t) 00259 GET_FEATURE_TYPE(F_ULONG, uint64_t) 00260 GET_FEATURE_TYPE(F_SHORTREAL, float32_t) 00261 GET_FEATURE_TYPE(F_DREAL, float64_t) 00262 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t) 00263 #undef GET_FEATURE_TYPE 00264 00265 00266 template <class T> 00267 void CStreamingSparseFeatures<T>::init() 00268 { 00269 working_file=NULL; 00270 current_vec_index=0; 00271 current_num_features=-1; 00272 00273 set_generic<T>(); 00274 } 00275 00276 template <class T> 00277 void CStreamingSparseFeatures<T>::init(CStreamingFile* file, 00278 bool is_labelled, 00279 int32_t size) 00280 { 00281 init(); 00282 has_labels = is_labelled; 00283 working_file = file; 00284 SG_REF(working_file); 00285 parser.init(file, is_labelled, size); 00286 parser.set_free_vector_after_release(false); 00287 } 00288 00289 template <class T> 00290 void CStreamingSparseFeatures<T>::start_parser() 00291 { 00292 if (!parser.is_running()) 00293 parser.start_parser(); 00294 } 00295 00296 template <class T> 00297 void CStreamingSparseFeatures<T>::end_parser() 00298 { 00299 parser.end_parser(); 00300 } 00301 00302 template <class T> 00303 bool CStreamingSparseFeatures<T>::get_next_example() 00304 { 00305 int32_t current_length = 0; 00306 SGSparseVectorEntry<T>* current_vector = NULL; 00307 00308 bool ret_value; 00309 ret_value = (bool) parser.get_next_example(current_vector, 00310 current_length, 00311 current_label); 00312 00313 if (!ret_value) 00314 return false; 00315 00316 // ref_count disabled, because parser still owns the memory 00317 current_sgvector = SGSparseVector<T>(current_vector, current_length, false); 00318 00319 // Update number of features based on highest index 00320 int32_t current_dimension = get_vector().get_num_dimensions(); 00321 current_num_features = CMath::max(current_num_features, current_dimension); 00322 00323 current_vec_index++; 00324 return true; 00325 } 00326 00327 template <class T> 00328 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector() 00329 { 00330 return current_sgvector; 00331 } 00332 00333 template <class T> 00334 float64_t CStreamingSparseFeatures<T>::get_label() 00335 { 00336 ASSERT(has_labels) 00337 00338 return current_label; 00339 } 00340 00341 template <class T> 00342 void CStreamingSparseFeatures<T>::release_example() 00343 { 00344 parser.finalize_example(); 00345 } 00346 00347 template <class T> 00348 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const 00349 { 00350 return current_num_features; 00351 } 00352 00353 template <class T> 00354 float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df) 00355 { 00356 SG_NOTIMPLEMENTED 00357 return -1; 00358 } 00359 00360 template <class T> 00361 int32_t CStreamingSparseFeatures<T>::get_num_features() 00362 { 00363 return current_num_features; 00364 } 00365 00366 template <class T> 00367 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector() 00368 { 00369 return current_sgvector.num_feat_entries; 00370 } 00371 00372 template <class T> 00373 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class() const 00374 { 00375 return C_STREAMING_SPARSE; 00376 } 00377 00378 template class CStreamingSparseFeatures<bool>; 00379 template class CStreamingSparseFeatures<char>; 00380 template class CStreamingSparseFeatures<int8_t>; 00381 template class CStreamingSparseFeatures<uint8_t>; 00382 template class CStreamingSparseFeatures<int16_t>; 00383 template class CStreamingSparseFeatures<uint16_t>; 00384 template class CStreamingSparseFeatures<int32_t>; 00385 template class CStreamingSparseFeatures<uint32_t>; 00386 template class CStreamingSparseFeatures<int64_t>; 00387 template class CStreamingSparseFeatures<uint64_t>; 00388 template class CStreamingSparseFeatures<float32_t>; 00389 template class CStreamingSparseFeatures<float64_t>; 00390 template class CStreamingSparseFeatures<floatmax_t>; 00391 }