SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StreamingSparseFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2011 Shashwat Lal Das
00008  * Modifications (W) 2013 Thoralf Klein
00009  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00010  */
00011 #include <shogun/features/streaming/StreamingSparseFeatures.h>
00012 namespace shogun
00013 {
00014 
00015 template <class T>
00016 CStreamingSparseFeatures<T>::CStreamingSparseFeatures() : CStreamingDotFeatures()
00017 {
00018     set_read_functions();
00019     init();
00020 }
00021 
00022 template <class T>
00023 CStreamingSparseFeatures<T>::CStreamingSparseFeatures(CStreamingFile* file,
00024              bool is_labelled,
00025              int32_t size)
00026     : CStreamingDotFeatures()
00027 {
00028     set_read_functions();
00029     init(file, is_labelled, size);
00030 }
00031 
00032 template <class T>
00033 CStreamingSparseFeatures<T>::~CStreamingSparseFeatures()
00034 {
00035     if (parser.is_running())
00036         parser.end_parser();
00037 }
00038 
00039 template <class T>
00040 T CStreamingSparseFeatures<T>::get_feature(int32_t index)
00041 {
00042     ASSERT(index>=0 && index<current_num_features)
00043     return current_sgvector.get_feature(index);
00044 }
00045 
00046 template <class T>
00047 void CStreamingSparseFeatures<T>::reset_stream()
00048 {
00049     SG_NOTIMPLEMENTED
00050 }
00051 
00052 template <class T>
00053 int32_t CStreamingSparseFeatures<T>::set_num_features(int32_t num)
00054 {
00055     int32_t n=current_num_features;
00056     ASSERT(n<=num)
00057     current_num_features=num;
00058     return n;
00059 }
00060 
00061 template <class T>
00062 T CStreamingSparseFeatures<T>::sparse_dot(T alpha, SGSparseVectorEntry<T>* avec, int32_t alen, SGSparseVectorEntry<T>* bvec, int32_t blen)
00063 {
00064     T result=0;
00065 
00066     //result remains zero when one of the vectors is non existent
00067     if (avec && bvec)
00068     {
00069         SGSparseVector<T> asv(avec, alen, false);
00070         SGSparseVector<T> bsv(bvec, blen, false);
00071 
00072         result=alpha*SGSparseVector<T>::sparse_dot(asv, bsv);
00073     }
00074 
00075     return result;
00076 }
00077 
00078 template <class T>
00079 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
00080 {
00081     ASSERT(vec)
00082     ASSERT(dim>=current_num_features)
00083 
00084     return current_sgvector.dense_dot(alpha, vec, dim, b);
00085 }
00086 
00087 template <class T>
00088 float64_t CStreamingSparseFeatures<T>::dense_dot(const float64_t* vec2, int32_t vec2_len)
00089 {
00090     ASSERT(vec2)
00091 
00092     int32_t current_length = current_sgvector.num_feat_entries;
00093     SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
00094 
00095     float64_t result=0;
00096     if (current_vector)
00097     {
00098         for (int32_t i=0; i<current_length; i++) {
00099             if (current_vector[i].feat_index < vec2_len) {
00100                 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00101             }
00102         }
00103     }
00104 
00105     return result;
00106 }
00107 
00108 template <class T>
00109 float32_t CStreamingSparseFeatures<T>::dense_dot(const float32_t* vec2, int32_t vec2_len)
00110 {
00111     ASSERT(vec2)
00112 
00113     int32_t current_length = current_sgvector.num_feat_entries;
00114     SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
00115 
00116     float32_t result=0;
00117     if (current_vector)
00118     {
00119         for (int32_t i=0; i<current_length; i++) {
00120             if (current_vector[i].feat_index < vec2_len) {
00121                 result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
00122             }
00123         }
00124     }
00125 
00126     return result;
00127 }
00128 
00129 template <class T>
00130 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
00131 {
00132     ASSERT(vec2)
00133     if (vec2_len < current_num_features)
00134     {
00135         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00136              vec2_len, current_num_features);
00137     }
00138 
00139     SGSparseVectorEntry<T>* sv=current_sgvector.features;
00140     int32_t num_feat=current_sgvector.num_feat_entries;
00141 
00142     if (sv)
00143     {
00144         if (abs_val)
00145         {
00146             for (int32_t i=0; i<num_feat; i++)
00147                 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00148         }
00149         else
00150         {
00151             for (int32_t i=0; i<num_feat; i++)
00152                 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00153         }
00154     }
00155 }
00156 
00157 template <class T>
00158 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00159 {
00160     ASSERT(vec2)
00161     if (vec2_len < current_num_features)
00162     {
00163         SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
00164              vec2_len, current_num_features);
00165     }
00166 
00167     SGSparseVectorEntry<T>* sv=current_sgvector.features;
00168     int32_t num_feat=current_sgvector.num_feat_entries;
00169 
00170     if (sv)
00171     {
00172         if (abs_val)
00173         {
00174             for (int32_t i=0; i<num_feat; i++)
00175                 vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00176         }
00177         else
00178         {
00179             for (int32_t i=0; i<num_feat; i++)
00180                 vec2[sv[i].feat_index]+= alpha*sv[i].entry;
00181         }
00182     }
00183 }
00184 
00185 template <class T>
00186 int64_t CStreamingSparseFeatures<T>::get_num_nonzero_entries()
00187 {
00188     return current_sgvector.num_feat_entries;
00189 }
00190 
00191 template <class T>
00192 float32_t CStreamingSparseFeatures<T>::compute_squared()
00193 {
00194     int32_t current_length = current_sgvector.num_feat_entries;
00195     SGSparseVectorEntry<T>* current_vector = current_sgvector.features;
00196 
00197     ASSERT(current_vector)
00198 
00199     float32_t sq=0;
00200 
00201     for (int32_t i=0; i<current_length; i++)
00202         sq += current_vector[i].entry * current_vector[i].entry;
00203 
00204     return sq;
00205 }
00206 
00207 template <class T>
00208 void CStreamingSparseFeatures<T>::sort_features()
00209 {
00210     SGSparseVectorEntry<T>* old_ptr = current_sgvector.features;
00211 
00212     // setting false to disallow reallocation
00213     // and guarantee stable get_vector().features pointer
00214     get_vector().sort_features(true);
00215 
00216     ASSERT(old_ptr == current_sgvector.features);
00217 }
00218 
00219 template <class T>
00220 CFeatures* CStreamingSparseFeatures<T>::duplicate() const
00221 {
00222     return new CStreamingSparseFeatures<T>(*this);
00223 }
00224 
00225 template <class T>
00226 int32_t CStreamingSparseFeatures<T>::get_num_vectors() const
00227 {
00228     if (current_sgvector.features)
00229         return 1;
00230     return 0;
00231 }
00232 
00233 template <class T> void CStreamingSparseFeatures<T>::set_vector_reader()
00234 {
00235     parser.set_read_vector(&CStreamingFile::get_sparse_vector);
00236 }
00237 
00238 template <class T> void CStreamingSparseFeatures<T>::set_vector_and_label_reader()
00239 {
00240     parser.set_read_vector_and_label
00241         (&CStreamingFile::get_sparse_vector_and_label);
00242 }
00243 
00244 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00245 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() const \
00246 {                                   \
00247     return f_type;                          \
00248 }
00249 
00250 GET_FEATURE_TYPE(F_BOOL, bool)
00251 GET_FEATURE_TYPE(F_CHAR, char)
00252 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00253 GET_FEATURE_TYPE(F_BYTE, int8_t)
00254 GET_FEATURE_TYPE(F_SHORT, int16_t)
00255 GET_FEATURE_TYPE(F_WORD, uint16_t)
00256 GET_FEATURE_TYPE(F_INT, int32_t)
00257 GET_FEATURE_TYPE(F_UINT, uint32_t)
00258 GET_FEATURE_TYPE(F_LONG, int64_t)
00259 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00260 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00261 GET_FEATURE_TYPE(F_DREAL, float64_t)
00262 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00263 #undef GET_FEATURE_TYPE
00264 
00265 
00266 template <class T>
00267 void CStreamingSparseFeatures<T>::init()
00268 {
00269     working_file=NULL;
00270     current_vec_index=0;
00271     current_num_features=-1;
00272 
00273     set_generic<T>();
00274 }
00275 
00276 template <class T>
00277 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
00278                     bool is_labelled,
00279                     int32_t size)
00280 {
00281     init();
00282     has_labels = is_labelled;
00283     working_file = file;
00284     SG_REF(working_file);
00285     parser.init(file, is_labelled, size);
00286     parser.set_free_vector_after_release(false);
00287 }
00288 
00289 template <class T>
00290 void CStreamingSparseFeatures<T>::start_parser()
00291 {
00292     if (!parser.is_running())
00293         parser.start_parser();
00294 }
00295 
00296 template <class T>
00297 void CStreamingSparseFeatures<T>::end_parser()
00298 {
00299     parser.end_parser();
00300 }
00301 
00302 template <class T>
00303 bool CStreamingSparseFeatures<T>::get_next_example()
00304 {
00305     int32_t current_length = 0;
00306     SGSparseVectorEntry<T>* current_vector = NULL;
00307 
00308     bool ret_value;
00309     ret_value = (bool) parser.get_next_example(current_vector,
00310                            current_length,
00311                            current_label);
00312 
00313     if (!ret_value)
00314         return false;
00315 
00316     // ref_count disabled, because parser still owns the memory
00317     current_sgvector = SGSparseVector<T>(current_vector, current_length, false);
00318 
00319     // Update number of features based on highest index
00320     int32_t current_dimension = get_vector().get_num_dimensions();
00321     current_num_features = CMath::max(current_num_features, current_dimension);
00322 
00323     current_vec_index++;
00324     return true;
00325 }
00326 
00327 template <class T>
00328 SGSparseVector<T> CStreamingSparseFeatures<T>::get_vector()
00329 {
00330     return current_sgvector;
00331 }
00332 
00333 template <class T>
00334 float64_t CStreamingSparseFeatures<T>::get_label()
00335 {
00336     ASSERT(has_labels)
00337 
00338     return current_label;
00339 }
00340 
00341 template <class T>
00342 void CStreamingSparseFeatures<T>::release_example()
00343 {
00344     parser.finalize_example();
00345 }
00346 
00347 template <class T>
00348 int32_t CStreamingSparseFeatures<T>::get_dim_feature_space() const
00349 {
00350     return current_num_features;
00351 }
00352 
00353 template <class T>
00354     float32_t CStreamingSparseFeatures<T>::dot(CStreamingDotFeatures* df)
00355 {
00356     SG_NOTIMPLEMENTED
00357     return -1;
00358 }
00359 
00360 template <class T>
00361 int32_t CStreamingSparseFeatures<T>::get_num_features()
00362 {
00363     return current_num_features;
00364 }
00365 
00366 template <class T>
00367 int32_t CStreamingSparseFeatures<T>::get_nnz_features_for_vector()
00368 {
00369     return current_sgvector.num_feat_entries;
00370 }
00371 
00372 template <class T>
00373 EFeatureClass CStreamingSparseFeatures<T>::get_feature_class() const
00374 {
00375     return C_STREAMING_SPARSE;
00376 }
00377 
00378 template class CStreamingSparseFeatures<bool>;
00379 template class CStreamingSparseFeatures<char>;
00380 template class CStreamingSparseFeatures<int8_t>;
00381 template class CStreamingSparseFeatures<uint8_t>;
00382 template class CStreamingSparseFeatures<int16_t>;
00383 template class CStreamingSparseFeatures<uint16_t>;
00384 template class CStreamingSparseFeatures<int32_t>;
00385 template class CStreamingSparseFeatures<uint32_t>;
00386 template class CStreamingSparseFeatures<int64_t>;
00387 template class CStreamingSparseFeatures<uint64_t>;
00388 template class CStreamingSparseFeatures<float32_t>;
00389 template class CStreamingSparseFeatures<float64_t>;
00390 template class CStreamingSparseFeatures<floatmax_t>;
00391 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation