SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StreamingVwFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2009 Yahoo! Inc.  All rights reserved.  The copyrights
00003  * embodied in the content of this file are licensed under the BSD
00004  * (revised) open source license.
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 3 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * Written (W) 2011 Shashwat Lal Das
00012  * Adaptation of Vowpal Wabbit v5.1.
00013  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
00014  */
00015 
00016 #include <shogun/features/streaming/StreamingVwFeatures.h>
00017 
00018 using namespace shogun;
00019 
00020 CStreamingVwFeatures::CStreamingVwFeatures() : CStreamingDotFeatures()
00021 {
00022     init();
00023     set_read_functions();
00024 }
00025 
00026 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwFile* file,
00027         bool is_labelled, int32_t size)
00028 : CStreamingDotFeatures()
00029 {
00030     init(file, is_labelled, size);
00031     set_read_functions();
00032 }
00033 
00034 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwCacheFile* file,
00035         bool is_labelled, int32_t size)
00036 : CStreamingDotFeatures()
00037 {
00038     init(file, is_labelled, size);
00039     set_read_functions();
00040 }
00041 
00042 CStreamingVwFeatures::~CStreamingVwFeatures()
00043 {
00044     if (parser.is_running())
00045         parser.end_parser();
00046     SG_UNREF(env);
00047 }
00048 
00049 CFeatures* CStreamingVwFeatures::duplicate() const
00050 {
00051     return new CStreamingVwFeatures(*this);
00052 }
00053 
00054 void CStreamingVwFeatures::set_vector_reader()
00055 {
00056     parser.set_read_vector(&CStreamingFile::get_vector);
00057 }
00058 
00059 void CStreamingVwFeatures::set_vector_and_label_reader()
00060 {
00061     parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label);
00062 }
00063 
00064 void CStreamingVwFeatures::reset_stream()
00065 {
00066     if (working_file->is_seekable())
00067     {
00068         working_file->reset_stream();
00069         parser.exit_parser();
00070         parser.init(working_file, has_labels, parser.get_ring_size());
00071         parser.set_free_vector_after_release(false);
00072         parser.start_parser();
00073     }
00074     else
00075         SG_ERROR("The input cannot be reset! Please use 1 pass.\n")
00076 }
00077 
00078 CVwEnvironment* CStreamingVwFeatures::get_env()
00079 {
00080     SG_REF(env);
00081     return env;
00082 }
00083 
00084 void CStreamingVwFeatures::set_env(CVwEnvironment* vw_env)
00085 {
00086     env = vw_env;
00087     SG_REF(env);
00088 }
00089 
00090 void CStreamingVwFeatures::expand_if_required(float32_t*& vec, int32_t& len)
00091 {
00092     int32_t dim = 1 << env->num_bits;
00093     if (dim > len)
00094     {
00095         vec = SG_REALLOC(float32_t, vec, len, dim);
00096         memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
00097         len = dim;
00098     }
00099 }
00100 
00101 void CStreamingVwFeatures::expand_if_required(float64_t*& vec, int32_t& len)
00102 {
00103     int32_t dim = 1 << env->num_bits;
00104     if (dim > len)
00105     {
00106         vec = SG_REALLOC(float64_t, vec, len, dim);
00107         memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
00108         len = dim;
00109     }
00110 }
00111 
00112 float32_t CStreamingVwFeatures::real_weight(float32_t w, float32_t gravity)
00113 {
00114     float32_t wprime = 0;
00115     if (gravity < fabsf(w))
00116         wprime = CMath::sign(w)*(fabsf(w) - gravity);
00117     return wprime;
00118 }
00119 
00120 int32_t CStreamingVwFeatures::get_nnz_features_for_vector()
00121 {
00122     return current_length;
00123 }
00124 
00125 int32_t CStreamingVwFeatures::get_num_vectors() const
00126 {
00127     if (current_example)
00128         return 1;
00129     else
00130         return 0;
00131 }
00132 
00133 EFeatureType CStreamingVwFeatures::get_feature_type() const
00134 {
00135     return F_DREAL;
00136 }
00137 
00138 void CStreamingVwFeatures::init()
00139 {
00140     working_file=NULL;
00141     seekable=false;
00142     current_length=-1;
00143     current_example=NULL;
00144     env=NULL;
00145 
00146     example_count = 0;
00147 }
00148 
00149 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size)
00150 {
00151     init();
00152     has_labels = is_labelled;
00153     working_file = file;
00154     parser.init(file, is_labelled, size);
00155     parser.set_free_vector_after_release(false);
00156     seekable=false;
00157 
00158     // Get environment from the StreamingVwFile
00159     env = ((CStreamingVwFile*) file)->get_env();
00160     SG_REF(env);
00161 }
00162 
00163 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size)
00164 {
00165     init();
00166     has_labels = is_labelled;
00167     working_file = file;
00168     parser.init(file, is_labelled, size);
00169     parser.set_free_vector_after_release(false);
00170     seekable=true;
00171 
00172     // Get environment from the StreamingVwFile
00173     env = ((CStreamingVwCacheFile*) file)->get_env();
00174     SG_REF(env);
00175 }
00176 
00177 void CStreamingVwFeatures::setup_example(VwExample* ae)
00178 {
00179     ae->pass = env->passes_complete;
00180     ae->num_features = 0;
00181     ae->total_sum_feat_sq = 1;
00182     ae->example_counter = ++example_count;
00183     ae->global_weight = ae->ld->weight;
00184     env->t += ae->global_weight;
00185     ae->example_t = env->t;
00186 
00187     // If some namespaces should be ignored, remove them
00188     if (env->ignore_some)
00189     {
00190         for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00191             if (env->ignore[*i])
00192             {
00193                 ae->atomics[*i].erase();
00194                 memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t));
00195                 ae->indices.end--;
00196                 i--;
00197             }
00198     }
00199 
00200     // Add constant feature
00201     vw_size_t constant_namespace = 128;
00202     VwFeature temp = {1,constant_hash & env->mask};
00203     ae->indices.push(constant_namespace);
00204     ae->atomics[constant_namespace].push(temp);
00205     ae->sum_feat_sq[constant_namespace] = 0;
00206 
00207     if(env->stride != 1)
00208     {
00209         // Make room for per-feature information.
00210         vw_size_t stride = env->stride;
00211         for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00212             for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++)
00213                 j->weight_index = j->weight_index*stride;
00214     }
00215 
00216     for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++)
00217     {
00218         ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin;
00219         ae->total_sum_feat_sq += ae->sum_feat_sq[*i];
00220     }
00221 
00222     // For quadratic features
00223     for (int32_t k = 0; k < env->pairs.get_num_elements(); k++)
00224     {
00225         char* i = env->pairs.get_element(k);
00226 
00227         ae->num_features
00228             += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin)
00229             *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin);
00230 
00231         ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])];
00232     }
00233 }
00234 
00235 void CStreamingVwFeatures::start_parser()
00236 {
00237     if (!parser.is_running())
00238         parser.start_parser();
00239 }
00240 
00241 void CStreamingVwFeatures::end_parser()
00242 {
00243     parser.end_parser();
00244 }
00245 
00246 bool CStreamingVwFeatures::get_next_example()
00247 {
00248     bool ret_value;
00249     ret_value = (bool) parser.get_next_example(current_example,
00250                            current_length,
00251                            current_label);
00252     if (current_length < 1)
00253         return false;
00254 
00255     if (ret_value)
00256         setup_example(current_example);
00257     else
00258         return false;
00259 
00260     current_label = current_example->ld->label;
00261     current_length = current_example->num_features;
00262 
00263     return ret_value;
00264 }
00265 
00266 VwExample* CStreamingVwFeatures::get_example()
00267 {
00268     return current_example;
00269 }
00270 
00271 float64_t CStreamingVwFeatures::get_label()
00272 {
00273     ASSERT(has_labels)
00274 
00275     return current_label;
00276 }
00277 
00278 void CStreamingVwFeatures::release_example()
00279 {
00280     env->example_number++;
00281     env->weighted_examples += current_example->ld->weight;
00282 
00283     if (current_example->ld->label == FLT_MAX)
00284         env->weighted_labels += 0;
00285     else
00286         env->weighted_labels += current_example->ld->label * current_example->ld->weight;
00287 
00288     env->total_features += current_example->num_features;
00289     env->sum_loss += current_example->loss;
00290 
00291     current_example->reset_members();
00292     parser.finalize_example();
00293 }
00294 
00295 int32_t CStreamingVwFeatures::get_dim_feature_space() const
00296 {
00297     return current_length;
00298 }
00299 
00300 float32_t CStreamingVwFeatures::dot(CStreamingDotFeatures* df)
00301 {
00302     SG_NOTIMPLEMENTED
00303     return CMath::INFTY;
00304 }
00305 
00306 float32_t CStreamingVwFeatures::dense_dot(VwExample* &ex, const float32_t* vec2)
00307 {
00308     float32_t ret = 0.;
00309     for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++)
00310     {
00311         for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00312             ret += vec2[f->weight_index & env->thread_mask] * f->x;
00313     }
00314     return ret;
00315 }
00316 
00317 float32_t CStreamingVwFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len)
00318 {
00319     return dense_dot(current_example, vec2);
00320 }
00321 
00322 float32_t CStreamingVwFeatures::dense_dot(SGSparseVector<float32_t>* vec1, const float32_t* vec2)
00323 {
00324     float32_t ret = 0.;
00325     for (int32_t i = 0; i < vec1->num_feat_entries; i++)
00326         ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask];
00327 
00328     return ret;
00329 }
00330 
00331 float32_t CStreamingVwFeatures::dense_dot_truncated(const float32_t* vec2, VwExample* &ex, float32_t gravity)
00332 {
00333     float32_t ret = 0.;
00334     for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00335     {
00336         for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++)
00337         {
00338             float32_t w = vec2[f->weight_index & env->thread_mask];
00339             float32_t wprime = real_weight(w,gravity);
00340             ret += wprime*f->x;
00341         }
00342     }
00343 
00344     return ret;
00345 }
00346 
00347 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val)
00348 {
00349     if (abs_val)
00350     {
00351         for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00352         {
00353             for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00354                 vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x);
00355         }
00356     }
00357     else
00358     {
00359         for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++)
00360         {
00361             for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++)
00362                 vec2[f->weight_index & env->thread_mask] += alpha * f->x;
00363         }
00364     }
00365 }
00366 
00367 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
00368 {
00369     add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val);
00370 }
00371 
00372 int32_t CStreamingVwFeatures::get_num_features()
00373 {
00374     return current_length;
00375 }
00376 
00377 EFeatureClass CStreamingVwFeatures::get_feature_class() const
00378 {
00379     return C_STREAMING_VW;
00380 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation