SHOGUN
v3.2.0
|
00001 /* 00002 * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights 00003 * embodied in the content of this file are licensed under the BSD 00004 * (revised) open source license. 00005 * 00006 * This program is free software; you can redistribute it and/or modify 00007 * it under the terms of the GNU General Public License as published by 00008 * the Free Software Foundation; either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * Written (W) 2011 Shashwat Lal Das 00012 * Adaptation of Vowpal Wabbit v5.1. 00013 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society. 00014 */ 00015 00016 #include <shogun/features/streaming/StreamingVwFeatures.h> 00017 00018 using namespace shogun; 00019 00020 CStreamingVwFeatures::CStreamingVwFeatures() : CStreamingDotFeatures() 00021 { 00022 init(); 00023 set_read_functions(); 00024 } 00025 00026 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwFile* file, 00027 bool is_labelled, int32_t size) 00028 : CStreamingDotFeatures() 00029 { 00030 init(file, is_labelled, size); 00031 set_read_functions(); 00032 } 00033 00034 CStreamingVwFeatures::CStreamingVwFeatures(CStreamingVwCacheFile* file, 00035 bool is_labelled, int32_t size) 00036 : CStreamingDotFeatures() 00037 { 00038 init(file, is_labelled, size); 00039 set_read_functions(); 00040 } 00041 00042 CStreamingVwFeatures::~CStreamingVwFeatures() 00043 { 00044 if (parser.is_running()) 00045 parser.end_parser(); 00046 SG_UNREF(env); 00047 } 00048 00049 CFeatures* CStreamingVwFeatures::duplicate() const 00050 { 00051 return new CStreamingVwFeatures(*this); 00052 } 00053 00054 void CStreamingVwFeatures::set_vector_reader() 00055 { 00056 parser.set_read_vector(&CStreamingFile::get_vector); 00057 } 00058 00059 void CStreamingVwFeatures::set_vector_and_label_reader() 00060 { 00061 parser.set_read_vector_and_label(&CStreamingFile::get_vector_and_label); 00062 } 00063 00064 void CStreamingVwFeatures::reset_stream() 00065 { 00066 if (working_file->is_seekable()) 00067 { 00068 working_file->reset_stream(); 00069 parser.exit_parser(); 00070 parser.init(working_file, has_labels, parser.get_ring_size()); 00071 parser.set_free_vector_after_release(false); 00072 parser.start_parser(); 00073 } 00074 else 00075 SG_ERROR("The input cannot be reset! Please use 1 pass.\n") 00076 } 00077 00078 CVwEnvironment* CStreamingVwFeatures::get_env() 00079 { 00080 SG_REF(env); 00081 return env; 00082 } 00083 00084 void CStreamingVwFeatures::set_env(CVwEnvironment* vw_env) 00085 { 00086 env = vw_env; 00087 SG_REF(env); 00088 } 00089 00090 void CStreamingVwFeatures::expand_if_required(float32_t*& vec, int32_t& len) 00091 { 00092 int32_t dim = 1 << env->num_bits; 00093 if (dim > len) 00094 { 00095 vec = SG_REALLOC(float32_t, vec, len, dim); 00096 memset(&vec[len], 0, (dim-len) * sizeof(float32_t)); 00097 len = dim; 00098 } 00099 } 00100 00101 void CStreamingVwFeatures::expand_if_required(float64_t*& vec, int32_t& len) 00102 { 00103 int32_t dim = 1 << env->num_bits; 00104 if (dim > len) 00105 { 00106 vec = SG_REALLOC(float64_t, vec, len, dim); 00107 memset(&vec[len], 0, (dim-len) * sizeof(float64_t)); 00108 len = dim; 00109 } 00110 } 00111 00112 float32_t CStreamingVwFeatures::real_weight(float32_t w, float32_t gravity) 00113 { 00114 float32_t wprime = 0; 00115 if (gravity < fabsf(w)) 00116 wprime = CMath::sign(w)*(fabsf(w) - gravity); 00117 return wprime; 00118 } 00119 00120 int32_t CStreamingVwFeatures::get_nnz_features_for_vector() 00121 { 00122 return current_length; 00123 } 00124 00125 int32_t CStreamingVwFeatures::get_num_vectors() const 00126 { 00127 if (current_example) 00128 return 1; 00129 else 00130 return 0; 00131 } 00132 00133 EFeatureType CStreamingVwFeatures::get_feature_type() const 00134 { 00135 return F_DREAL; 00136 } 00137 00138 void CStreamingVwFeatures::init() 00139 { 00140 working_file=NULL; 00141 seekable=false; 00142 current_length=-1; 00143 current_example=NULL; 00144 env=NULL; 00145 00146 example_count = 0; 00147 } 00148 00149 void CStreamingVwFeatures::init(CStreamingVwFile* file, bool is_labelled, int32_t size) 00150 { 00151 init(); 00152 has_labels = is_labelled; 00153 working_file = file; 00154 parser.init(file, is_labelled, size); 00155 parser.set_free_vector_after_release(false); 00156 seekable=false; 00157 00158 // Get environment from the StreamingVwFile 00159 env = ((CStreamingVwFile*) file)->get_env(); 00160 SG_REF(env); 00161 } 00162 00163 void CStreamingVwFeatures::init(CStreamingVwCacheFile* file, bool is_labelled, int32_t size) 00164 { 00165 init(); 00166 has_labels = is_labelled; 00167 working_file = file; 00168 parser.init(file, is_labelled, size); 00169 parser.set_free_vector_after_release(false); 00170 seekable=true; 00171 00172 // Get environment from the StreamingVwFile 00173 env = ((CStreamingVwCacheFile*) file)->get_env(); 00174 SG_REF(env); 00175 } 00176 00177 void CStreamingVwFeatures::setup_example(VwExample* ae) 00178 { 00179 ae->pass = env->passes_complete; 00180 ae->num_features = 0; 00181 ae->total_sum_feat_sq = 1; 00182 ae->example_counter = ++example_count; 00183 ae->global_weight = ae->ld->weight; 00184 env->t += ae->global_weight; 00185 ae->example_t = env->t; 00186 00187 // If some namespaces should be ignored, remove them 00188 if (env->ignore_some) 00189 { 00190 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++) 00191 if (env->ignore[*i]) 00192 { 00193 ae->atomics[*i].erase(); 00194 memmove(i,i+1,(ae->indices.end - (i+1))*sizeof(vw_size_t)); 00195 ae->indices.end--; 00196 i--; 00197 } 00198 } 00199 00200 // Add constant feature 00201 vw_size_t constant_namespace = 128; 00202 VwFeature temp = {1,constant_hash & env->mask}; 00203 ae->indices.push(constant_namespace); 00204 ae->atomics[constant_namespace].push(temp); 00205 ae->sum_feat_sq[constant_namespace] = 0; 00206 00207 if(env->stride != 1) 00208 { 00209 // Make room for per-feature information. 00210 vw_size_t stride = env->stride; 00211 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++) 00212 for(VwFeature* j = ae->atomics[*i].begin; j != ae->atomics[*i].end; j++) 00213 j->weight_index = j->weight_index*stride; 00214 } 00215 00216 for (vw_size_t* i = ae->indices.begin; i != ae->indices.end; i++) 00217 { 00218 ae->num_features += ae->atomics[*i].end - ae->atomics[*i].begin; 00219 ae->total_sum_feat_sq += ae->sum_feat_sq[*i]; 00220 } 00221 00222 // For quadratic features 00223 for (int32_t k = 0; k < env->pairs.get_num_elements(); k++) 00224 { 00225 char* i = env->pairs.get_element(k); 00226 00227 ae->num_features 00228 += (ae->atomics[(int32_t)(i[0])].end - ae->atomics[(int32_t)(i[0])].begin) 00229 *(ae->atomics[(int32_t)(i[1])].end - ae->atomics[(int32_t)(i[1])].begin); 00230 00231 ae->total_sum_feat_sq += ae->sum_feat_sq[(int32_t)(i[0])]*ae->sum_feat_sq[(int32_t)(i[1])]; 00232 } 00233 } 00234 00235 void CStreamingVwFeatures::start_parser() 00236 { 00237 if (!parser.is_running()) 00238 parser.start_parser(); 00239 } 00240 00241 void CStreamingVwFeatures::end_parser() 00242 { 00243 parser.end_parser(); 00244 } 00245 00246 bool CStreamingVwFeatures::get_next_example() 00247 { 00248 bool ret_value; 00249 ret_value = (bool) parser.get_next_example(current_example, 00250 current_length, 00251 current_label); 00252 if (current_length < 1) 00253 return false; 00254 00255 if (ret_value) 00256 setup_example(current_example); 00257 else 00258 return false; 00259 00260 current_label = current_example->ld->label; 00261 current_length = current_example->num_features; 00262 00263 return ret_value; 00264 } 00265 00266 VwExample* CStreamingVwFeatures::get_example() 00267 { 00268 return current_example; 00269 } 00270 00271 float64_t CStreamingVwFeatures::get_label() 00272 { 00273 ASSERT(has_labels) 00274 00275 return current_label; 00276 } 00277 00278 void CStreamingVwFeatures::release_example() 00279 { 00280 env->example_number++; 00281 env->weighted_examples += current_example->ld->weight; 00282 00283 if (current_example->ld->label == FLT_MAX) 00284 env->weighted_labels += 0; 00285 else 00286 env->weighted_labels += current_example->ld->label * current_example->ld->weight; 00287 00288 env->total_features += current_example->num_features; 00289 env->sum_loss += current_example->loss; 00290 00291 current_example->reset_members(); 00292 parser.finalize_example(); 00293 } 00294 00295 int32_t CStreamingVwFeatures::get_dim_feature_space() const 00296 { 00297 return current_length; 00298 } 00299 00300 float32_t CStreamingVwFeatures::dot(CStreamingDotFeatures* df) 00301 { 00302 SG_NOTIMPLEMENTED 00303 return CMath::INFTY; 00304 } 00305 00306 float32_t CStreamingVwFeatures::dense_dot(VwExample* &ex, const float32_t* vec2) 00307 { 00308 float32_t ret = 0.; 00309 for (vw_size_t* i = ex->indices.begin; i!= ex->indices.end; i++) 00310 { 00311 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++) 00312 ret += vec2[f->weight_index & env->thread_mask] * f->x; 00313 } 00314 return ret; 00315 } 00316 00317 float32_t CStreamingVwFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len) 00318 { 00319 return dense_dot(current_example, vec2); 00320 } 00321 00322 float32_t CStreamingVwFeatures::dense_dot(SGSparseVector<float32_t>* vec1, const float32_t* vec2) 00323 { 00324 float32_t ret = 0.; 00325 for (int32_t i = 0; i < vec1->num_feat_entries; i++) 00326 ret += vec1->features[i].entry * vec2[vec1->features[i].feat_index & env->mask]; 00327 00328 return ret; 00329 } 00330 00331 float32_t CStreamingVwFeatures::dense_dot_truncated(const float32_t* vec2, VwExample* &ex, float32_t gravity) 00332 { 00333 float32_t ret = 0.; 00334 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++) 00335 { 00336 for (VwFeature* f = ex->atomics[*i].begin; f!= ex->atomics[*i].end; f++) 00337 { 00338 float32_t w = vec2[f->weight_index & env->thread_mask]; 00339 float32_t wprime = real_weight(w,gravity); 00340 ret += wprime*f->x; 00341 } 00342 } 00343 00344 return ret; 00345 } 00346 00347 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, VwExample* &ex, float32_t* vec2, int32_t vec2_len, bool abs_val) 00348 { 00349 if (abs_val) 00350 { 00351 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++) 00352 { 00353 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++) 00354 vec2[f->weight_index & env->thread_mask] += alpha * abs(f->x); 00355 } 00356 } 00357 else 00358 { 00359 for (vw_size_t* i = ex->indices.begin; i != ex->indices.end; i++) 00360 { 00361 for (VwFeature* f = ex->atomics[*i].begin; f != ex->atomics[*i].end; f++) 00362 vec2[f->weight_index & env->thread_mask] += alpha * f->x; 00363 } 00364 } 00365 } 00366 00367 void CStreamingVwFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val) 00368 { 00369 add_to_dense_vec(alpha, current_example, vec2, vec2_len, abs_val); 00370 } 00371 00372 int32_t CStreamingVwFeatures::get_num_features() 00373 { 00374 return current_length; 00375 } 00376 00377 EFeatureClass CStreamingVwFeatures::get_feature_class() const 00378 { 00379 return C_STREAMING_VW; 00380 }