// SHOGUN v3.2.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 #ifndef _STREAMING_HASHEDDOCDOTFEATURES__H__ 00011 #define _STREAMING_HASHEDDOCDOTFEATURES__H__ 00012 00013 #include <shogun/features/StringFeatures.h> 00014 #include <shogun/features/streaming/StreamingDotFeatures.h> 00015 #include <shogun/lib/Tokenizer.h> 00016 #include <shogun/converter/HashedDocConverter.h> 00017 #include <shogun/io/streaming/InputParser.h> 00018 #include <shogun/io/streaming/StreamingFileFromStringFeatures.h> 00019 00020 namespace shogun 00021 { 00022 class CStreamingDotFeatures; 00023 class CTokenizer; 00024 class CHashedDocConverter; 00025 00040 class CStreamingHashedDocDotFeatures : public CStreamingDotFeatures 00041 { 00042 public: 00044 CStreamingHashedDocDotFeatures(); 00045 00057 CStreamingHashedDocDotFeatures(CStreamingFile* file, bool is_labelled, int32_t size, 00058 CTokenizer* tzer, int32_t bits=20); 00059 00076 CStreamingHashedDocDotFeatures(CStringFeatures<char>* dot_features,CTokenizer* tzer, 00077 int32_t bits=20, float64_t* lab=NULL); 00078 00080 virtual ~CStreamingHashedDocDotFeatures(); 00081 00088 virtual float32_t dot(CStreamingDotFeatures* df); 00089 00095 virtual float32_t dense_dot(const float32_t* vec2, int32_t vec2_len); 00096 00104 virtual void add_to_dense_vec(float32_t alpha, float32_t* vec2, 00105 int32_t vec2_len, bool abs_val=false); 00106 00114 virtual int32_t get_dim_feature_space() const; 00115 00121 virtual const char* get_name() const; 00122 00128 virtual int32_t get_num_vectors() const; 00129 00135 virtual CFeatures* duplicate() const; 00136 00146 virtual void set_vector_reader(); 00147 00157 virtual void 
set_vector_and_label_reader(); 00158 00164 virtual EFeatureType get_feature_type() const; 00165 00171 virtual EFeatureClass get_feature_class() const; 00172 00177 virtual void start_parser(); 00178 00182 virtual void end_parser(); 00183 00191 virtual float64_t get_label(); 00192 00198 virtual bool get_next_example(); 00199 00205 virtual void release_example(); 00206 00212 virtual int32_t get_num_features(); 00213 00218 SGSparseVector<float64_t> get_vector(); 00219 00224 void set_normalization(bool normalize); 00225 00233 void set_k_skip_n_grams(int32_t k, int32_t n); 00234 00235 private: 00236 void init(CStreamingFile* file, bool is_labelled, int32_t size, CTokenizer* tzer, 00237 int32_t bits, bool normalize, int32_t n_grams, int32_t skips); 00238 00239 protected: 00240 00242 int32_t num_bits; 00243 00245 SGSparseVector<float64_t> current_vector; 00246 00248 CTokenizer *tokenizer; 00249 00251 CHashedDocConverter* converter; 00252 00254 CInputParser<char> parser; 00255 00257 float64_t current_label; 00258 }; 00259 } 00260 00261 #endif // _STREAMING_HASHEDDOCDOTFEATURES__H__