SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StreamingHashedDocDotFeatures.h
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 #ifndef _STREAMING_HASHEDDOCDOTFEATURES__H__
00011 #define _STREAMING_HASHEDDOCDOTFEATURES__H__
00012 
00013 #include <shogun/features/StringFeatures.h>
00014 #include <shogun/features/streaming/StreamingDotFeatures.h>
00015 #include <shogun/lib/Tokenizer.h>
00016 #include <shogun/converter/HashedDocConverter.h>
00017 #include <shogun/io/streaming/InputParser.h>
00018 #include <shogun/io/streaming/StreamingFileFromStringFeatures.h>
00019 
00020 namespace shogun
00021 {
// Forward declarations of the shogun types named by the class declared below.
// CTokenizer and CHashedDocConverter are only held through pointers there, so
// a forward declaration is sufficient for them in this header.
// CStreamingDotFeatures is used as a base class, which requires the complete
// type; presumably that is supplied by the StreamingDotFeatures.h include
// above (TODO confirm), making its forward declaration here redundant.
00022 class CStreamingDotFeatures;
00023 class CTokenizer;
00024 class CHashedDocConverter;
00025 
/**
 * Streaming dot-features class derived from CStreamingDotFeatures.
 *
 * As state it keeps a CTokenizer pointer, a CHashedDocConverter pointer, a
 * CInputParser<char> held by value, the current example as a
 * SGSparseVector<float64_t>, the current label and a bit count (see the
 * protected members at the end of the class).
 *
 * NOTE(review): only declarations are visible in this header. Every remark
 * below that goes beyond the signature itself is inferred from names and is
 * marked as an assumption; confirm against the implementation file.
 */
00040 class CStreamingHashedDocDotFeatures : public CStreamingDotFeatures
00041 {
00042 public:
          /** Default constructor; takes no arguments. */
00044     CStreamingHashedDocDotFeatures();
00045 
          /**
           * Constructor taking a streaming file.
           *
           * @param file        CStreamingFile handed to the object; presumably the
           *                    source the examples are parsed from (TODO confirm)
           * @param is_labelled presumably whether the examples carry labels
           *                    (TODO confirm)
           * @param size        presumably a size for the parser's buffer
           *                    (TODO confirm)
           * @param tzer        tokenizer handed to the object
           * @param bits        defaults to 20; presumably the number of hash bits
           *                    that determines the feature-space dimension
           *                    (TODO confirm)
           */
00057     CStreamingHashedDocDotFeatures(CStreamingFile* file, bool is_labelled, int32_t size,
00058             CTokenizer* tzer, int32_t bits=20);
00059 
          /**
           * Constructor taking in-memory string features instead of a file.
           *
           * @param dot_features CStringFeatures<char> object; presumably the
           *                     documents to stream over (TODO confirm)
           * @param tzer         tokenizer handed to the object
           * @param bits         defaults to 20; presumably the number of hash bits
           *                     (TODO confirm)
           * @param lab          defaults to NULL; presumably an optional array of
           *                     labels, one per document (TODO confirm length and
           *                     ownership)
           */
00076     CStreamingHashedDocDotFeatures(CStringFeatures<char>* dot_features,CTokenizer* tzer,
00077             int32_t bits=20, float64_t* lab=NULL);
00078 
          /** Virtual destructor. */
00080     virtual ~CStreamingHashedDocDotFeatures();
00081 
          /**
           * Dot product with another streaming dot-features object.
           *
           * @param df the other CStreamingDotFeatures object
           * @return the result as float32_t; presumably computed on the current
           *         examples of both objects (TODO confirm)
           */
00088     virtual float32_t dot(CStreamingDotFeatures* df);
00089 
          /**
           * Dot product with a dense vector.
           *
           * @param vec2     pointer to the dense float32_t values (not modified,
           *                 it is a pointer to const)
           * @param vec2_len length passed alongside vec2
           * @return the result as float32_t
           */
00095     virtual float32_t dense_dot(const float32_t* vec2, int32_t vec2_len);
00096 
          /**
           * Presumably computes vec2 += alpha * (current example) (TODO confirm).
           *
           * @param alpha    scalar factor
           * @param vec2     dense float32_t vector, writable
           * @param vec2_len length passed alongside vec2
           * @param abs_val  defaults to false; presumably adds absolute values of
           *                 the example's entries when true (TODO confirm)
           *
           * NOTE(review): the default argument sits on a virtual function, so the
           * default used is chosen by the static type at the call site.
           */
00104     virtual void add_to_dense_vec(float32_t alpha, float32_t* vec2,
00105             int32_t vec2_len, bool abs_val=false);
00106 
          /**
           * @return the dimension of the feature space as int32_t; presumably
           *         derived from num_bits (TODO confirm)
           */
00114     virtual int32_t get_dim_feature_space() const;
00115 
          /** @return the name of this object as a C string. */
00121     virtual const char* get_name() const;
00122 
          /**
           * @return a vector count as int32_t. NOTE(review): what this means for a
           *         stream is not visible here; confirm in the implementation.
           */
00128     virtual int32_t get_num_vectors() const;
00129 
          /** @return a CFeatures pointer; presumably a copy of this object (TODO confirm). */
00135     virtual CFeatures* duplicate() const;
00136 
          /**
           * Presumably configures the parser to read unlabelled examples
           * (TODO confirm).
           */
00146     virtual void set_vector_reader();
00147 
          /**
           * Presumably configures the parser to read examples together with their
           * labels (TODO confirm).
           */
00157     virtual void set_vector_and_label_reader();
00158 
          /** @return the EFeatureType value reported by this class. */
00164     virtual EFeatureType get_feature_type() const;
00165 
          /** @return the EFeatureClass value reported by this class. */
00171     virtual EFeatureClass get_feature_class() const;
00172 
          /** Presumably starts the input parser (TODO confirm). */
00177     virtual void start_parser();
00178 
          /** Presumably stops the input parser (TODO confirm). */
00182     virtual void end_parser();
00183 
          /**
           * @return a label as float64_t; presumably the label of the current
           *         example, i.e. current_label (TODO confirm)
           */
00191     virtual float64_t get_label();
00192 
          /**
           * @return a bool; presumably true when another example was fetched and
           *         false when the stream is exhausted (TODO confirm)
           */
00198     virtual bool get_next_example();
00199 
          /** Presumably releases the example obtained last (TODO confirm). */
00205     virtual void release_example();
00206 
          /** @return a feature count as int32_t (non-const member, unlike the getters above). */
00212     virtual int32_t get_num_features();
00213 
          /**
           * @return a SGSparseVector<float64_t> by value; presumably the current
           *         example, i.e. current_vector (TODO confirm)
           */
00218     SGSparseVector<float64_t> get_vector();
00219 
          /**
           * @param normalize presumably switches normalization of the hashed
           *                  vectors on or off (TODO confirm)
           */
00224     void set_normalization(bool normalize);
00225 
          /**
           * @param k presumably the maximum number of skipped tokens (TODO confirm)
           * @param n presumably the n-gram size (TODO confirm)
           */
00233     void set_k_skip_n_grams(int32_t k, int32_t n);
00234 
00235 private:
          // Private helper whose parameters cover those of the file-based
          // constructor plus normalize / n_grams / skips; presumably the common
          // initialisation routine called by the constructors (TODO confirm).
00236     void init(CStreamingFile* file, bool is_labelled, int32_t size, CTokenizer* tzer,
00237         int32_t bits, bool normalize, int32_t n_grams, int32_t skips);
00238 
00239 protected:
00240 
          /** Bit count; presumably the `bits` constructor argument (TODO confirm). */
00242     int32_t num_bits;
00243 
          /** Sparse float64_t vector; presumably the current example (TODO confirm). */
00245     SGSparseVector<float64_t> current_vector;
00246 
          /** Tokenizer pointer; ownership is not visible from this header. */
00248     CTokenizer *tokenizer;
00249 
          /** Converter pointer; ownership is not visible from this header. */
00251     CHashedDocConverter* converter;
00252 
          /** Character input parser, held by value. */
00254     CInputParser<char> parser;
00255 
          /** Label value; presumably the label of the current example (TODO confirm). */
00257     float64_t current_label;
00258 };
00259 }
00260 
00261 #endif // _STREAMING_HASHEDDOCDOTFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation