SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
HashedDocDotFeatures.h
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #ifndef _HASHEDDOCDOTFEATURES__H__
00012 #define _HASHEDDOCDOTFEATURES__H__
00013 
00014 #include <shogun/features/DotFeatures.h>
00015 #include <shogun/features/StringFeatures.h>
00016 #include <shogun/converter/HashedDocConverter.h>
00017 #include <shogun/lib/Tokenizer.h>
00018 
00019 namespace shogun {
      /* Forward declarations of types referenced by the class below.
       * NOTE(review): the headers included at the top of this file presumably
       * already declare most of these, which would make these lines redundant
       * but harmless - confirm before removing any of them. SGVector and CFile
       * are used below without a forward declaration here, so they must come
       * from the included headers. */
00020 template<class ST> class CStringFeatures;
00021 template<class ST> class SGMatrix;
00022 class CDotFeatures;
00023 class CHashedDocConverter;
00024 class CTokenizer;
00025 
      /** @brief CDotFeatures subclass declared over a collection of character
       * documents (CStringFeatures<char>).
       *
       * The object keeps a pointer to the document collection, a tokenizer, a
       * number of hash bits, a normalization flag and n-gram / skip settings
       * (see the protected members at the end of the class).
       *
       * NOTE(review): the class name and members suggest that documents are
       * tokenized and hashed on the fly rather than stored as explicit
       * vectors; the implementation is not part of this header, so confirm
       * against the corresponding .cpp file.
       */
00036 class CHashedDocDotFeatures: public CDotFeatures
00037 {
00038 public:
00039 
          /** Constructor; every argument has a default, so this also serves as
           * the default constructor.
           *
           * @param hash_bits number of hash bits (default 0); presumably stored
           *        in num_bits
           * @param docs document collection (default NULL)
           * @param tzer tokenizer (default NULL)
           * @param normalize normalization flag (default true)
           * @param n_grams n-gram setting (default 1)
           * @param skips skip setting (default 0)
           * @param size default 0. NOTE(review): init() has no matching
           *        parameter and no member declared here stores it - presumably
           *        it is forwarded to the CDotFeatures base class; confirm.
           */
00050     CHashedDocDotFeatures(int32_t hash_bits=0, CStringFeatures<char>* docs=NULL,
00051             CTokenizer* tzer=NULL, bool normalize=true, int32_t n_grams=1, int32_t skips=0, int32_t size=0);
00052 
          /** Copy constructor.
           *
           * @param orig object to copy from
           */
00054     CHashedDocDotFeatures(const CHashedDocDotFeatures& orig);
00055 
          /** Constructor taking a CFile.
           *
           * NOTE(review): what is read from the file, if anything, is decided
           * by the implementation, which is not visible here.
           *
           * @param loader file object
           */
00060     CHashedDocDotFeatures(CFile* loader);
00061 
          /** Virtual destructor. */
00063     virtual ~CHashedDocDotFeatures();
00064 
          /** @return dimensionality of the feature space.
           * NOTE(review): presumably derived from num_bits - confirm.
           */
00072     virtual int32_t get_dim_feature_space() const;
00073 
          /** Dot product involving a vector of this object and a vector of
           * another CDotFeatures object (presumably this[vec_idx1] . df[vec_idx2];
           * confirm against the CDotFeatures contract).
           *
           * @param vec_idx1 vector index, presumably into this object
           * @param df other feature object
           * @param vec_idx2 vector index, presumably into df
           * @return the dot product as float64_t
           */
00081     virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2);
00082 
          /** Dot product of vector vec_idx1 with a dense SGVector.
           *
           * @param vec_idx1 vector index
           * @param vec2 dense vector, passed by value
           * @return the dot product as float64_t
           */
00088     virtual float64_t dense_dot_sgvec(int32_t vec_idx1, const SGVector<float64_t> vec2);
00089 
          /** Dot product of vector vec_idx1 with a dense vector given as a raw
           * pointer plus length.
           *
           * @param vec_idx1 vector index
           * @param vec2 pointer to the dense vector data
           * @param vec2_len length of vec2
           * @return the dot product as float64_t
           */
00096     virtual float64_t dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len);
00097 
          /** Adds a multiple of vector vec_idx1 into a dense vector
           * (presumably vec2 += alpha * this[vec_idx1]; confirm).
           *
           * @param alpha scalar factor
           * @param vec_idx1 vector index
           * @param vec2 dense vector that is written to
           * @param vec2_len length of vec2
           * @param abs_val default false; presumably selects adding absolute
           *        values - confirm
           */
00106     virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val=false);
00107 
          /** @param num vector index
           * @return a count of non-zero features for that vector (per the
           *         method name; exact counting rules are in the implementation)
           */
00115     virtual int32_t get_nnz_features_for_vector(int32_t num);
00116 
          /** Creates an opaque feature iterator for one vector.
           *
           * NOTE(review): presumably the returned pointer has to be handed to
           * get_next_feature() and released with free_feature_iterator().
           *
           * @param vector_index vector index
           * @return opaque iterator handle
           */
00127     virtual void* get_feature_iterator(int32_t vector_index);
00128 
          /** Advances an iterator and reports the next feature through the
           * reference parameters.
           *
           * @param index output: feature index
           * @param value output: feature value
           * @param iterator handle, presumably from get_feature_iterator()
           * @return bool; presumably false once no features are left - confirm
           */
00140     virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator);
00141 
          /** Releases an iterator handle.
           *
           * @param iterator handle, presumably from get_feature_iterator()
           */
00148     virtual void free_feature_iterator(void* iterator);
00149 
          /** Sets the document collection.
           *
           * NOTE(review): whether the previous collection is released and how
           * the new one is reference-counted is not visible here.
           *
           * @param docs new document collection
           */
00154     void set_doc_collection(CStringFeatures<char>* docs);
00155 
          /** @return name of this object as a C string (presumably the class name) */
00156     virtual const char* get_name() const;
00157 
          /** @return a CFeatures pointer, presumably to a copy of this object;
           *          ownership of the result is not visible from this header
           */
00162     virtual CFeatures* duplicate() const;
00163 
          /** @return the EFeatureType of this object */
00168     virtual EFeatureType get_feature_type() const;
00169 
          /** @return the EFeatureClass of this object */
00174     virtual EFeatureClass get_feature_class() const;
00175 
          /** @return number of vectors (presumably the number of documents in
           *          doc_collection - confirm)
           */
00180     virtual int32_t get_num_vectors() const;
00181 
          /** Static helper that computes a hash for one token; it needs no
           * object instance.
           *
           * @param token pointer to the token characters
           * @param length number of characters in token
           * @param num_bits number of hash bits
           * @param seed hash seed
           * @return the hash as uint32_t (presumably limited to num_bits bits;
           *         confirm in the implementation)
           */
00190     static uint32_t calculate_token_hash(char* token, int32_t length,
00191             int32_t num_bits, uint32_t seed);
00192 
00193 private:
          /** Shared initialisation helper. It takes the same arguments as the
           * public constructor except for `size`, which has no counterpart here.
           */
00194     void init(int32_t hash_bits, CStringFeatures<char>* docs, CTokenizer* tzer,
00195         bool normalize, int32_t n_grams, int32_t skips);
00196 
00197 protected:
          /** the document collection */
00199     CStringFeatures<char>* doc_collection;
00200 
          /** number of hash bits */
00202     int32_t num_bits;
00203 
          /** the tokenizer */
00205     CTokenizer* tokenizer;
00206 
          /** normalization flag */
00208     bool should_normalize;
00209 
          /** n-gram setting */
00211     int32_t ngrams;
00212 
          /** skip setting */
00214     int32_t tokens_to_skip;
00215 };
00216 } // namespace shogun
00217 
00218 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation