SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #ifndef _HASHEDDOCDOTFEATURES__H__ 00012 #define _HASHEDDOCDOTFEATURES__H__ 00013 00014 #include <shogun/features/DotFeatures.h> 00015 #include <shogun/features/StringFeatures.h> 00016 #include <shogun/converter/HashedDocConverter.h> 00017 #include <shogun/lib/Tokenizer.h> 00018 00019 namespace shogun { 00020 template<class ST> class CStringFeatures; 00021 template<class ST> class SGMatrix; 00022 class CDotFeatures; 00023 class CHashedDocConverter; 00024 class CTokenizer; 00025 00036 class CHashedDocDotFeatures: public CDotFeatures 00037 { 00038 public: 00039 00050 CHashedDocDotFeatures(int32_t hash_bits=0, CStringFeatures<char>* docs=NULL, 00051 CTokenizer* tzer=NULL, bool normalize=true, int32_t n_grams=1, int32_t skips=0, int32_t size=0); 00052 00054 CHashedDocDotFeatures(const CHashedDocDotFeatures& orig); 00055 00060 CHashedDocDotFeatures(CFile* loader); 00061 00063 virtual ~CHashedDocDotFeatures(); 00064 00072 virtual int32_t get_dim_feature_space() const; 00073 00081 virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2); 00082 00088 virtual float64_t dense_dot_sgvec(int32_t vec_idx1, const SGVector<float64_t> vec2); 00089 00096 virtual float64_t dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len); 00097 00106 virtual void add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val=false); 00107 00115 virtual int32_t get_nnz_features_for_vector(int32_t num); 00116 00127 virtual void* get_feature_iterator(int32_t vector_index); 00128 00140 virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator); 00141 00148 virtual void free_feature_iterator(void* iterator); 00149 00154 void set_doc_collection(CStringFeatures<char>* docs); 00155 00156 virtual const char* get_name() const; 00157 00162 virtual CFeatures* duplicate() const; 00163 00168 virtual EFeatureType get_feature_type() const; 00169 00174 virtual EFeatureClass get_feature_class() const; 00175 00180 virtual int32_t get_num_vectors() const; 00181 00190 static uint32_t calculate_token_hash(char* token, int32_t length, 00191 int32_t num_bits, uint32_t seed); 00192 00193 private: 00194 void init(int32_t hash_bits, CStringFeatures<char>* docs, CTokenizer* tzer, 00195 bool normalize, int32_t n_grams, int32_t skips); 00196 00197 protected: 00199 CStringFeatures<char>* doc_collection; 00200 00202 int32_t num_bits; 00203 00205 CTokenizer* tokenizer; 00206 00208 bool should_normalize; 00209 00211 int32_t ngrams; 00212 00214 int32_t tokens_to_skip; 00215 }; 00216 } 00217 00218 #endif