// SHOGUN v3.2.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #ifndef _HASHEDDOCCONVERTER__H__ 00012 #define _HASHEDDOCCONVERTER__H__ 00013 00014 #include <shogun/converter/Converter.h> 00015 #include <shogun/features/Features.h> 00016 #include <shogun/lib/Tokenizer.h> 00017 #include <shogun/features/SparseFeatures.h> 00018 00019 namespace shogun 00020 { 00021 class CFeatures; 00022 class CTokenizer; 00023 class CConverter; 00024 template<class T> class CSparseFeatures; 00025 00037 class CHashedDocConverter : public CConverter 00038 { 00039 public: 00041 CHashedDocConverter(); 00042 00051 CHashedDocConverter(int32_t hash_bits, bool normalize = false, int32_t n_grams = 1, int32_t skips = 0); 00052 00061 CHashedDocConverter(CTokenizer* tzer, int32_t hash_bits, bool normalize = false, int32_t n_grams = 1, 00062 int32_t skips = 0); 00063 00065 virtual ~CHashedDocConverter(); 00066 00072 virtual CFeatures* apply(CFeatures* features); 00073 00079 SGSparseVector<float64_t> apply(SGVector<char> document); 00080 00099 static index_t generate_ngram_hashes(SGVector<uint32_t>& hashes, index_t hashes_start, index_t len, 00100 SGVector<index_t>& ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip); 00101 00103 virtual const char* get_name() const; 00104 00109 void set_normalization(bool normalize); 00110 00118 void set_k_skip_n_grams(int32_t k, int32_t n); 00119 protected: 00120 00122 void init(CTokenizer* tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips); 00123 00130 int32_t count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices); 00131 00138 SGSparseVector<float64_t> 
create_hashed_representation(CDynamicArray<uint32_t>& hashed_indices); 00139 00140 protected: 00141 00143 int32_t num_bits; 00144 00146 CTokenizer* tokenizer; 00147 00149 bool should_normalize; 00150 00152 int32_t ngrams; 00153 00155 int32_t tokens_to_skip; 00156 }; 00157 } 00158 00159 #endif