SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
HashedDocConverter.h
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #ifndef _HASHEDDOCCONVERTER__H__
00012 #define _HASHEDDOCCONVERTER__H__
00013 
00014 #include <shogun/converter/Converter.h>
00015 #include <shogun/features/Features.h>
00016 #include <shogun/lib/Tokenizer.h>
00017 #include <shogun/features/SparseFeatures.h>
00018 
00019 namespace shogun
00020 {
/* Forward declarations of the Shogun types named in this header.
 * NOTE(review): CConverter is used as a base class below, which needs the
 * complete type; that presumably comes from the Converter.h include earlier
 * in this file, making these declarations redundant but harmless - confirm. */
00021 class CFeatures;
00022 class CTokenizer;
00023 class CConverter;
00024 template<class T> class CSparseFeatures;
00025 
/** @brief Converter (subclass of CConverter) that turns character documents
 * into sparse float64 vectors.
 *
 * The class stores a tokenizer, a hash bit count, a normalization flag, an
 * n-gram count and a skip count (see the protected members at the bottom).
 * NOTE(review): judging by the names this is presumably a hashed
 * bag-of-words / k-skip n-gram representation; the implementation is not
 * visible in this header - confirm against the .cpp file.
 */
00037 class CHashedDocConverter : public CConverter
00038 {
00039 public:
          /** Default constructor; takes no arguments. The initial values of
           * the members are not visible in this header. */
00041     CHashedDocConverter();
00042 
          /** Constructor without an explicit tokenizer.
           *
           * @param hash_bits presumably the number of bits of the hash, i.e. the
           *        target dimension would be 2^hash_bits - TODO confirm
           * @param normalize normalization flag (defaults to false)
           * @param n_grams n-gram setting (defaults to 1)
           * @param skips skip setting (defaults to 0)
           */
00051     CHashedDocConverter(int32_t hash_bits, bool normalize = false, int32_t n_grams = 1, int32_t skips = 0);
00052 
          /** Constructor taking a tokenizer.
           *
           * @param tzer tokenizer to use; NOTE(review): ownership / reference
           *        counting of this pointer is not visible here - confirm
           * @param hash_bits presumably the number of bits of the hash - TODO confirm
           * @param normalize normalization flag (defaults to false)
           * @param n_grams n-gram setting (defaults to 1)
           * @param skips skip setting (defaults to 0)
           */
00061     CHashedDocConverter(CTokenizer* tzer, int32_t hash_bits, bool normalize = false, int32_t n_grams = 1,
00062         int32_t skips = 0);
00063 
          /** Virtual destructor. */
00065     virtual ~CHashedDocConverter();
00066 
          /** Converts a features object.
           *
           * @param features input features; NOTE(review): the concrete feature
           *        type expected (presumably string features of char) is not
           *        visible here - confirm
           * @return pointer to the resulting features object; ownership of the
           *         returned pointer is not visible here - confirm
           */
00072     virtual CFeatures* apply(CFeatures* features);
00073 
          /** Converts a single document.
           *
           * @param document the document as a vector of chars
           * @return sparse float64 vector produced for the document
           */
00079     SGSparseVector<float64_t> apply(SGVector<char> document);
00080 
          /** Static helper that works on a vector of token hashes and fills a
           * vector of n-gram hashes.
           *
           * NOTE(review): the exact meaning of the arguments below is inferred
           * from their names only - confirm against the implementation.
           *
           * @param hashes vector of hashes (passed by non-const reference)
           * @param hashes_start presumably the index in hashes to start from
           * @param len presumably the number of valid entries in hashes
           * @param ngram_hashes vector that presumably receives the generated hashes
           * @param num_bits presumably the number of hash bits to keep
           * @param ngrams n-gram setting
           * @param tokens_to_skip skip setting
           * @return an index_t; presumably the number of hashes generated
           */
00099     static index_t generate_ngram_hashes(SGVector<uint32_t>& hashes, index_t hashes_start, index_t len,
00100             SGVector<index_t>& ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip);
00101 
          /** @return a C string; presumably the name of this class - confirm */
00103     virtual const char* get_name() const;
00104 
          /** Setter for the normalization flag.
           *
           * @param normalize new value of the flag
           */
00109     void set_normalization(bool normalize);
00110 
          /** Setter for the k-skip n-gram configuration.
           *
           * @param k presumably the skip setting - TODO confirm
           * @param n presumably the n-gram setting - TODO confirm
           */
00118     void set_k_skip_n_grams(int32_t k, int32_t n);
00119 protected:
00120 
          /** Initialization helper taking the same kind of arguments as the
           * constructors. NOTE(review): d presumably corresponds to hash_bits;
           * whether the constructors call this is not visible here - confirm. */
00122     void init(CTokenizer* tzer, int32_t d, bool normalize, int32_t n_grams, int32_t skips);
00123 
          /** Counts indices in a dynamic array of hashed indices.
           *
           * @param hashed_indices array of hashed indices (non-const reference;
           *        whether it is modified, e.g. sorted, is not visible here)
           * @return presumably the number of distinct values - confirm
           */
00130     int32_t count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices);
00131 
          /** Builds a sparse float64 vector from a dynamic array of hashed indices.
           *
           * @param hashed_indices array of hashed indices (non-const reference)
           * @return the resulting sparse vector
           */
00138     SGSparseVector<float64_t> create_hashed_representation(CDynamicArray<uint32_t>& hashed_indices);
00139 
00140 protected:
00141 
          /** number of hash bits (presumably log2 of the target dimension - confirm) */
00143     int32_t num_bits;
00144 
          /** tokenizer used by this converter */
00146     CTokenizer* tokenizer;
00147 
          /** normalization flag */
00149     bool should_normalize;
00150 
          /** n-gram setting */
00152     int32_t ngrams;
00153 
          /** skip setting */
00155     int32_t tokens_to_skip;
00156 };
00157 }
00158 
00159 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation