// SHOGUN v3.2.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #ifndef _NGRAMTOKENIZER__H__ 00012 #define _NGRAMTOKENIZER__H__ 00013 00014 #include <shogun/lib/Tokenizer.h> 00015 00016 namespace shogun 00017 { 00018 class CTokenizer; 00019 00023 class CNGramTokenizer: public CTokenizer 00024 { 00025 public: 00030 CNGramTokenizer(int32_t ns=3); 00031 00036 CNGramTokenizer(const CNGramTokenizer& orig); 00037 00039 virtual ~CNGramTokenizer() {} 00040 00045 virtual void set_text(SGVector<char> txt); 00046 00052 virtual bool has_next(); 00053 00060 virtual index_t next_token_idx(index_t& start); 00061 00067 virtual const char* get_name() const; 00068 00069 virtual CNGramTokenizer* get_copy(); 00070 00071 private: 00072 void init(); 00073 00074 protected: 00075 00077 int32_t n; 00078 00080 index_t last_idx; 00081 }; 00082 } 00083 #endif /* _NGRAMTOKENIZER__H__ */ 00084