SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #ifndef _DELIMITERTOKENIZER__H__ 00012 #define _DELIMITERTOKENIZER__H__ 00013 00014 #include <shogun/lib/Tokenizer.h> 00015 00016 namespace shogun 00017 { 00018 class CTokenizer; 00019 00026 class CDelimiterTokenizer: public CTokenizer 00027 { 00028 public: 00033 CDelimiterTokenizer(bool skip_delimiters = false); 00034 00039 CDelimiterTokenizer(const CDelimiterTokenizer& orig); 00040 00042 virtual ~CDelimiterTokenizer() {} 00043 00048 virtual void set_text(SGVector<char> txt); 00049 00055 virtual bool has_next(); 00056 00065 virtual index_t next_token_idx(index_t& start); 00066 00072 virtual const char* get_name() const; 00073 00077 void init_for_whitespace(); 00078 00079 CDelimiterTokenizer* get_copy(); 00080 00082 void clear_delimiters(); 00083 00088 bool get_skip_delimiters() const; 00089 00094 void set_skip_delimiters(bool skip_delimiters); 00095 00096 private: 00097 void init(); 00098 00099 public: 00101 SGVector<bool> delimiters; 00102 00103 protected: 00105 index_t last_idx; 00106 00108 bool skip_consecutive_delimiters; 00109 }; 00110 } 00111 #endif /* _WHITESPACETOKENIZER__H__ */ 00112