SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
DelimiterTokenizer.h
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #ifndef _DELIMITERTOKENIZER__H__
00012 #define _DELIMITERTOKENIZER__H__
00013 
00014 #include <shogun/lib/Tokenizer.h>
00015 
00016 namespace shogun
00017 {
      // NOTE(review): CTokenizer is used as a base class below, so it must
      // already be a complete type through the Tokenizer.h include above;
      // this forward declaration therefore looks redundant.
00018 class CTokenizer;
00019 
      /** @brief Tokenizer derived from CTokenizer that keeps a public boolean
       * vector `delimiters`, a position `last_idx` and a
       * `skip_consecutive_delimiters` flag.
       *
       * NOTE(review): judging by the names, the text is split at characters
       * flagged in `delimiters`; the implementation is not part of this
       * header, so confirm the details there.
       */
00026 class CDelimiterTokenizer: public CTokenizer
00027 {
00028 public:
          /** Constructor; doubles as the default constructor because the
           * argument defaults to false.
           *
           * @param skip_delimiters presumably the initial value of
           *        skip_consecutive_delimiters - confirm in the implementation
           */
00033     CDelimiterTokenizer(bool skip_delimiters = false);
00034 
          /** Copy constructor.
           *
           * @param orig tokenizer to copy from
           */
00039     CDelimiterTokenizer(const CDelimiterTokenizer& orig);
00040 
          /** Virtual destructor with an empty body. */
00042     virtual ~CDelimiterTokenizer() {}
00043 
          /** Hands a character vector to the tokenizer.
           *
           * @param txt character vector, taken by value; presumably the text
           *        that subsequent calls tokenize
           */
00048     virtual void set_text(SGVector<char> txt);
00049 
          /** @return presumably true while the current text still holds
           *         another token - confirm in the implementation
           */
00055     virtual bool has_next();
00056 
          /** Reports the position of the next token.
           *
           * @param start taken by non-const reference, so it can serve as an
           *        output; presumably receives the token's first index
           * @return an index_t, presumably the token's end index; whether it
           *         is inclusive or exclusive is not visible in this header
           */
00065     virtual index_t next_token_idx(index_t& start);
00066 
          /** @return a C string naming the object; presumably the class name */
00072     virtual const char* get_name() const;
00073 
          /** Presumably marks whitespace characters as delimiters; the exact
           * set of characters is not visible in this header.
           */
00077     void init_for_whitespace();
00078 
          /** @return pointer to a CDelimiterTokenizer, presumably a newly
           *         created copy of this object. NOTE(review): ownership of
           *         the returned pointer should be confirmed.
           */
00079     CDelimiterTokenizer* get_copy();
00080 
          /** Presumably resets `delimiters` so that no character is marked. */
00082     void clear_delimiters();
00083 
          /** @return presumably the current value of
           *         skip_consecutive_delimiters
           */
00088     bool get_skip_delimiters() const;
00089 
          /** @param skip_delimiters presumably the new value for
           *        skip_consecutive_delimiters
           */
00094     void set_skip_delimiters(bool skip_delimiters);
00095 
00096 private:
          /** Private helper; presumably the initialisation shared by the
           * constructors.
           */
00097     void init();
00098 
00099 public:
          /** Boolean vector exposed publicly, so callers can modify it
           * directly. NOTE(review): presumably indexed by character code,
           * with true marking a delimiter - confirm.
           */
00101     SGVector<bool> delimiters;
00102 
00103 protected:
          /** Position state kept between calls; presumably the index reached
           * by the most recent token.
           */
00105     index_t last_idx;
00106 
          /** Flag that presumably makes runs of adjacent delimiters count as
           * a single separator.
           */
00108     bool skip_consecutive_delimiters;
00109 };
00110 }
00111 #endif  /* _DELIMITERTOKENIZER__H__ */
00112 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation