SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
NGramTokenizer.h
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #ifndef _NGRAMTOKENIZER__H__
00012 #define _NGRAMTOKENIZER__H__
00013 
00014 #include <shogun/lib/Tokenizer.h>
00015 
00016 namespace shogun
00017 {
00018 class CTokenizer;
00019 
/**
 * Tokenizer declared as a public subclass of CTokenizer.
 *
 * Only the declaration is visible in this header; every non-trivial member
 * is defined elsewhere. Judging by the class name and the `n` member this
 * presumably splits text into n-grams -- TODO confirm against the
 * implementation file.
 */
00023 class CNGramTokenizer: public CTokenizer
00024 {
00025 public:
          /**
           * Constructor.
           *
           * Not declared `explicit`, so an int32_t converts implicitly to a
           * CNGramTokenizer.
           *
           * @param ns defaults to 3; presumably the n-gram size kept in `n`
           *           -- TODO confirm in the definition
           */
00030     CNGramTokenizer(int32_t ns=3);
00031 
          /**
           * Copy constructor.
           *
           * @param orig tokenizer to copy from
           */
00036     CNGramTokenizer(const CNGramTokenizer& orig);
00037 
          /** Virtual destructor; the body is empty. */
00039     virtual ~CNGramTokenizer() {}
00040 
          /**
           * Sets the text to work on.
           *
           * @param txt character vector, taken by value
           */
00045     virtual void set_text(SGVector<char> txt);
00046 
          /**
           * @return a bool; presumably true while further tokens remain
           *         -- TODO confirm in the definition
           */
00052     virtual bool has_next();
00053 
          /**
           * Index query for the next token.
           *
           * @param start non-const reference, so the callee is able to write
           *              to it; presumably receives the token's start index
           * @return an index_t; presumably the token's end index -- whether
           *         it is inclusive or exclusive is not visible here
           */
00060     virtual index_t next_token_idx(index_t& start);
00061 
          /** @return C string naming this class (const member function) */
00067     virtual const char* get_name() const;
00068 
          /**
           * @return pointer to a CNGramTokenizer; presumably a newly created
           *         copy of this object. Ownership / reference counting of
           *         the result is not visible here -- verify before use.
           */
00069     virtual CNGramTokenizer* get_copy();
00070 
00071 private:
          /** Private helper, defined elsewhere; presumably shared constructor setup. */
00072     void init();
00073 
00074 protected:
00075 
          /** presumably the n-gram size (see constructor argument `ns`) -- TODO confirm */
00077     int32_t n;
00078 
          /** an index into the text; presumably where the previous token started -- TODO confirm */
00080     index_t last_idx;
00081 };
00082 }
00083 #endif  /* _NGRAMTOKENIZER__H__ */
00084 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation