SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #include <shogun/lib/NGramTokenizer.h> 00012 #include <shogun/base/Parameter.h> 00013 00014 namespace shogun 00015 { 00016 00017 CNGramTokenizer::CNGramTokenizer(int32_t ns) : CTokenizer() 00018 { 00019 n = ns; 00020 last_idx = 0; 00021 init(); 00022 } 00023 00024 CNGramTokenizer::CNGramTokenizer(const CNGramTokenizer& orig) 00025 : CTokenizer(orig) 00026 { 00027 CTokenizer::set_text(orig.text); 00028 n = orig.n; 00029 init(); 00030 } 00031 00032 void CNGramTokenizer::init() 00033 { 00034 SG_ADD(&n, "n", "Size of n-grams", 00035 MS_NOT_AVAILABLE); 00036 SG_ADD(&last_idx, "last_idx", "Index of last token", 00037 MS_NOT_AVAILABLE); 00038 } 00039 00040 void CNGramTokenizer::set_text(SGVector<char> txt) 00041 { 00042 last_idx = 0; 00043 CTokenizer::set_text(txt); 00044 } 00045 00046 const char* CNGramTokenizer::get_name() const 00047 { 00048 return "NGramTokenizer"; 00049 } 00050 00051 bool CNGramTokenizer::has_next() 00052 { 00053 return last_idx<=text.size()-n; 00054 } 00055 00056 index_t CNGramTokenizer::next_token_idx(index_t& start) 00057 { 00058 start = last_idx++; 00059 return start + n; 00060 } 00061 00062 CNGramTokenizer* CNGramTokenizer::get_copy() 00063 { 00064 CNGramTokenizer* t = new CNGramTokenizer(n); 00065 return t; 00066 } 00067 }