SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #include <shogun/base/Parameter.h> 00012 #include <shogun/lib/DelimiterTokenizer.h> 00013 00014 namespace shogun 00015 { 00016 00017 CDelimiterTokenizer::CDelimiterTokenizer(bool skip_delimiters) : delimiters(256) 00018 { 00019 last_idx = 0; 00020 skip_consecutive_delimiters = skip_delimiters; 00021 init(); 00022 } 00023 00024 CDelimiterTokenizer::CDelimiterTokenizer(const CDelimiterTokenizer& orig) 00025 { 00026 CTokenizer::set_text(orig.text); 00027 delimiters = orig.delimiters; 00028 init(); 00029 } 00030 00031 void CDelimiterTokenizer::init() 00032 { 00033 SG_ADD(&last_idx, "last_idx", "Index of last token", 00034 MS_NOT_AVAILABLE); 00035 SG_ADD(&skip_consecutive_delimiters, "skip_consecutive_delimiters", 00036 "Whether to skip consecutive delimiters or not", MS_NOT_AVAILABLE); 00037 SGVector<bool>::fill_vector(delimiters, 256, 0); 00038 } 00039 00040 void CDelimiterTokenizer::set_text(SGVector<char> txt) 00041 { 00042 last_idx = 0; 00043 CTokenizer::set_text(txt); 00044 } 00045 00046 const char* CDelimiterTokenizer::get_name() const 00047 { 00048 return "DelimiterTokenizer"; 00049 } 00050 00051 bool CDelimiterTokenizer::has_next() 00052 { 00053 if (skip_consecutive_delimiters) 00054 { 00055 for (index_t i=last_idx; i<text.size(); i++) 00056 { 00057 if (! delimiters[(uint8_t) text[i]]) 00058 return true; 00059 } 00060 return false; 00061 } 00062 else 00063 return last_idx<text.size(); 00064 } 00065 00066 void CDelimiterTokenizer::init_for_whitespace() 00067 { 00068 clear_delimiters(); 00069 delimiters[' '] = 1; 00070 delimiters['\t'] = 1; 00071 } 00072 00073 void CDelimiterTokenizer::clear_delimiters() 00074 { 00075 memset(delimiters, 0, sizeof (delimiters)); 00076 } 00077 00078 index_t CDelimiterTokenizer::next_token_idx(index_t& start) 00079 { 00080 start = last_idx; 00081 00082 if (skip_consecutive_delimiters) 00083 { 00084 while(delimiters[(uint8_t) text[start]]) 00085 start++; 00086 } 00087 00088 if (! delimiters[(uint8_t) text[start]]) 00089 { 00090 for (last_idx=start+1; last_idx<text.size(); last_idx++) 00091 { 00092 if (delimiters[(uint8_t) text[last_idx]]) 00093 break; 00094 } 00095 } 00096 00097 return last_idx++; 00098 } 00099 00100 CDelimiterTokenizer* CDelimiterTokenizer::get_copy() 00101 { 00102 CDelimiterTokenizer* t = new CDelimiterTokenizer(); 00103 t->delimiters = delimiters; 00104 t->skip_consecutive_delimiters = skip_consecutive_delimiters; 00105 return t; 00106 } 00107 00108 void CDelimiterTokenizer::set_skip_delimiters(bool skip_delimiters) 00109 { 00110 skip_consecutive_delimiters = skip_delimiters; 00111 } 00112 00113 bool CDelimiterTokenizer::get_skip_delimiters() const 00114 { 00115 return skip_consecutive_delimiters; 00116 } 00117 }