SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
DelimiterTokenizer.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #include <shogun/base/Parameter.h>
00012 #include <shogun/lib/DelimiterTokenizer.h>
00013 
00014 namespace shogun
00015 {
00016 
00017 CDelimiterTokenizer::CDelimiterTokenizer(bool skip_delimiters) : delimiters(256)
00018 {
00019     last_idx = 0;
00020     skip_consecutive_delimiters = skip_delimiters;
00021     init();
00022 }
00023 
00024 CDelimiterTokenizer::CDelimiterTokenizer(const CDelimiterTokenizer& orig)
00025 {
00026     CTokenizer::set_text(orig.text);
00027     delimiters = orig.delimiters;
00028     init();
00029 }
00030 
00031 void CDelimiterTokenizer::init()
00032 {
00033     SG_ADD(&last_idx, "last_idx", "Index of last token",
00034         MS_NOT_AVAILABLE);
00035     SG_ADD(&skip_consecutive_delimiters, "skip_consecutive_delimiters",
00036         "Whether to skip consecutive delimiters or not", MS_NOT_AVAILABLE);
00037     SGVector<bool>::fill_vector(delimiters, 256, 0);
00038 }
00039 
00040 void CDelimiterTokenizer::set_text(SGVector<char> txt)
00041 {
00042     last_idx = 0;
00043     CTokenizer::set_text(txt);
00044 }
00045 
00046 const char* CDelimiterTokenizer::get_name() const
00047 {
00048     return "DelimiterTokenizer";
00049 }
00050 
00051 bool CDelimiterTokenizer::has_next()
00052 {
00053     if (skip_consecutive_delimiters)
00054     {
00055         for (index_t i=last_idx; i<text.size(); i++)
00056         {
00057             if (! delimiters[(uint8_t) text[i]])
00058                 return true;
00059         }
00060         return false;
00061     }
00062     else
00063         return last_idx<text.size();
00064 }
00065 
00066 void CDelimiterTokenizer::init_for_whitespace()
00067 {
00068     clear_delimiters();
00069     delimiters[' '] = 1;
00070     delimiters['\t'] = 1;
00071 }
00072 
00073 void CDelimiterTokenizer::clear_delimiters()
00074 {
00075     memset(delimiters, 0, sizeof (delimiters));
00076 }
00077 
00078 index_t CDelimiterTokenizer::next_token_idx(index_t& start)
00079 {
00080     start = last_idx;
00081 
00082     if (skip_consecutive_delimiters)
00083     {
00084         while(delimiters[(uint8_t) text[start]])
00085             start++;
00086     }
00087 
00088     if (! delimiters[(uint8_t) text[start]])
00089     {
00090         for (last_idx=start+1; last_idx<text.size(); last_idx++)
00091         {
00092             if (delimiters[(uint8_t) text[last_idx]])
00093                 break;
00094         }
00095     }
00096 
00097     return last_idx++;
00098 }
00099 
00100 CDelimiterTokenizer* CDelimiterTokenizer::get_copy()
00101 {
00102     CDelimiterTokenizer* t = new CDelimiterTokenizer();
00103     t->delimiters = delimiters;
00104     t->skip_consecutive_delimiters = skip_consecutive_delimiters;
00105     return t;
00106 }
00107 
00108 void CDelimiterTokenizer::set_skip_delimiters(bool skip_delimiters)
00109 {
00110     skip_consecutive_delimiters = skip_delimiters;
00111 }
00112 
00113 bool CDelimiterTokenizer::get_skip_delimiters() const
00114 {
00115     return skip_consecutive_delimiters;
00116 }
00117 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation