SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
NGramTokenizer.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #include <shogun/lib/NGramTokenizer.h>
00012 #include <shogun/base/Parameter.h>
00013 
00014 namespace shogun
00015 {
00016 
00017 CNGramTokenizer::CNGramTokenizer(int32_t ns) : CTokenizer()
00018 {
00019     n = ns;
00020     last_idx = 0;
00021     init();
00022 }
00023 
00024 CNGramTokenizer::CNGramTokenizer(const CNGramTokenizer& orig)
00025 : CTokenizer(orig)
00026 {
00027     CTokenizer::set_text(orig.text);
00028     n = orig.n;
00029     init();
00030 }
00031 
00032 void CNGramTokenizer::init()
00033 {
00034     SG_ADD(&n, "n", "Size of n-grams",
00035         MS_NOT_AVAILABLE);
00036     SG_ADD(&last_idx, "last_idx", "Index of last token",
00037         MS_NOT_AVAILABLE);
00038 }
00039 
00040 void CNGramTokenizer::set_text(SGVector<char> txt)
00041 {
00042     last_idx = 0;
00043     CTokenizer::set_text(txt);
00044 }
00045 
00046 const char* CNGramTokenizer::get_name() const
00047 {
00048     return "NGramTokenizer";
00049 }
00050 
00051 bool CNGramTokenizer::has_next()
00052 {
00053     return last_idx<=text.size()-n;
00054 }
00055 
00056 index_t CNGramTokenizer::next_token_idx(index_t& start)
00057 {
00058     start = last_idx++;
00059     return start + n;
00060 }
00061 
00062 CNGramTokenizer* CNGramTokenizer::get_copy()
00063 {
00064     CNGramTokenizer* t = new CNGramTokenizer(n);
00065     return t;
00066 }
00067 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation