SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StreamingStringFeatures.cpp
Go to the documentation of this file.
00001 #include <shogun/features/streaming/StreamingStringFeatures.h>
00002 
00003 namespace shogun
00004 {
00005 
00006 
00007 template <class T>
00008 CStreamingStringFeatures<T>::CStreamingStringFeatures() : CStreamingFeatures()
00009 {
00010     init();
00011     set_read_functions();
00012     remap_to_bin=false;
00013 }
00014 
00015 template <class T>
00016 CStreamingStringFeatures<T>::CStreamingStringFeatures(CStreamingFile* file,
00017              bool is_labelled,
00018              int32_t size)
00019     : CStreamingFeatures()
00020 {
00021     init(file, is_labelled, size);
00022     set_read_functions();
00023     remap_to_bin=false;
00024 }
00025 
00026 template <class T>
00027 CStreamingStringFeatures<T>::~CStreamingStringFeatures()
00028 {
00029     if (parser.is_running())
00030         parser.end_parser();
00031     SG_UNREF(alphabet);
00032 }
00033 
00034 template <class T>
00035 void CStreamingStringFeatures<T>::use_alphabet(EAlphabet alpha)
00036 {
00037     SG_UNREF(alphabet);
00038 
00039     alphabet=new CAlphabet(alpha);
00040     SG_REF(alphabet);
00041     num_symbols=alphabet->get_num_symbols();
00042 }
00043 
00044 template <class T>
00045 void CStreamingStringFeatures<T>::use_alphabet(CAlphabet* alpha)
00046 {
00047     SG_UNREF(alphabet);
00048 
00049     alphabet=new CAlphabet(alpha);
00050     SG_REF(alphabet);
00051     num_symbols=alphabet->get_num_symbols();
00052 }
00053 
00054 template <class T>
00055 void CStreamingStringFeatures<T>::set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet)
00056 {
00057     remap_to_bin=true;
00058     alpha_ascii=new CAlphabet(ascii_alphabet);
00059     alpha_bin=new CAlphabet(binary_alphabet);
00060 }
00061 
00062 template <class T>
00063 void CStreamingStringFeatures<T>::set_remap(EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00064 {
00065     remap_to_bin=true;
00066     alpha_ascii=new CAlphabet(ascii_alphabet);
00067     alpha_bin=new CAlphabet(binary_alphabet);
00068 }
00069 
00070 template <class T>
00071 CAlphabet* CStreamingStringFeatures<T>::get_alphabet()
00072 {
00073     SG_REF(alphabet);
00074     return alphabet;
00075 }
00076 
00077 template <class T>
00078 floatmax_t CStreamingStringFeatures<T>::get_num_symbols()
00079 {
00080     return num_symbols;
00081 }
00082 
00083 template <class T>
00084 CFeatures* CStreamingStringFeatures<T>::duplicate() const
00085 {
00086     return new CStreamingStringFeatures<T>(*this);
00087 }
00088 
00089 template <class T>
00090 int32_t CStreamingStringFeatures<T>::get_num_vectors() const
00091 {
00092     if (current_string)
00093         return 1;
00094     return 0;
00095 }
00096 
00097 template <class T>
00098 int32_t CStreamingStringFeatures<T>::get_num_features()
00099 {
00100     return current_length;
00101 }
00102 
00103 template <class T> void CStreamingStringFeatures<T>::set_vector_reader()
00104 {
00105     parser.set_read_vector(&CStreamingFile::get_string);
00106 }
00107 
00108 template <class T> void CStreamingStringFeatures<T>::set_vector_and_label_reader()
00109 {
00110     parser.set_read_vector_and_label
00111         (&CStreamingFile::get_string_and_label);
00112 }
00113 
00114 #define GET_FEATURE_TYPE(f_type, sg_type)               \
00115 template<> EFeatureType CStreamingStringFeatures<sg_type>::get_feature_type() const \
00116 {                                   \
00117     return f_type;                          \
00118 }
00119 
00120 GET_FEATURE_TYPE(F_BOOL, bool)
00121 GET_FEATURE_TYPE(F_CHAR, char)
00122 GET_FEATURE_TYPE(F_BYTE, uint8_t)
00123 GET_FEATURE_TYPE(F_BYTE, int8_t)
00124 GET_FEATURE_TYPE(F_SHORT, int16_t)
00125 GET_FEATURE_TYPE(F_WORD, uint16_t)
00126 GET_FEATURE_TYPE(F_INT, int32_t)
00127 GET_FEATURE_TYPE(F_UINT, uint32_t)
00128 GET_FEATURE_TYPE(F_LONG, int64_t)
00129 GET_FEATURE_TYPE(F_ULONG, uint64_t)
00130 GET_FEATURE_TYPE(F_SHORTREAL, float32_t)
00131 GET_FEATURE_TYPE(F_DREAL, float64_t)
00132 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t)
00133 #undef GET_FEATURE_TYPE
00134 
00135 
00136 template <class T>
00137 void CStreamingStringFeatures<T>::init()
00138 {
00139     working_file=NULL;
00140     alphabet=new CAlphabet();
00141 
00142     current_string=NULL;
00143     current_length=-1;
00144     current_sgstring.string=current_string;
00145     current_sgstring.slen=current_length;
00146 
00147     set_generic<T>();
00148 }
00149 
00150 template <class T>
00151 void CStreamingStringFeatures<T>::init(CStreamingFile* file,
00152                        bool is_labelled,
00153                        int32_t size)
00154 {
00155     init();
00156     has_labels=is_labelled;
00157     working_file=file;
00158     parser.init(file, is_labelled, size);
00159     parser.set_free_vector_after_release(false);
00160     parser.set_free_vectors_on_destruct(false);
00161 }
00162 
00163 template <class T>
00164 void CStreamingStringFeatures<T>::start_parser()
00165 {
00166     if (!remap_to_bin)
00167         alpha_ascii=alphabet;
00168 
00169     if (!parser.is_running())
00170         parser.start_parser();
00171 }
00172 
00173 template <class T>
00174 void CStreamingStringFeatures<T>::end_parser()
00175 {
00176     parser.end_parser();
00177 }
00178 
00179 template <class T>
00180 bool CStreamingStringFeatures<T>::get_next_example()
00181 {
00182     bool ret_value;
00183 
00184     ret_value = (bool) parser.get_next_example(current_string,
00185                            current_length,
00186                            current_label);
00187 
00188     if (!ret_value)
00189         return false;
00190 
00191     int32_t i;
00192     if (remap_to_bin)
00193     {
00194         alpha_ascii->add_string_to_histogram(current_string, current_length);
00195 
00196         for (i=0; i<current_length; i++)
00197             current_string[i]=alpha_ascii->remap_to_bin(current_string[i]);
00198         alpha_bin->add_string_to_histogram(current_string, current_length);
00199     }
00200     else
00201     {
00202         alpha_ascii->add_string_to_histogram(current_string, current_length);
00203     }
00204 
00205     /* Check the input using src alphabet, alpha_ascii */
00206     if ( !(alpha_ascii->check_alphabet_size() && alpha_ascii->check_alphabet()) )
00207     {
00208         SG_ERROR("StreamingStringFeatures: The given input was found to be incompatible with the alphabet!\n")
00209         return 0;
00210     }
00211 
00212     //SG_UNREF(alphabet);
00213 
00214     if (remap_to_bin)
00215         alphabet=alpha_bin;
00216     else
00217         alphabet=alpha_ascii;
00218 
00219     //SG_REF(alphabet);
00220     num_symbols=alphabet->get_num_symbols();
00221 
00222     return ret_value;
00223 }
00224 
00225 template <class T>
00226 SGString<T> CStreamingStringFeatures<T>::get_vector()
00227 {
00228     current_sgstring.string=current_string;
00229     current_sgstring.slen=current_length;
00230 
00231     return current_sgstring;
00232 }
00233 
00234 template <class T>
00235 float64_t CStreamingStringFeatures<T>::get_label()
00236 {
00237     ASSERT(has_labels)
00238 
00239     return current_label;
00240 }
00241 
00242 template <class T>
00243 void CStreamingStringFeatures<T>::release_example()
00244 {
00245     parser.finalize_example();
00246 }
00247 
00248 template <class T>
00249 int32_t CStreamingStringFeatures<T>::get_vector_length()
00250 {
00251     return current_length;
00252 }
00253 
00254 template <class T>
00255 EFeatureClass CStreamingStringFeatures<T>::get_feature_class() const
00256 {
00257     return C_STREAMING_STRING;
00258 }
00259 
00260 template class CStreamingStringFeatures<bool>;
00261 template class CStreamingStringFeatures<char>;
00262 template class CStreamingStringFeatures<int8_t>;
00263 template class CStreamingStringFeatures<uint8_t>;
00264 template class CStreamingStringFeatures<int16_t>;
00265 template class CStreamingStringFeatures<uint16_t>;
00266 template class CStreamingStringFeatures<int32_t>;
00267 template class CStreamingStringFeatures<uint32_t>;
00268 template class CStreamingStringFeatures<int64_t>;
00269 template class CStreamingStringFeatures<uint64_t>;
00270 template class CStreamingStringFeatures<float32_t>;
00271 template class CStreamingStringFeatures<float64_t>;
00272 template class CStreamingStringFeatures<floatmax_t>;
00273 
00274 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation