SHOGUN
v3.2.0
|
00001 #include <shogun/features/streaming/StreamingStringFeatures.h> 00002 00003 namespace shogun 00004 { 00005 00006 00007 template <class T> 00008 CStreamingStringFeatures<T>::CStreamingStringFeatures() : CStreamingFeatures() 00009 { 00010 init(); 00011 set_read_functions(); 00012 remap_to_bin=false; 00013 } 00014 00015 template <class T> 00016 CStreamingStringFeatures<T>::CStreamingStringFeatures(CStreamingFile* file, 00017 bool is_labelled, 00018 int32_t size) 00019 : CStreamingFeatures() 00020 { 00021 init(file, is_labelled, size); 00022 set_read_functions(); 00023 remap_to_bin=false; 00024 } 00025 00026 template <class T> 00027 CStreamingStringFeatures<T>::~CStreamingStringFeatures() 00028 { 00029 if (parser.is_running()) 00030 parser.end_parser(); 00031 SG_UNREF(alphabet); 00032 } 00033 00034 template <class T> 00035 void CStreamingStringFeatures<T>::use_alphabet(EAlphabet alpha) 00036 { 00037 SG_UNREF(alphabet); 00038 00039 alphabet=new CAlphabet(alpha); 00040 SG_REF(alphabet); 00041 num_symbols=alphabet->get_num_symbols(); 00042 } 00043 00044 template <class T> 00045 void CStreamingStringFeatures<T>::use_alphabet(CAlphabet* alpha) 00046 { 00047 SG_UNREF(alphabet); 00048 00049 alphabet=new CAlphabet(alpha); 00050 SG_REF(alphabet); 00051 num_symbols=alphabet->get_num_symbols(); 00052 } 00053 00054 template <class T> 00055 void CStreamingStringFeatures<T>::set_remap(CAlphabet* ascii_alphabet, CAlphabet* binary_alphabet) 00056 { 00057 remap_to_bin=true; 00058 alpha_ascii=new CAlphabet(ascii_alphabet); 00059 alpha_bin=new CAlphabet(binary_alphabet); 00060 } 00061 00062 template <class T> 00063 void CStreamingStringFeatures<T>::set_remap(EAlphabet ascii_alphabet, EAlphabet binary_alphabet) 00064 { 00065 remap_to_bin=true; 00066 alpha_ascii=new CAlphabet(ascii_alphabet); 00067 alpha_bin=new CAlphabet(binary_alphabet); 00068 } 00069 00070 template <class T> 00071 CAlphabet* CStreamingStringFeatures<T>::get_alphabet() 00072 { 00073 SG_REF(alphabet); 00074 return alphabet; 00075 } 00076 00077 template <class T> 00078 floatmax_t CStreamingStringFeatures<T>::get_num_symbols() 00079 { 00080 return num_symbols; 00081 } 00082 00083 template <class T> 00084 CFeatures* CStreamingStringFeatures<T>::duplicate() const 00085 { 00086 return new CStreamingStringFeatures<T>(*this); 00087 } 00088 00089 template <class T> 00090 int32_t CStreamingStringFeatures<T>::get_num_vectors() const 00091 { 00092 if (current_string) 00093 return 1; 00094 return 0; 00095 } 00096 00097 template <class T> 00098 int32_t CStreamingStringFeatures<T>::get_num_features() 00099 { 00100 return current_length; 00101 } 00102 00103 template <class T> void CStreamingStringFeatures<T>::set_vector_reader() 00104 { 00105 parser.set_read_vector(&CStreamingFile::get_string); 00106 } 00107 00108 template <class T> void CStreamingStringFeatures<T>::set_vector_and_label_reader() 00109 { 00110 parser.set_read_vector_and_label 00111 (&CStreamingFile::get_string_and_label); 00112 } 00113 00114 #define GET_FEATURE_TYPE(f_type, sg_type) \ 00115 template<> EFeatureType CStreamingStringFeatures<sg_type>::get_feature_type() const \ 00116 { \ 00117 return f_type; \ 00118 } 00119 00120 GET_FEATURE_TYPE(F_BOOL, bool) 00121 GET_FEATURE_TYPE(F_CHAR, char) 00122 GET_FEATURE_TYPE(F_BYTE, uint8_t) 00123 GET_FEATURE_TYPE(F_BYTE, int8_t) 00124 GET_FEATURE_TYPE(F_SHORT, int16_t) 00125 GET_FEATURE_TYPE(F_WORD, uint16_t) 00126 GET_FEATURE_TYPE(F_INT, int32_t) 00127 GET_FEATURE_TYPE(F_UINT, uint32_t) 00128 GET_FEATURE_TYPE(F_LONG, int64_t) 00129 GET_FEATURE_TYPE(F_ULONG, uint64_t) 00130 GET_FEATURE_TYPE(F_SHORTREAL, float32_t) 00131 GET_FEATURE_TYPE(F_DREAL, float64_t) 00132 GET_FEATURE_TYPE(F_LONGREAL, floatmax_t) 00133 #undef GET_FEATURE_TYPE 00134 00135 00136 template <class T> 00137 void CStreamingStringFeatures<T>::init() 00138 { 00139 working_file=NULL; 00140 alphabet=new CAlphabet(); 00141 00142 current_string=NULL; 00143 current_length=-1; 00144 current_sgstring.string=current_string; 00145 current_sgstring.slen=current_length; 00146 00147 set_generic<T>(); 00148 } 00149 00150 template <class T> 00151 void CStreamingStringFeatures<T>::init(CStreamingFile* file, 00152 bool is_labelled, 00153 int32_t size) 00154 { 00155 init(); 00156 has_labels=is_labelled; 00157 working_file=file; 00158 parser.init(file, is_labelled, size); 00159 parser.set_free_vector_after_release(false); 00160 parser.set_free_vectors_on_destruct(false); 00161 } 00162 00163 template <class T> 00164 void CStreamingStringFeatures<T>::start_parser() 00165 { 00166 if (!remap_to_bin) 00167 alpha_ascii=alphabet; 00168 00169 if (!parser.is_running()) 00170 parser.start_parser(); 00171 } 00172 00173 template <class T> 00174 void CStreamingStringFeatures<T>::end_parser() 00175 { 00176 parser.end_parser(); 00177 } 00178 00179 template <class T> 00180 bool CStreamingStringFeatures<T>::get_next_example() 00181 { 00182 bool ret_value; 00183 00184 ret_value = (bool) parser.get_next_example(current_string, 00185 current_length, 00186 current_label); 00187 00188 if (!ret_value) 00189 return false; 00190 00191 int32_t i; 00192 if (remap_to_bin) 00193 { 00194 alpha_ascii->add_string_to_histogram(current_string, current_length); 00195 00196 for (i=0; i<current_length; i++) 00197 current_string[i]=alpha_ascii->remap_to_bin(current_string[i]); 00198 alpha_bin->add_string_to_histogram(current_string, current_length); 00199 } 00200 else 00201 { 00202 alpha_ascii->add_string_to_histogram(current_string, current_length); 00203 } 00204 00205 /* Check the input using src alphabet, alpha_ascii */ 00206 if ( !(alpha_ascii->check_alphabet_size() && alpha_ascii->check_alphabet()) ) 00207 { 00208 SG_ERROR("StreamingStringFeatures: The given input was found to be incompatible with the alphabet!\n") 00209 return 0; 00210 } 00211 00212 //SG_UNREF(alphabet); 00213 00214 if (remap_to_bin) 00215 alphabet=alpha_bin; 00216 else 00217 alphabet=alpha_ascii; 00218 00219 //SG_REF(alphabet); 00220 num_symbols=alphabet->get_num_symbols(); 00221 00222 return ret_value; 00223 } 00224 00225 template <class T> 00226 SGString<T> CStreamingStringFeatures<T>::get_vector() 00227 { 00228 current_sgstring.string=current_string; 00229 current_sgstring.slen=current_length; 00230 00231 return current_sgstring; 00232 } 00233 00234 template <class T> 00235 float64_t CStreamingStringFeatures<T>::get_label() 00236 { 00237 ASSERT(has_labels) 00238 00239 return current_label; 00240 } 00241 00242 template <class T> 00243 void CStreamingStringFeatures<T>::release_example() 00244 { 00245 parser.finalize_example(); 00246 } 00247 00248 template <class T> 00249 int32_t CStreamingStringFeatures<T>::get_vector_length() 00250 { 00251 return current_length; 00252 } 00253 00254 template <class T> 00255 EFeatureClass CStreamingStringFeatures<T>::get_feature_class() const 00256 { 00257 return C_STREAMING_STRING; 00258 } 00259 00260 template class CStreamingStringFeatures<bool>; 00261 template class CStreamingStringFeatures<char>; 00262 template class CStreamingStringFeatures<int8_t>; 00263 template class CStreamingStringFeatures<uint8_t>; 00264 template class CStreamingStringFeatures<int16_t>; 00265 template class CStreamingStringFeatures<uint16_t>; 00266 template class CStreamingStringFeatures<int32_t>; 00267 template class CStreamingStringFeatures<uint32_t>; 00268 template class CStreamingStringFeatures<int64_t>; 00269 template class CStreamingStringFeatures<uint64_t>; 00270 template class CStreamingStringFeatures<float32_t>; 00271 template class CStreamingStringFeatures<float64_t>; 00272 template class CStreamingStringFeatures<floatmax_t>; 00273 00274 }