SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/lib/common.h> 00013 #include <shogun/io/SGIO.h> 00014 00015 #include <shogun/base/Parameter.h> 00016 00017 #include <shogun/distributions/LinearHMM.h> 00018 #include <shogun/features/StringFeatures.h> 00019 00020 using namespace shogun; 00021 00022 CLinearHMM::CLinearHMM() : CDistribution() 00023 { 00024 init(); 00025 } 00026 00027 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f) 00028 : CDistribution() 00029 { 00030 init(); 00031 00032 set_features(f); 00033 sequence_length = f->get_vector_length(0); 00034 num_symbols = (int32_t) f->get_num_symbols(); 00035 num_params = sequence_length*num_symbols; 00036 } 00037 00038 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols) 00039 : CDistribution() 00040 { 00041 init(); 00042 00043 sequence_length = p_num_features; 00044 num_symbols = p_num_symbols; 00045 num_params = sequence_length*num_symbols; 00046 } 00047 00048 CLinearHMM::~CLinearHMM() 00049 { 00050 SG_FREE(transition_probs); 00051 SG_FREE(log_transition_probs); 00052 } 00053 00054 bool CLinearHMM::train(CFeatures* data) 00055 { 00056 if (data) 00057 { 00058 if (data->get_feature_class() != C_STRING || 00059 data->get_feature_type() != F_WORD) 00060 { 00061 SG_ERROR("Expected features of class string type word!\n") 00062 } 00063 set_features(data); 00064 } 00065 SG_FREE(transition_probs); 00066 SG_FREE(log_transition_probs); 00067 int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params); 00068 00069 int32_t vec; 00070 int32_t i; 00071 00072 for (i=0; i< num_params; i++) 00073 int_transition_probs[i]=0; 00074 00075 for (vec=0; vec<features->get_num_vectors(); vec++) 00076 { 00077 int32_t len; 00078 bool free_vec; 00079 00080 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00081 get_feature_vector(vec, len, free_vec); 00082 00083 //just count the symbols per position -> transition_probsogram 00084 for (int32_t feat=0; feat<len ; feat++) 00085 int_transition_probs[feat*num_symbols+vector[feat]]++; 00086 00087 ((CStringFeatures<uint16_t>*) features)-> 00088 free_feature_vector(vector, vec, free_vec); 00089 } 00090 00091 //trade memory for speed 00092 transition_probs=SG_MALLOC(float64_t, num_params); 00093 log_transition_probs=SG_MALLOC(float64_t, num_params); 00094 00095 for (i=0;i<sequence_length;i++) 00096 { 00097 for (int32_t j=0; j<num_symbols; j++) 00098 { 00099 float64_t sum=0; 00100 int32_t offs=i*num_symbols+ 00101 ((CStringFeatures<uint16_t> *) features)-> 00102 get_masked_symbols((uint16_t)j,(uint8_t) 254); 00103 int32_t original_num_symbols=(int32_t) 00104 ((CStringFeatures<uint16_t> *) features)-> 00105 get_original_num_symbols(); 00106 00107 for (int32_t k=0; k<original_num_symbols; k++) 00108 sum+=int_transition_probs[offs+k]; 00109 00110 transition_probs[i*num_symbols+j]= 00111 (int_transition_probs[i*num_symbols+j]+pseudo_count)/ 00112 (sum+((CStringFeatures<uint16_t> *) features)-> 00113 get_original_num_symbols()*pseudo_count); 00114 log_transition_probs[i*num_symbols+j]= 00115 log(transition_probs[i*num_symbols+j]); 00116 } 00117 } 00118 00119 SG_FREE(int_transition_probs); 00120 return true; 00121 } 00122 00123 bool CLinearHMM::train( 00124 const int32_t* indizes, int32_t num_indizes, float64_t pseudo) 00125 { 00126 SG_FREE(transition_probs); 00127 SG_FREE(log_transition_probs); 00128 int32_t* int_transition_probs=SG_MALLOC(int32_t, num_params); 00129 int32_t vec; 00130 int32_t i; 00131 00132 for (i=0; i< num_params; i++) 00133 int_transition_probs[i]=0; 00134 00135 for (vec=0; vec<num_indizes; vec++) 00136 { 00137 int32_t len; 00138 bool free_vec; 00139 00140 ASSERT(indizes[vec]>=0 && 00141 indizes[vec]<((CStringFeatures<uint16_t>*) features)-> 00142 get_num_vectors()); 00143 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00144 get_feature_vector(indizes[vec], len, free_vec); 00145 ((CStringFeatures<uint16_t>*) features)-> 00146 free_feature_vector(vector, indizes[vec], free_vec); 00147 00148 //just count the symbols per position -> transition_probsogram 00149 // 00150 for (int32_t feat=0; feat<len ; feat++) 00151 int_transition_probs[feat*num_symbols+vector[feat]]++; 00152 } 00153 00154 //trade memory for speed 00155 transition_probs=SG_MALLOC(float64_t, num_params); 00156 log_transition_probs=SG_MALLOC(float64_t, num_params); 00157 00158 for (i=0;i<sequence_length;i++) 00159 { 00160 for (int32_t j=0; j<num_symbols; j++) 00161 { 00162 float64_t sum=0; 00163 int32_t original_num_symbols=(int32_t) 00164 ((CStringFeatures<uint16_t> *) features)-> 00165 get_original_num_symbols(); 00166 for (int32_t k=0; k<original_num_symbols; k++) 00167 { 00168 sum+=int_transition_probs[i*num_symbols+ 00169 ((CStringFeatures<uint16_t>*) features)-> 00170 get_masked_symbols((uint16_t)j,(uint8_t) 254)+k]; 00171 } 00172 00173 transition_probs[i*num_symbols+j]= 00174 (int_transition_probs[i*num_symbols+j]+pseudo)/ 00175 (sum+((CStringFeatures<uint16_t>*) features)-> 00176 get_original_num_symbols()*pseudo); 00177 log_transition_probs[i*num_symbols+j]= 00178 log(transition_probs[i*num_symbols+j]); 00179 } 00180 } 00181 00182 SG_FREE(int_transition_probs); 00183 return true; 00184 } 00185 00186 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len) 00187 { 00188 float64_t result=log_transition_probs[vector[0]]; 00189 00190 for (int32_t i=1; i<len; i++) 00191 result+=log_transition_probs[i*num_symbols+vector[i]]; 00192 00193 return result; 00194 } 00195 00196 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example) 00197 { 00198 int32_t len; 00199 bool free_vec; 00200 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00201 get_feature_vector(num_example, len, free_vec); 00202 float64_t result=get_log_likelihood_example(vector, len); 00203 00204 ((CStringFeatures<uint16_t>*) features)-> 00205 free_feature_vector(vector, num_example, free_vec); 00206 00207 return result; 00208 } 00209 00210 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len) 00211 { 00212 float64_t result=transition_probs[vector[0]]; 00213 00214 for (int32_t i=1; i<len; i++) 00215 result*=transition_probs[i*num_symbols+vector[i]]; 00216 00217 return result; 00218 } 00219 00220 float64_t CLinearHMM::get_likelihood_example(int32_t num_example) 00221 { 00222 int32_t len; 00223 bool free_vec; 00224 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00225 get_feature_vector(num_example, len, free_vec); 00226 00227 float64_t result=get_likelihood_example(vector, len); 00228 00229 ((CStringFeatures<uint16_t>*) features)-> 00230 free_feature_vector(vector, num_example, free_vec); 00231 00232 return result; 00233 } 00234 00235 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example) 00236 { 00237 int32_t len; 00238 bool free_vec; 00239 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00240 get_feature_vector(num_example, len, free_vec); 00241 float64_t result=0; 00242 int32_t position=num_param/num_symbols; 00243 ASSERT(position>=0 && position<len) 00244 uint16_t sym=(uint16_t) (num_param-position*num_symbols); 00245 00246 if (vector[position]==sym && transition_probs[num_param]!=0) 00247 result=1.0/transition_probs[num_param]; 00248 ((CStringFeatures<uint16_t>*) features)-> 00249 free_feature_vector(vector, num_example, free_vec); 00250 00251 return result; 00252 } 00253 00254 SGVector<float64_t> CLinearHMM::get_transition_probs() 00255 { 00256 return SGVector<float64_t>(transition_probs, num_params, false); 00257 } 00258 00259 bool CLinearHMM::set_transition_probs(const SGVector<float64_t> probs) 00260 { 00261 ASSERT(probs.vlen == num_params) 00262 00263 if (!log_transition_probs) 00264 log_transition_probs=SG_MALLOC(float64_t, num_params); 00265 00266 if (!transition_probs) 00267 transition_probs=SG_MALLOC(float64_t, num_params); 00268 00269 for (int32_t i=0; i<num_params; i++) 00270 { 00271 transition_probs[i]=probs.vector[i]; 00272 log_transition_probs[i]=log(transition_probs[i]); 00273 } 00274 00275 return true; 00276 } 00277 00278 SGVector<float64_t> CLinearHMM::get_log_transition_probs() 00279 { 00280 return SGVector<float64_t>(log_transition_probs, num_params, false); 00281 } 00282 00283 bool CLinearHMM::set_log_transition_probs(const SGVector<float64_t> probs) 00284 { 00285 ASSERT(probs.vlen == num_params) 00286 00287 if (!log_transition_probs) 00288 log_transition_probs=SG_MALLOC(float64_t, num_params); 00289 00290 if (!transition_probs) 00291 transition_probs=SG_MALLOC(float64_t, num_params); 00292 00293 for (int32_t i=0; i<num_params; i++) 00294 { 00295 log_transition_probs[i]=probs.vector[i]; 00296 transition_probs[i]=exp(log_transition_probs[i]); 00297 } 00298 00299 return true; 00300 } 00301 00302 void CLinearHMM::load_serializable_post() throw (ShogunException) 00303 { 00304 CSGObject::load_serializable_post(); 00305 00306 num_params = sequence_length*num_symbols; 00307 } 00308 00309 void CLinearHMM::init() 00310 { 00311 sequence_length = 0; 00312 num_symbols = 0; 00313 num_params = 0; 00314 transition_probs = NULL; 00315 log_transition_probs = NULL; 00316 00317 m_parameters->add_matrix(&transition_probs, &num_symbols, &sequence_length, 00318 "transition_probs", "Transition probabilities."); 00319 m_parameters->add_matrix(&log_transition_probs, &num_symbols, &sequence_length, 00320 "log_transition_probs", "Transition probabilities (logspace)."); 00321 }