SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012 Fernando José Iglesias García 00008 * Copyright (C) 2012 Fernando José Iglesias García 00009 */ 00010 00011 #include <shogun/structure/TwoStateModel.h> 00012 #include <shogun/mathematics/Math.h> 00013 #include <shogun/features/MatrixFeatures.h> 00014 #include <shogun/structure/Plif.h> 00015 00016 using namespace shogun; 00017 00018 CTwoStateModel::CTwoStateModel() : CStateModel() 00019 { 00020 // The number of states in this state model is equal to four. 00021 // Although parameters are learnt only for two of them, other 00022 // two states (start and stop) are used 00023 m_num_states = 4; 00024 m_num_transmission_params = 4; 00025 00026 m_state_loss_mat = SGMatrix< float64_t >(m_num_states, m_num_states); 00027 m_state_loss_mat.zero(); 00028 for ( int32_t i = 0 ; i < m_num_states-1 ; ++i ) 00029 { 00030 m_state_loss_mat(m_num_states-1, i) = 1; 00031 m_state_loss_mat(i, m_num_states-1) = 1; 00032 } 00033 00034 // Initialize the start and stop states 00035 m_p = SGVector< float64_t >(m_num_states); 00036 m_q = SGVector< float64_t >(m_num_states); 00037 m_p.set_const(-CMath::INFTY); 00038 m_q.set_const(-CMath::INFTY); 00039 m_p[0] = 0; // start state 00040 m_q[1] = 0; // stop state 00041 } 00042 00043 CTwoStateModel::~CTwoStateModel() 00044 { 00045 } 00046 00047 SGMatrix< float64_t > CTwoStateModel::loss_matrix(CSequence* label_seq) 00048 { 00049 SGVector< int32_t > state_seq = labels_to_states(label_seq); 00050 SGMatrix< float64_t > loss_mat(m_num_states, state_seq.vlen); 00051 00052 for ( int32_t i = 0 ; i < loss_mat.num_cols ; ++i ) 00053 { 00054 for ( int32_t s = 0 ; s < loss_mat.num_rows ; ++s ) 00055 loss_mat(s,i) = m_state_loss_mat(s, state_seq[i]); 00056 } 00057 00058 return loss_mat; 00059 } 00060 00061 float64_t CTwoStateModel::loss(CSequence* label_seq_lhs, CSequence* label_seq_rhs) 00062 { 00063 SGVector< int32_t > state_seq_lhs = labels_to_states(label_seq_lhs); 00064 SGVector< int32_t > state_seq_rhs = labels_to_states(label_seq_rhs); 00065 00066 ASSERT(state_seq_lhs.vlen == state_seq_rhs.vlen) 00067 00068 float64_t ret = 0.0; 00069 for ( int32_t i = 0 ; i < state_seq_lhs.vlen ; ++i ) 00070 ret += m_state_loss_mat(state_seq_lhs[i], state_seq_rhs[i]); 00071 00072 return ret; 00073 } 00074 00075 SGVector< int32_t > CTwoStateModel::labels_to_states(CSequence* label_seq) const 00076 { 00077 // 0 -> start state 00078 // 1 -> stop state 00079 // 2 -> negative state (label == 0) 00080 // 3 -> positive state (label == 1) 00081 00082 SGVector< int32_t > seq_data = label_seq->get_data(); 00083 SGVector< int32_t > state_seq(seq_data.size()); 00084 for ( int32_t i = 1 ; i < state_seq.vlen-1 ; ++i ) 00085 { 00086 //FIXME make independent of values 0-1 in labels 00087 state_seq[i] = seq_data[i] + 2; 00088 } 00089 00090 // The first element is always start state 00091 state_seq[0] = 0; 00092 // The last element is always stop state 00093 state_seq[state_seq.vlen-1] = 1; 00094 00095 return state_seq; 00096 } 00097 00098 CSequence* CTwoStateModel::states_to_labels(SGVector< int32_t > state_seq) const 00099 { 00100 SGVector< int32_t > label_seq(state_seq.vlen); 00101 00102 //FIXME make independent of values 0-1 in labels 00103 // Legend for state indices: 00104 // 0 -> start state => label 0 00105 // 1 -> stop state => label 0 00106 // 2 -> negative state (label == 0) => label 0 00107 // 3 -> positive state (label == 1) => label 1 00108 label_seq.zero(); 00109 for ( int32_t i = 0 ; i < state_seq.vlen ; ++i ) 00110 { 00111 if ( state_seq[i] == 3 ) 00112 label_seq[i] = 1; 00113 } 00114 00115 CSequence* ret = new CSequence(label_seq); 00116 SG_REF(ret); 00117 return ret; 00118 } 00119 00120 void CTwoStateModel::reshape_emission_params(SGVector< float64_t >& emission_weights, 00121 SGVector< float64_t > w, int32_t num_feats, int32_t num_obs) 00122 { 00123 emission_weights.zero(); 00124 00125 // Legend for state indices: 00126 // 0 -> start state 00127 // 1 -> stop state 00128 // 2 -> negative state (label == 0) 00129 // 3 -> positive state (label == 1) 00130 // 00131 // start and stop states have no emission scores 00132 00133 index_t em_idx, w_idx = m_num_transmission_params; 00134 for ( int32_t s = 2 ; s < m_num_states ; ++s ) 00135 { 00136 for ( int32_t f = 0 ; f < num_feats ; ++f ) 00137 { 00138 for ( int32_t o = 0 ; o < num_obs ; ++o ) 00139 { 00140 em_idx = s*num_feats*num_obs + f*num_obs + o; 00141 emission_weights[em_idx] = w[w_idx++]; 00142 } 00143 } 00144 } 00145 } 00146 00147 void CTwoStateModel::reshape_emission_params(CDynamicObjectArray* plif_matrix, 00148 SGVector< float64_t > w, int32_t num_feats, int32_t num_plif_nodes) 00149 { 00150 CPlif* plif; 00151 index_t p_idx, w_idx = m_num_transmission_params; 00152 for ( int32_t s = 2 ; s < m_num_states ; ++s ) 00153 { 00154 for ( int32_t f = 0 ; f < num_feats ; ++f ) 00155 { 00156 SGVector< float64_t > penalties(num_plif_nodes); 00157 p_idx = 0; 00158 00159 for ( int32_t i = 0 ; i < num_plif_nodes ; ++i ) 00160 penalties[p_idx++] = w[w_idx++]; 00161 00162 plif = (CPlif*) plif_matrix->get_element(m_num_states*f + s); 00163 plif->set_plif_penalty(penalties); 00164 SG_UNREF(plif); 00165 } 00166 } 00167 } 00168 00169 void CTwoStateModel::reshape_transmission_params( 00170 SGMatrix< float64_t >& transmission_weights, SGVector< float64_t > w) 00171 { 00172 transmission_weights.set_const(-CMath::INFTY); 00173 00174 // Legend for state indices: 00175 // 0 -> start state 00176 // 1 -> stop state 00177 // 2 -> negative state (label == 0) 00178 // 3 -> positive state (label == 1) 00179 00180 // From start 00181 transmission_weights(0,2) = 0; // to negative 00182 transmission_weights(0,3) = 0; // to positive 00183 // From negative 00184 transmission_weights(2,1) = 0; // to stop 00185 transmission_weights(2,2) = w[0]; // to negative 00186 transmission_weights(2,3) = w[1]; // to positive 00187 // From positive 00188 transmission_weights(3,1) = 0; // to stop 00189 transmission_weights(3,2) = w[3]; // to positive 00190 transmission_weights(3,3) = w[2]; // to negative 00191 } 00192 00193 void CTwoStateModel::weights_to_vector(SGVector< float64_t >& psi, 00194 SGMatrix< float64_t > transmission_weights, 00195 SGVector< float64_t > emission_weights, 00196 int32_t num_feats, int32_t num_obs) const 00197 { 00198 // Legend for state indices: 00199 // 0 -> start state 00200 // 1 -> stop state 00201 // 2 -> negative state 00202 // 3 -> positive state 00203 psi[0] = transmission_weights(2,2); 00204 psi[1] = transmission_weights(2,3); 00205 psi[2] = transmission_weights(3,3); 00206 psi[3] = transmission_weights(3,2); 00207 00208 // start and stop states have no emission scores 00209 index_t obs_idx, psi_idx = m_num_transmission_params; 00210 for ( int32_t s = 2 ; s < m_num_states ; ++s ) 00211 { 00212 for ( int32_t f = 0 ; f < num_feats ; ++f ) 00213 { 00214 for ( int32_t o = 0 ; o < num_obs ; ++o ) 00215 { 00216 obs_idx = s*num_feats*num_obs + f*num_obs + o; 00217 psi[psi_idx++] = emission_weights[obs_idx]; 00218 } 00219 } 00220 } 00221 00222 } 00223 00224 SGVector< float64_t > CTwoStateModel::weights_to_vector(SGMatrix< float64_t > transmission_weights, 00225 SGVector< float64_t > emission_weights, int32_t num_feats, int32_t num_obs) const 00226 { 00227 int32_t num_free_states = 2; 00228 SGVector< float64_t > vec(num_free_states*(num_free_states + num_feats*num_obs)); 00229 vec.zero(); 00230 weights_to_vector(vec, transmission_weights, emission_weights, num_feats, num_obs); 00231 return vec; 00232 } 00233 00234 SGVector< int32_t > CTwoStateModel::get_monotonicity(int32_t num_free_states, 00235 int32_t num_feats) const 00236 { 00237 REQUIRE(num_free_states == 2, "Using the TwoStateModel only two states are free\n") 00238 00239 SGVector< int32_t > monotonicity(num_feats*num_free_states); 00240 00241 for ( int32_t i = 0 ; i < num_feats ; ++i ) 00242 monotonicity[i] = -1; 00243 for ( int32_t i = num_feats ; i < 2*num_feats ; ++i ) 00244 monotonicity[i] = +1; 00245 00246 return monotonicity; 00247 } 00248 00249 CHMSVMModel* CTwoStateModel::simulate_data(int32_t num_exm, int32_t exm_len, 00250 int32_t num_features, int32_t num_noise_features) 00251 { 00252 // Number of different states 00253 int32_t num_states = 2; 00254 // Min and max length of positive block 00255 int32_t block_len[] = {10, 100}; 00256 // Min and max number of positive blocks per example 00257 int32_t num_blocks[] = {0, 3}; 00258 00259 // Proportion of wrong labels 00260 float64_t prop_distort = 0.2; 00261 // Standard deviation of Gaussian noise 00262 float64_t noise_std = 4; 00263 00264 // Generate label sequence randomly containing from num_blocks[0] to 00265 // num_blocks[1] blocks of positive labels each of length between 00266 // block_len[0] and block_len[1] 00267 00268 CSequenceLabels* labels = new CSequenceLabels(num_exm, num_states); 00269 SGVector< int32_t > ll(num_exm*exm_len); 00270 ll.zero(); 00271 int32_t rnb, rl, rp; 00272 00273 for ( int32_t i = 0 ; i < num_exm ; ++i) 00274 { 00275 SGVector< int32_t > lab(exm_len); 00276 lab.zero(); 00277 rnb = num_blocks[0] + CMath::ceil((num_blocks[1]-num_blocks[0])* 00278 CMath::random(0.0, 1.0)) - 1; 00279 00280 for ( int32_t j = 0 ; j < rnb ; ++j ) 00281 { 00282 rl = block_len[0] + CMath::ceil((block_len[1]-block_len[0])* 00283 CMath::random(0.0, 1.0)) - 1; 00284 rp = CMath::ceil((exm_len-rl)*CMath::random(0.0, 1.0)); 00285 00286 for ( int32_t idx = rp-1 ; idx < rp+rl ; ++idx ) 00287 { 00288 lab[idx] = 1; 00289 ll[i*exm_len + idx] = 1; 00290 } 00291 } 00292 00293 labels->add_vector_label(lab); 00294 } 00295 00296 // Generate features by 00297 // i) introducing label noise, i.e. flipping a propotion prop_distort 00298 // of labels and 00299 // ii) adding Gaussian noise to the (distorted) label sequence 00300 00301 SGVector< int32_t > distort(num_exm*exm_len); 00302 SGVector< int32_t > d1(CMath::round(distort.vlen*prop_distort)); 00303 SGVector< int32_t > d2(d1.vlen); 00304 SGVector< int32_t > lf; 00305 SGMatrix< float64_t > signal(num_features, distort.vlen); 00306 00307 for ( int32_t i = 0 ; i < num_features ; ++i ) 00308 { 00309 lf = ll; 00310 distort.randperm(); 00311 00312 for ( int32_t j = 0 ; j < d1.vlen ; ++j ) 00313 d1[j] = distort[j]; 00314 00315 for ( int32_t j = 0 ; j < d2.vlen ; ++j ) 00316 d2[j] = distort[ distort.vlen-d2.vlen+j ]; 00317 00318 for ( int32_t j = 0 ; j < d1.vlen ; ++j ) 00319 lf[ d1[j] ] = lf[ d2[j] ]; 00320 00321 int32_t idx = i*signal.num_cols; 00322 for ( int32_t j = 0 ; j < signal.num_cols ; ++j ) 00323 signal[idx++] = lf[j] + noise_std*CMath::normal_random((float64_t)0.0, 1.0); 00324 } 00325 00326 // Substitute some features by pure noise 00327 SGVector< int32_t > ridx(num_features); 00328 ridx.randperm(); 00329 for ( int32_t i = 0 ; i < num_noise_features ; ++i ) 00330 { 00331 int32_t idx = i*signal.num_cols; 00332 for ( int32_t j = 0 ; j < signal.num_cols ; ++j ) 00333 signal[idx++] = noise_std*CMath::normal_random((float64_t)0.0, 1.0); 00334 } 00335 00336 CMatrixFeatures< float64_t >* features = 00337 new CMatrixFeatures< float64_t >(signal, exm_len, num_exm); 00338 00339 int32_t num_obs = 0; // continuous observations, dummy value 00340 bool use_plifs = true; 00341 return new CHMSVMModel(features, labels, SMT_TWO_STATE, num_obs, use_plifs); 00342 }