SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TwoStateModel.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012 Fernando José Iglesias García
00008  * Copyright (C) 2012 Fernando José Iglesias García
00009  */
00010 
00011 #include <shogun/structure/TwoStateModel.h>
00012 #include <shogun/mathematics/Math.h>
00013 #include <shogun/features/MatrixFeatures.h>
00014 #include <shogun/structure/Plif.h>
00015 
00016 using namespace shogun;
00017 
00018 CTwoStateModel::CTwoStateModel() : CStateModel()
00019 {
00020     // The number of states in this state model is equal to four.
00021     // Although parameters are learnt only for two of them, other
00022     // two states (start and stop) are used
00023     m_num_states = 4;
00024     m_num_transmission_params = 4;
00025 
00026     m_state_loss_mat = SGMatrix< float64_t >(m_num_states, m_num_states);
00027     m_state_loss_mat.zero();
00028     for ( int32_t i = 0 ; i < m_num_states-1 ; ++i )
00029     {
00030         m_state_loss_mat(m_num_states-1, i) = 1;
00031         m_state_loss_mat(i, m_num_states-1) = 1;
00032     }
00033 
00034     // Initialize the start and stop states
00035     m_p = SGVector< float64_t >(m_num_states);
00036     m_q = SGVector< float64_t >(m_num_states);
00037     m_p.set_const(-CMath::INFTY);
00038     m_q.set_const(-CMath::INFTY);
00039     m_p[0] = 0; // start state
00040     m_q[1] = 0; // stop  state
00041 }
00042 
00043 CTwoStateModel::~CTwoStateModel()
00044 {
00045 }
00046 
00047 SGMatrix< float64_t > CTwoStateModel::loss_matrix(CSequence* label_seq)
00048 {
00049     SGVector< int32_t > state_seq = labels_to_states(label_seq);
00050     SGMatrix< float64_t > loss_mat(m_num_states, state_seq.vlen);
00051 
00052     for ( int32_t i = 0 ; i < loss_mat.num_cols ; ++i )
00053     {
00054         for ( int32_t s = 0 ; s < loss_mat.num_rows ; ++s )
00055             loss_mat(s,i) = m_state_loss_mat(s, state_seq[i]);
00056     }
00057 
00058     return loss_mat;
00059 }
00060 
00061 float64_t CTwoStateModel::loss(CSequence* label_seq_lhs, CSequence* label_seq_rhs)
00062 {
00063     SGVector< int32_t > state_seq_lhs = labels_to_states(label_seq_lhs);
00064     SGVector< int32_t > state_seq_rhs = labels_to_states(label_seq_rhs);
00065 
00066     ASSERT(state_seq_lhs.vlen == state_seq_rhs.vlen)
00067 
00068     float64_t ret = 0.0;
00069     for ( int32_t i = 0 ; i < state_seq_lhs.vlen ; ++i )
00070         ret += m_state_loss_mat(state_seq_lhs[i], state_seq_rhs[i]);
00071 
00072     return ret;
00073 }
00074 
00075 SGVector< int32_t > CTwoStateModel::labels_to_states(CSequence* label_seq) const
00076 {
00077     // 0 -> start state
00078     // 1 -> stop state
00079     // 2 -> negative state (label == 0)
00080     // 3 -> positive state (label == 1)
00081 
00082     SGVector< int32_t > seq_data = label_seq->get_data();
00083     SGVector< int32_t > state_seq(seq_data.size());
00084     for ( int32_t i = 1 ; i < state_seq.vlen-1 ; ++i )
00085     {
00086         //FIXME make independent of values 0-1 in labels
00087         state_seq[i] = seq_data[i] + 2;
00088     }
00089 
00090     // The first element is always start state
00091     state_seq[0] = 0;
00092     // The last element is always stop state
00093     state_seq[state_seq.vlen-1] = 1;
00094 
00095     return state_seq;
00096 }
00097 
00098 CSequence* CTwoStateModel::states_to_labels(SGVector< int32_t > state_seq) const
00099 {
00100     SGVector< int32_t > label_seq(state_seq.vlen);
00101 
00102     //FIXME make independent of values 0-1 in labels
00103     // Legend for state indices:
00104     // 0 -> start state => label 0
00105     // 1 -> stop state => label 0
00106     // 2 -> negative state (label == 0) => label 0
00107     // 3 -> positive state (label == 1) => label 1
00108     label_seq.zero();
00109     for ( int32_t i = 0 ; i < state_seq.vlen ; ++i )
00110     {
00111         if ( state_seq[i] == 3 )
00112             label_seq[i] = 1;
00113     }
00114 
00115     CSequence* ret = new CSequence(label_seq);
00116     SG_REF(ret);
00117     return ret;
00118 }
00119 
00120 void CTwoStateModel::reshape_emission_params(SGVector< float64_t >& emission_weights,
00121         SGVector< float64_t > w, int32_t num_feats, int32_t num_obs)
00122 {
00123     emission_weights.zero();
00124 
00125     // Legend for state indices:
00126     // 0 -> start state
00127     // 1 -> stop state
00128     // 2 -> negative state (label == 0)
00129     // 3 -> positive state (label == 1)
00130     //
00131     // start and stop states have no emission scores
00132 
00133     index_t em_idx, w_idx = m_num_transmission_params;
00134     for ( int32_t s = 2 ; s < m_num_states ; ++s )
00135     {
00136         for ( int32_t f = 0 ; f < num_feats ; ++f )
00137         {
00138             for ( int32_t o = 0 ; o < num_obs ; ++o )
00139             {
00140                 em_idx = s*num_feats*num_obs + f*num_obs + o;
00141                 emission_weights[em_idx] = w[w_idx++];
00142             }
00143         }
00144     }
00145 }
00146 
00147 void CTwoStateModel::reshape_emission_params(CDynamicObjectArray* plif_matrix,
00148         SGVector< float64_t > w, int32_t num_feats, int32_t num_plif_nodes)
00149 {
00150     CPlif* plif;
00151     index_t p_idx, w_idx = m_num_transmission_params;
00152     for ( int32_t s = 2 ; s < m_num_states ; ++s )
00153     {
00154         for ( int32_t f = 0 ; f < num_feats ; ++f )
00155         {
00156             SGVector< float64_t > penalties(num_plif_nodes);
00157             p_idx = 0;
00158 
00159             for ( int32_t i = 0 ; i < num_plif_nodes ; ++i )
00160                 penalties[p_idx++] = w[w_idx++];
00161 
00162             plif = (CPlif*) plif_matrix->get_element(m_num_states*f + s);
00163             plif->set_plif_penalty(penalties);
00164             SG_UNREF(plif);
00165         }
00166     }
00167 }
00168 
00169 void CTwoStateModel::reshape_transmission_params(
00170         SGMatrix< float64_t >& transmission_weights, SGVector< float64_t > w)
00171 {
00172     transmission_weights.set_const(-CMath::INFTY);
00173 
00174     // Legend for state indices:
00175     // 0 -> start state
00176     // 1 -> stop state
00177     // 2 -> negative state (label == 0)
00178     // 3 -> positive state (label == 1)
00179 
00180     // From start
00181     transmission_weights(0,2) = 0;    // to negative
00182     transmission_weights(0,3) = 0;    // to positive
00183     // From negative
00184     transmission_weights(2,1) = 0;    // to stop
00185     transmission_weights(2,2) = w[0]; // to negative
00186     transmission_weights(2,3) = w[1]; // to positive
00187     // From positive
00188     transmission_weights(3,1) = 0;    // to stop
00189     transmission_weights(3,2) = w[3]; // to positive
00190     transmission_weights(3,3) = w[2]; // to negative
00191 }
00192 
00193 void CTwoStateModel::weights_to_vector(SGVector< float64_t >& psi,
00194         SGMatrix< float64_t > transmission_weights,
00195         SGVector< float64_t > emission_weights,
00196         int32_t num_feats, int32_t num_obs) const
00197 {
00198     // Legend for state indices:
00199     // 0 -> start state
00200     // 1 -> stop state
00201     // 2 -> negative state
00202     // 3 -> positive state
00203     psi[0] = transmission_weights(2,2);
00204     psi[1] = transmission_weights(2,3);
00205     psi[2] = transmission_weights(3,3);
00206     psi[3] = transmission_weights(3,2);
00207 
00208     // start and stop states have no emission scores
00209     index_t obs_idx, psi_idx = m_num_transmission_params;
00210     for ( int32_t s = 2 ; s < m_num_states ; ++s )
00211     {
00212         for ( int32_t f = 0 ; f < num_feats ; ++f )
00213         {
00214             for ( int32_t o = 0 ; o < num_obs ; ++o )
00215             {
00216                 obs_idx = s*num_feats*num_obs + f*num_obs + o;
00217                 psi[psi_idx++] = emission_weights[obs_idx];
00218             }
00219         }
00220     }
00221 
00222 }
00223 
00224 SGVector< float64_t > CTwoStateModel::weights_to_vector(SGMatrix< float64_t > transmission_weights,
00225         SGVector< float64_t > emission_weights, int32_t num_feats, int32_t num_obs) const
00226 {
00227     int32_t num_free_states = 2;
00228     SGVector< float64_t > vec(num_free_states*(num_free_states + num_feats*num_obs));
00229     vec.zero();
00230     weights_to_vector(vec, transmission_weights, emission_weights, num_feats, num_obs);
00231     return vec;
00232 }
00233 
00234 SGVector< int32_t > CTwoStateModel::get_monotonicity(int32_t num_free_states,
00235         int32_t num_feats) const
00236 {
00237     REQUIRE(num_free_states == 2, "Using the TwoStateModel only two states are free\n")
00238 
00239     SGVector< int32_t > monotonicity(num_feats*num_free_states);
00240 
00241     for ( int32_t i = 0 ; i < num_feats ; ++i )
00242         monotonicity[i] = -1;
00243     for ( int32_t i = num_feats ; i < 2*num_feats ; ++i )
00244         monotonicity[i] = +1;
00245 
00246     return monotonicity;
00247 }
00248 
00249 CHMSVMModel* CTwoStateModel::simulate_data(int32_t num_exm, int32_t exm_len,
00250     int32_t num_features, int32_t num_noise_features)
00251 {
00252     // Number of different states
00253     int32_t num_states = 2;
00254     // Min and max length of positive block
00255     int32_t block_len[] = {10, 100};
00256     // Min and max number of positive blocks per example
00257     int32_t num_blocks[] = {0, 3};
00258 
00259     // Proportion of wrong labels
00260     float64_t prop_distort = 0.2;
00261     // Standard deviation of Gaussian noise
00262     float64_t noise_std = 4;
00263 
00264     // Generate label sequence randomly containing from num_blocks[0] to
00265     // num_blocks[1] blocks of positive labels each of length between
00266     // block_len[0] and block_len[1]
00267 
00268     CSequenceLabels* labels = new CSequenceLabels(num_exm, num_states);
00269     SGVector< int32_t > ll(num_exm*exm_len);
00270     ll.zero();
00271     int32_t rnb, rl, rp;
00272 
00273     for ( int32_t i = 0 ; i < num_exm ; ++i)
00274     {
00275         SGVector< int32_t > lab(exm_len);
00276         lab.zero();
00277         rnb = num_blocks[0] + CMath::ceil((num_blocks[1]-num_blocks[0])*
00278             CMath::random(0.0, 1.0)) - 1;
00279 
00280         for ( int32_t j = 0 ; j < rnb ; ++j )
00281         {
00282             rl = block_len[0] + CMath::ceil((block_len[1]-block_len[0])*
00283                 CMath::random(0.0, 1.0)) - 1;
00284             rp = CMath::ceil((exm_len-rl)*CMath::random(0.0, 1.0));
00285 
00286             for ( int32_t idx = rp-1 ; idx < rp+rl ; ++idx )
00287             {
00288                 lab[idx] = 1;
00289                 ll[i*exm_len + idx] = 1;
00290             }
00291         }
00292 
00293         labels->add_vector_label(lab);
00294     }
00295 
00296     // Generate features by
00297     // i) introducing label noise, i.e. flipping a propotion prop_distort
00298     // of labels and
00299     // ii) adding Gaussian noise to the (distorted) label sequence
00300 
00301     SGVector< int32_t >   distort(num_exm*exm_len);
00302     SGVector< int32_t >   d1(CMath::round(distort.vlen*prop_distort));
00303     SGVector< int32_t >   d2(d1.vlen);
00304     SGVector< int32_t >   lf;
00305     SGMatrix< float64_t > signal(num_features, distort.vlen);
00306 
00307     for ( int32_t i = 0 ; i < num_features ; ++i )
00308     {
00309         lf = ll;
00310         distort.randperm();
00311 
00312         for ( int32_t j = 0 ; j < d1.vlen ; ++j )
00313             d1[j] = distort[j];
00314 
00315         for ( int32_t j = 0 ; j < d2.vlen ; ++j )
00316             d2[j] = distort[ distort.vlen-d2.vlen+j ];
00317 
00318         for ( int32_t j = 0 ; j < d1.vlen ; ++j )
00319             lf[ d1[j] ] = lf[ d2[j] ];
00320 
00321         int32_t idx = i*signal.num_cols;
00322         for ( int32_t j = 0 ; j < signal.num_cols ; ++j )
00323             signal[idx++] = lf[j] + noise_std*CMath::normal_random((float64_t)0.0, 1.0);
00324     }
00325 
00326     // Substitute some features by pure noise
00327     SGVector< int32_t > ridx(num_features);
00328     ridx.randperm();
00329     for ( int32_t i = 0 ; i < num_noise_features ; ++i )
00330     {
00331         int32_t idx = i*signal.num_cols;
00332         for ( int32_t j = 0 ; j < signal.num_cols ; ++j )
00333             signal[idx++] = noise_std*CMath::normal_random((float64_t)0.0, 1.0);
00334     }
00335 
00336     CMatrixFeatures< float64_t >* features =
00337         new CMatrixFeatures< float64_t >(signal, exm_len, num_exm);
00338 
00339     int32_t num_obs = 0; // continuous observations, dummy value
00340     bool use_plifs = true;
00341     return new CHMSVMModel(features, labels, SMT_TWO_STATE, num_obs, use_plifs);
00342 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation