SHOGUN v3.2.0
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2011 Sergey Lisitsyn
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/multiclass/GaussianNaiveBayes.h>
#include <shogun/features/Features.h>
#include <shogun/labels/Labels.h>
#include <shogun/labels/RegressionLabels.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>

using namespace shogun;

CGaussianNaiveBayes::CGaussianNaiveBayes() : CNativeMulticlassMachine(), m_features(NULL),
    m_min_label(0), m_num_classes(0), m_dim(0), m_means(), m_variances(),
    m_label_prob(), m_rates()
{
}

CGaussianNaiveBayes::CGaussianNaiveBayes(CFeatures* train_examples,
    CLabels* train_labels) : CNativeMulticlassMachine(), m_features(NULL),
    m_min_label(0), m_num_classes(0), m_dim(0), m_means(),
    m_variances(), m_label_prob(), m_rates()
{
    ASSERT(train_examples->get_num_vectors() == train_labels->get_num_labels())
    set_labels(train_labels);

    if (!train_examples->has_property(FP_DOT))
        SG_ERROR("Specified features are not of type CDotFeatures\n")

    set_features((CDotFeatures*)train_examples);
}

CGaussianNaiveBayes::~CGaussianNaiveBayes()
{
    SG_UNREF(m_features);
}

CFeatures* CGaussianNaiveBayes::get_features()
{
    SG_REF(m_features);
    return m_features;
}

void CGaussianNaiveBayes::set_features(CFeatures* features)
{
    if (!features->has_property(FP_DOT))
        SG_ERROR("Specified features are not of type CDotFeatures\n")

    SG_REF(features);
    SG_UNREF(m_features);
    m_features = (CDotFeatures*)features;
}

bool CGaussianNaiveBayes::train_machine(CFeatures* data)
{
    // init features with data if necessary and ensure the type is correct
    if (data)
    {
        if (!data->has_property(FP_DOT))
            SG_ERROR("Specified features are not of type CDotFeatures\n")
        set_features((CDotFeatures*) data);
    }

    // get int labels to train_labels and check length equality
    ASSERT(m_labels)
    ASSERT(m_labels->get_label_type() == LT_MULTICLASS)
    SGVector<int32_t> train_labels = ((CMulticlassLabels*) m_labels)->get_int_labels();
    ASSERT(m_features->get_num_vectors()==train_labels.vlen)

    // init min_label, max_label and loop variables
    int32_t min_label = train_labels.vector[0];
    int32_t max_label = train_labels.vector[0];
    int i,j;

    // find minimal and maximal label
    for (i=1; i<train_labels.vlen; i++)
    {
        min_label = CMath::min(min_label, train_labels.vector[i]);
        max_label = CMath::max(max_label, train_labels.vector[i]);
    }

    // subtract minimal label from all labels so they index from zero
    for (i=0; i<train_labels.vlen; i++)
        train_labels.vector[i]-= min_label;

    // get number of classes, minimal label and dimensionality
    m_num_classes = max_label-min_label+1;
    m_min_label = min_label;
    m_dim = m_features->get_dim_feature_space();

    // allocate memory for distributions' parameters and a priori probability
    m_means=SGMatrix<float64_t>(m_dim, m_num_classes);
    m_variances=SGMatrix<float64_t>(m_dim, m_num_classes);
    m_label_prob=SGVector<float64_t>(m_num_classes);

    // allocate memory for label rates
    m_rates=SGVector<float64_t>(m_num_classes);

    // zero the arrays before use
    m_means.zero();
    m_variances.zero();
    m_label_prob.zero();
    m_rates.zero();

    // total number of iterations over all cycles
    int32_t max_progress = 2 * train_labels.vlen + 2 * m_num_classes;

    // current progress
    int32_t progress = 0;
    SG_PROGRESS(progress, 0, max_progress)

    // accumulate per-class feature sums and per-class example counts
    for (i=0; i<train_labels.vlen; i++)
    {
        SGVector<float64_t> fea = m_features->get_computed_dot_feature_vector(i);
        for (j=0; j<m_dim; j++)
            m_means(j, train_labels.vector[i]) += fea.vector[j];

        m_label_prob.vector[train_labels.vector[i]]+=1.0;

        progress++;
        SG_PROGRESS(progress, 0, max_progress)
    }

    // turn per-class sums into per-class means
    for (i=0; i<m_num_classes; i++)
    {
        for (j=0; j<m_dim; j++)
            m_means(j, i) /= m_label_prob.vector[i];

        progress++;
        SG_PROGRESS(progress, 0, max_progress)
    }

    // compute squared residuals with means available
    for (i=0; i<train_labels.vlen; i++)
    {
        SGVector<float64_t> fea = m_features->get_computed_dot_feature_vector(i);
        for (j=0; j<m_dim; j++)
        {
            m_variances(j, train_labels.vector[i]) +=
                CMath::sq(fea[j]-m_means(j, train_labels.vector[i]));
        }

        progress++;
        SG_PROGRESS(progress, 0, max_progress)
    }

    // get per-class feature variances (unbiased estimate where possible)
    for (i=0; i<m_num_classes; i++)
    {
        for (j=0; j<m_dim; j++)
            m_variances(j, i) /= m_label_prob.vector[i] > 1 ?
                m_label_prob.vector[i]-1 : 1;

        // turn class counts into a priori probabilities; normalize by the
        // number of training vectors (not the number of classes) so the
        // priors sum to one
        m_label_prob.vector[i]/= train_labels.vlen;

        progress++;
        SG_PROGRESS(progress, 0, max_progress)
    }
    SG_DONE()

    return true;
}

CMulticlassLabels* CGaussianNaiveBayes::apply_multiclass(CFeatures* data)
{
    if (data)
        set_features(data);

    ASSERT(m_features)

    // init number of vectors
    int32_t num_vectors = m_features->get_num_vectors();

    // init result labels
    CMulticlassLabels* result = new CMulticlassLabels(num_vectors);

    // classify each example of data
    SG_PROGRESS(0, 0, num_vectors)
    for (int i = 0; i < num_vectors; i++)
    {
        result->set_label(i, apply_one(i));
        SG_PROGRESS(i + 1, 0, num_vectors)
    }
    SG_DONE()
    return result;
}

float64_t CGaussianNaiveBayes::apply_one(int32_t idx)
{
    // get [idx] feature vector
    SGVector<float64_t> feature_vector = m_features->get_computed_dot_feature_vector(idx);

    // init loop variables
    int i,k;

    // rate all labels
    for (i=0; i<m_num_classes; i++)
    {
        // a class never seen in training can never be predicted; rates live
        // in the log domain, so use -infinity rather than 0.0 here
        if (m_label_prob.vector[i]==0.0)
        {
            m_rates.vector[i] = -CMath::INFTY;
            continue;
        }
        else
            m_rates.vector[i] = CMath::log(m_label_prob.vector[i]);

        // sum the log of all conditional gaussian densities;
        // 0.39894228 is 1/sqrt(2*pi), the gaussian normalization constant
        for (k=0; k<m_dim; k++)
            if (m_variances(k,i)!=0.0)
                m_rates.vector[i]+= CMath::log(0.39894228/CMath::sqrt(m_variances(k, i))) -
                    0.5*CMath::sq(feature_vector.vector[k]-m_means(k, i))/(m_variances(k, i));
    }

    // find label with maximum rate
    int32_t max_label_idx = 0;

    for (i=0; i<m_num_classes; i++)
    {
        if (m_rates.vector[i]>m_rates.vector[max_label_idx])
            max_label_idx = i;
    }

    return max_label_idx+m_min_label;
}
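
For reference, below is a minimal usage sketch of the class implemented above. It is not part of the listed file: CDenseFeatures<float64_t> (a CDotFeatures implementation from shogun/features/DenseFeatures.h), init_shogun_with_defaults() and exit_shogun() come from elsewhere in the Shogun 3.x API, and the toy data is invented for illustration. At prediction time, apply_one() scores each class c as log P(c) plus the sum over dimensions k of the log Gaussian density N(x_k; mean_{k,c}, var_{k,c}), and apply_multiclass() takes the argmax per example.

// Minimal usage sketch (assumptions as noted above; not part of this file).
#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/multiclass/GaussianNaiveBayes.h>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    // 2-dimensional toy data; each column of the matrix is one example
    SGMatrix<float64_t> mat(2, 4);
    mat(0,0)=0.0; mat(1,0)=0.1;   // class 0
    mat(0,1)=0.2; mat(1,1)=0.0;   // class 0
    mat(0,2)=5.0; mat(1,2)=5.1;   // class 1
    mat(0,3)=4.9; mat(1,3)=5.0;   // class 1

    SGVector<float64_t> lab(4);
    lab[0]=0; lab[1]=0; lab[2]=1; lab[3]=1;

    CDenseFeatures<float64_t>* features = new CDenseFeatures<float64_t>(mat);
    CMulticlassLabels* labels = new CMulticlassLabels(lab);

    // estimate per-class means, variances and priors
    CGaussianNaiveBayes* gnb = new CGaussianNaiveBayes(features, labels);
    gnb->train();

    // classify the training data itself
    CMulticlassLabels* out = gnb->apply_multiclass(features);
    for (int32_t i=0; i<out->get_num_labels(); i++)
        SG_SPRINT("example %d -> class %.0f\n", i, out->get_label(i));

    SG_UNREF(out);
    SG_UNREF(gnb);
    exit_shogun();
    return 0;
}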