/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2006 Christian Gehl
 * Written (W) 2006-2009 Soeren Sonnenburg
 * Written (W) 2011 Sergey Lisitsyn
 * Written (W) 2012 Fernando José Iglesias García, cover tree support
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/multiclass/KNN.h>
#include <shogun/labels/Labels.h>
#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>
#include <shogun/lib/JLCoverTree.h>
#include <shogun/lib/Time.h>
#include <shogun/base/Parameter.h>

//#define BENCHMARK_KNN
//#define DEBUG_KNN

using namespace shogun;

CKNN::CKNN()
: CDistanceMachine()
{
	init();
}

CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
	init();

	m_k=k;

	ASSERT(d)
	ASSERT(trainlab)

	set_distance(d);
	set_labels(trainlab);
	m_train_labels.vlen=trainlab->get_num_labels();
}

void CKNN::init()
{
	/* do not store model features by default (CDistanceMachine::apply(...)
	 * is overridden) */
	set_store_model_features(false);

	m_k=3;
	m_q=1.0;
	m_use_covertree=false;
	m_num_classes=0;

	/* use the method classify_for_multiple_k to experiment with different
	 * values of k */
	SG_ADD(&m_k, "m_k", "Parameter k", MS_NOT_AVAILABLE);
	SG_ADD(&m_q, "m_q", "Parameter q", MS_AVAILABLE);
	SG_ADD(&m_use_covertree, "m_use_covertree", "Parameter use_covertree", MS_NOT_AVAILABLE);
	SG_ADD(&m_num_classes, "m_num_classes", "Number of classes", MS_NOT_AVAILABLE);
}

CKNN::~CKNN()
{
}

bool CKNN::train_machine(CFeatures* data)
{
	ASSERT(m_labels)
	ASSERT(distance)

	if (data)
	{
		if (m_labels->get_num_labels() != data->get_num_vectors())
			SG_ERROR("Number of training vectors does not match number of labels\n")
		distance->init(data, data);
	}

	SGVector<int32_t> lab=((CMulticlassLabels*) m_labels)->get_int_labels();
	m_train_labels=lab.clone();
	ASSERT(m_train_labels.vlen>0)

	int32_t max_class=m_train_labels[0];
	int32_t min_class=m_train_labels[0];

	for (int32_t i=1; i<m_train_labels.vlen; i++)
	{
		max_class=CMath::max(max_class, m_train_labels[i]);
		min_class=CMath::min(min_class, m_train_labels[i]);
	}

	//shift the stored labels to the range 0..m_num_classes-1
	for (int32_t i=0; i<m_train_labels.vlen; i++)
		m_train_labels[i]-=min_class;

	m_min_label=min_class;
	m_num_classes=max_class-min_class+1;

	SG_INFO("m_num_classes: %d (%+d to %+d) num_train: %d\n", m_num_classes,
			min_class, max_class, m_train_labels.vlen);

	return true;
}
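/* Example usage (a minimal sketch, not part of this file): train and apply
 * a 3-NN classifier. Assumes CDenseFeatures<float64_t> and
 * CEuclideanDistance from elsewhere in Shogun; train_feats, test_feats and
 * lab_vec are hypothetical, already-initialized objects.
 *
 *   CEuclideanDistance* dist=new CEuclideanDistance(train_feats, train_feats);
 *   CMulticlassLabels* labs=new CMulticlassLabels(lab_vec);
 *   CKNN* knn=new CKNN(3, dist, labs);
 *   knn->train();
 *   CMulticlassLabels* pred=knn->apply_multiclass(test_feats);
 */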
SGMatrix<index_t> CKNN::nearest_neighbors()
{
	//number of examples to which kNN is applied
	int32_t n=distance->get_num_vec_rhs();
	//distances to train data
	float64_t* dists=SG_MALLOC(float64_t, m_train_labels.vlen);
	//indices to train data
	index_t* train_idxs=SG_MALLOC(index_t, m_train_labels.vlen);
	//pre-allocation of the nearest neighbors
	SGMatrix<index_t> NN(m_k, n);

	//for each test example
	for (int32_t i=0; i<n && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, n)

		//lhs idx 0..num train examples-1 (i.e., all train examples) and rhs idx i
		distances_lhs(dists,0,m_train_labels.vlen-1,i);

		//fill in an array with 0..num train examples-1
		for (int32_t j=0; j<m_train_labels.vlen; j++)
			train_idxs[j]=j;

		//sort the distance vector between test example i and all train examples
		CMath::qsort_index(dists, train_idxs, m_train_labels.vlen);

#ifdef DEBUG_KNN
		SG_PRINT("\nQuick sort query %d\n", i)
		for (int32_t j=0; j<m_k; j++)
			SG_PRINT("%d ", train_idxs[j])
		SG_PRINT("\n")
#endif

		//fill in the output with the indices of the k nearest neighbors
		for (int32_t j=0; j<m_k; j++)
			NN(j,i)=train_idxs[j];
	}

	SG_FREE(train_idxs);
	SG_FREE(dists);

	return NN;
}
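/* apply_multiclass() picks one of three strategies: a linear scan without
 * sorting when k==1 (classify_NN), a brute-force path that sorts the full
 * distance vector per query (via nearest_neighbors), and a batch query on
 * JL cover trees when m_use_covertree is set. */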
CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
{
	if (data)
		init_distance(data);

	//redirect to the fast classifier (no sorting) if k==1
	if (m_k == 1)
		return classify_NN();

	ASSERT(m_num_classes>0)
	ASSERT(distance)
	ASSERT(distance->get_num_vec_rhs())

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=distance->get_num_vec_lhs())

	CMulticlassLabels* output=new CMulticlassLabels(num_lab);

	//labels of the k nearest neighbors
	int32_t* train_lab=SG_MALLOC(int32_t, m_k);

	SG_INFO("%d test examples\n", num_lab)
	CSignal::clear_cancel();

	//histogram of classes and returned output
	float64_t* classes=SG_MALLOC(float64_t, m_num_classes);

#ifdef BENCHMARK_KNN
	CTime tstart;
	float64_t tfinish, tparsed, tcreated, tqueried;
#endif

	if (!m_use_covertree)
	{
		//get the k nearest neighbors of each example
		SGMatrix<index_t> NN = nearest_neighbors();

		//from the indices to the nearest neighbors, compute the class labels
		for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
		{
			//write the labels of the k nearest neighbors from their indices
			for (int32_t j=0; j<m_k; j++)
				train_lab[j] = m_train_labels[ NN(j,i) ];

			//get the index of the 'nearest' class
			int32_t out_idx = choose_class(classes, train_lab);
			//write the label of 'nearest' in the output
			output->set_label(i, out_idx + m_min_label);
		}

#ifdef BENCHMARK_KNN
		SG_PRINT(">>>> Quick sort applied in %9.4f\n",
				(tfinish = tstart.cur_time_diff(false)));
#endif
	}
	else // use cover tree
	{
		// m_q != 1.0 not supported with cover tree because the neighbors
		// are not retrieved in increasing order of distance to the query
		float64_t old_q = m_q;
		if ( old_q != 1.0 )
			SG_INFO("q != 1.0 not supported with cover tree, using q = 1\n")
		m_q = 1.0; // enforce q = 1 as announced above; restored below

		// From the sets of features (lhs and rhs) stored in distance,
		// build arrays of cover tree points
		v_array< CJLCoverTreePoint > set_of_points =
			parse_points(distance, FC_LHS);
		v_array< CJLCoverTreePoint > set_of_queries =
			parse_points(distance, FC_RHS);

#ifdef BENCHMARK_KNN
		SG_PRINT(">>>> JL parsed in %9.4f\n",
				( tparsed = tstart.cur_time_diff(false) ) - tfinish);
#endif
		// Build the cover trees, one for the test vectors (rhs features)
		// and another for the training vectors (lhs features). The lhs/rhs
		// swaps are needed because batch_create measures distances through
		// the distance object's current lhs/rhs pair, so both sides must
		// temporarily point at the same feature set while each tree is
		// built; the original pair is restored afterwards.
		CFeatures* r = distance->replace_rhs( distance->get_lhs() );
		node< CJLCoverTreePoint > top = batch_create(set_of_points);
		CFeatures* l = distance->replace_lhs(r);
		distance->replace_rhs(r);
		node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);

#ifdef BENCHMARK_KNN
		SG_PRINT(">>>> Cover trees created in %9.4f\n",
				(tcreated = tstart.cur_time_diff(false)) - tparsed);
#endif

		// Get the k nearest neighbors of all the test vectors (batch method)
		distance->replace_lhs(l);
		v_array< v_array< CJLCoverTreePoint > > res;
		k_nearest_neighbor(top, top_query, res, m_k);

#ifdef BENCHMARK_KNN
		SG_PRINT(">>>> Query finished in %9.4f\n",
				(tqueried = tstart.cur_time_diff(false)) - tcreated);
#endif

#ifdef DEBUG_KNN
		SG_PRINT("\nJL Results:\n")
		for ( int32_t i = 0 ; i < res.index ; ++i )
		{
			for ( int32_t j = 0 ; j < res[i].index ; ++j )
			{
				printf("%d ", res[i][j].m_index);
			}
			printf("\n");
		}
		SG_PRINT("\n")
#endif

		for ( int32_t i = 0 ; i < res.index ; ++i )
		{
			// Translate from indices to labels of the nearest neighbors
			for ( int32_t j = 0; j < m_k; ++j )
				// The first index in res[i] points to the test vector
				train_lab[j] = m_train_labels.vector[ res[i][j+1].m_index ];

			// Get the index of the 'nearest' class
			int32_t out_idx = choose_class(classes, train_lab);
			output->set_label(res[i][0].m_index, out_idx+m_min_label);
		}

		m_q = old_q;

#ifdef BENCHMARK_KNN
		SG_PRINT(">>>> JL applied in %9.4f\n", tstart.cur_time_diff(false))
#endif
	}

	SG_FREE(classes);
	SG_FREE(train_lab);

	return output;
}
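/* Example (a sketch, assuming the set_use_covertree() setter declared in
 * KNN.h and the knn object from the example above): switch to the cover
 * tree strategy. The trees are rebuilt on every apply_multiclass() call;
 * nothing is cached between calls.
 *
 *   knn->set_use_covertree(true);
 *   CMulticlassLabels* pred_ct=knn->apply_multiclass(test_feats);
 */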
CMulticlassLabels* CKNN::classify_NN()
{
	ASSERT(distance)
	ASSERT(m_num_classes>0)

	int32_t num_lab = distance->get_num_vec_rhs();
	ASSERT(num_lab)

	CMulticlassLabels* output = new CMulticlassLabels(num_lab);
	float64_t* distances = SG_MALLOC(float64_t, m_train_labels.vlen);

	SG_INFO("%d test examples\n", num_lab)
	CSignal::clear_cancel();

	// for each test example
	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i,0,num_lab)

		// get distances from the i-th test example to train examples 0..m_train_labels.vlen-1
		distances_lhs(distances,0,m_train_labels.vlen-1,i);
		int32_t j;

		// assume the 0th train example is nearest to the i-th test example
		int32_t out_idx = 0;
		float64_t min_dist = distances[0];

		// search for the nearest neighbor by comparing distances
		for (j=0; j<m_train_labels.vlen; j++)
		{
			if (distances[j]<min_dist)
			{
				min_dist = distances[j];
				out_idx = j;
			}
		}

		// label the i-th test example with the label of its nearest neighbor
		output->set_label(i,m_train_labels.vector[out_idx]+m_min_label);
	}

	SG_FREE(distances);
	return output;
}
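/* classify_for_multiple_k() evaluates all cut-offs k'=1..m_k in a single
 * pass: entry (i,j) of the returned num_lab x m_k matrix is the label
 * predicted for test example i when only the j+1 nearest neighbors vote.
 * Example (a sketch, assuming the trained knn object from above):
 *
 *   SGMatrix<int32_t> multi=knn->classify_for_multiple_k();
 *   // column j holds the predictions for k=j+1
 */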
SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
	ASSERT(m_num_classes>0)
	ASSERT(distance)
	ASSERT(distance->get_num_vec_rhs())

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=num_lab)

	int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

	//working buffer for the labels of the k nearest neighbors
	int32_t* train_lab=SG_MALLOC(int32_t, m_k);

	//histogram of classes and returned output
	int32_t* classes=SG_MALLOC(int32_t, m_num_classes);

	SG_INFO("%d test examples\n", num_lab)
	CSignal::clear_cancel();

	if (!m_use_covertree)
	{
		//get the k nearest neighbors of each example
		SGMatrix<index_t> NN = nearest_neighbors();

		for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
		{
			//write the labels of the k nearest neighbors from their indices
			for (int32_t j=0; j<m_k; j++)
				train_lab[j] = m_train_labels[ NN(j,i) ];

			choose_class_for_multiple_k(output+i, classes, train_lab, num_lab);
		}
	}
	else // use cover tree
	{
		//allocation for distances to nearest neighbors
		float64_t* dists=SG_MALLOC(float64_t, m_k);

		// From the sets of features (lhs and rhs) stored in distance,
		// build arrays of cover tree points
		v_array< CJLCoverTreePoint > set_of_points =
			parse_points(distance, FC_LHS);
		v_array< CJLCoverTreePoint > set_of_queries =
			parse_points(distance, FC_RHS);

		// Build the cover trees, one for the test vectors (rhs features)
		// and another for the training vectors (lhs features)
		CFeatures* r = distance->replace_rhs( distance->get_lhs() );
		node< CJLCoverTreePoint > top = batch_create(set_of_points);
		CFeatures* l = distance->replace_lhs(r);
		distance->replace_rhs(r);
		node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);

		// Get the k nearest neighbors of all the test vectors (batch method)
		distance->replace_lhs(l);
		v_array< v_array< CJLCoverTreePoint > > res;
		k_nearest_neighbor(top, top_query, res, m_k);

		for ( int32_t i = 0 ; i < res.index ; ++i )
		{
			// Handle the fact that the cover tree does not return the
			// neighbors ordered by distance
			for ( int32_t j = 0 ; j < m_k ; ++j )
			{
				// The first index in res[i] points to the test vector
				dists[j] = distance->distance(res[i][j+1].m_index,
						res[i][0].m_index);
				train_lab[j] = m_train_labels.vector[
						res[i][j+1].m_index ];
			}

			// Now get the labels of the neighbors sorted by distance
			CMath::qsort_index(dists, train_lab, m_k);

			choose_class_for_multiple_k(output+res[i][0].m_index, classes,
					train_lab, num_lab);
		}

		SG_FREE(dists);
	}

	SG_FREE(train_lab);
	SG_FREE(classes);

	return SGMatrix<int32_t>(output,num_lab,m_k,true);
}

void CKNN::init_distance(CFeatures* data)
{
	if (!distance)
		SG_ERROR("No distance assigned!\n")
	CFeatures* lhs=distance->get_lhs();
	if (!lhs || !lhs->get_num_vectors())
	{
		SG_UNREF(lhs);
		SG_ERROR("No vectors on left hand side\n")
	}
	distance->init(lhs, data);
	SG_UNREF(lhs);
}

bool CKNN::load(FILE* srcfile)
{
	SG_SET_LOCALE_C;
	SG_RESET_LOCALE;
	return false;
}

bool CKNN::save(FILE* dstfile)
{
	SG_SET_LOCALE_C;
	SG_RESET_LOCALE;
	return false;
}

void CKNN::store_model_features()
{
	CFeatures* d_lhs=distance->get_lhs();
	CFeatures* d_rhs=distance->get_rhs();

	/* copy lhs of underlying distance */
	distance->init(d_lhs->duplicate(), d_rhs);

	SG_UNREF(d_lhs); // because of d_lhs->duplicate()
	SG_UNREF(d_lhs); // because of distance->get_lhs()
	SG_UNREF(d_rhs);
}
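/* choose_class() implements the (optionally weighted) vote: the j-th
 * nearest neighbor (0-indexed) contributes a weight of m_q^(2^j) to its
 * class, since the multiplier is squared on every iteration. With the
 * default m_q=1.0 this reduces to a plain majority vote. For instance,
 * with m_q=0.5 and m_k=3 the three neighbors contribute weights 0.5,
 * 0.25 and 0.0625. */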
int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)
{
	memset(classes, 0, sizeof(float64_t)*m_num_classes);

	float64_t multiplier = m_q;
	for (int32_t j=0; j<m_k; j++)
	{
		classes[train_lab[j]]+= multiplier;
		multiplier*= multiplier;
	}

	//choose the class that got the highest vote
	int32_t out_idx=0;
	float64_t out_max=0;

	for (int32_t j=0; j<m_num_classes; j++)
	{
		if (out_max< classes[j])
		{
			out_idx= j;
			out_max= classes[j];
		}
	}

	return out_idx;
}

void CKNN::choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step)
{
	//compute the histogram of class outputs of the k nearest neighbors
	memset(classes, 0, sizeof(int32_t)*m_num_classes);

	for (int32_t j=0; j<m_k; j++)
	{
		classes[train_lab[j]]++;

		//choose the class with the most votes among the first j+1 neighbors
		int32_t out_idx=0;
		int32_t out_max=0;

		for (int32_t c=0; c<m_num_classes; c++)
		{
			if (out_max< classes[c])
			{
				out_idx= c;
				out_max= classes[c];
			}
		}

		//write out the prediction for k=j+1; output points into a column-major
		//matrix with leading dimension step (=num_lab)
		output[j*step]=out_idx+m_min_label;
	}
}
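/* Example (a sketch, assuming the trained knn object from above and a
 * hypothetical ground-truth label vector true_lab of length num_lab): pick
 * the k with the highest accuracy from one classify_for_multiple_k() pass.
 *
 *   SGMatrix<int32_t> multi=knn->classify_for_multiple_k();
 *   int32_t best_k=1, best_hits=-1;
 *   for (int32_t j=0; j<knn->get_k(); j++)
 *   {
 *       int32_t hits=0;
 *       for (int32_t i=0; i<multi.num_rows; i++)
 *           if (multi(i,j)==true_lab[i])
 *               hits++;
 *       if (hits>best_hits)
 *       {
 *           best_hits=hits;
 *           best_k=j+1;
 *       }
 *   }
 */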