SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
KNN.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006 Christian Gehl
00008  * Written (W) 2006-2009 Soeren Sonnenburg
00009  * Written (W) 2011 Sergey Lisitsyn
00010  * Written (W) 2012 Fernando José Iglesias García, cover tree support
00011  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
00012  */
00013 
00014 #include <shogun/multiclass/KNN.h>
00015 #include <shogun/labels/Labels.h>
00016 #include <shogun/labels/MulticlassLabels.h>
00017 #include <shogun/mathematics/Math.h>
00018 #include <shogun/lib/Signal.h>
00019 #include <shogun/lib/JLCoverTree.h>
00020 #include <shogun/lib/Time.h>
00021 #include <shogun/base/Parameter.h>
00022 
00023 //#define BENCHMARK_KNN
00024 //#define DEBUG_KNN
00025 
00026 using namespace shogun;
00027 
/** Default constructor: registers parameters and sets defaults (k=3, q=1.0,
 * no cover tree). Distance and labels must be set before training. */
CKNN::CKNN()
: CDistanceMachine()
{
    init();
}
00033 
/** Constructor.
 *
 * @param k number of nearest neighbors to vote over (must not be NULL-checked
 *          here; validity w.r.t. the number of train vectors is asserted at
 *          classification time)
 * @param d distance measure between feature vectors (must not be NULL)
 * @param trainlab labels of the training examples (must not be NULL)
 */
CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
    init();

    m_k=k;

    ASSERT(d)
    ASSERT(trainlab)

    set_distance(d);
    set_labels(trainlab);
    // NOTE(review): only the length is recorded here; the label data itself is
    // cloned into m_train_labels in train_machine() — presumably so that the
    // object reports a consistent size before training. TODO confirm.
    m_train_labels.vlen=trainlab->get_num_labels();
}
00048 
/** Shared constructor body: sets member defaults and registers the model's
 * parameters with the parameter framework (used for serialization and model
 * selection). */
void CKNN::init()
{
    /* do not store model features by default (CDistanceMachine::apply(...) is
     * overwritten */
    set_store_model_features(false);

    // defaults: 3 neighbors, uniform rank weighting (q=1), brute-force search
    m_k=3;
    m_q=1.0;
    m_use_covertree=false;
    m_num_classes=0;

    /* use the method classify_multiply_k to experiment with different values
     * of k */
    // m_q is the only parameter exposed to model selection (MS_AVAILABLE)
    SG_ADD(&m_k, "m_k", "Parameter k", MS_NOT_AVAILABLE);
    SG_ADD(&m_q, "m_q", "Parameter q", MS_AVAILABLE);
    SG_ADD(&m_use_covertree, "m_use_covertree", "Parameter use_covertree", MS_NOT_AVAILABLE);
    SG_ADD(&m_num_classes, "m_num_classes", "Number of classes", MS_NOT_AVAILABLE);
}
00067 
/** Destructor: nothing to release — all owned members are SGVectors or are
 * managed by the CDistanceMachine base class. */
CKNN::~CKNN()
{
}
00071 
00072 bool CKNN::train_machine(CFeatures* data)
00073 {
00074     ASSERT(m_labels)
00075     ASSERT(distance)
00076 
00077     if (data)
00078     {
00079         if (m_labels->get_num_labels() != data->get_num_vectors())
00080             SG_ERROR("Number of training vectors does not match number of labels\n")
00081         distance->init(data, data);
00082     }
00083 
00084     SGVector<int32_t> lab=((CMulticlassLabels*) m_labels)->get_int_labels();
00085     m_train_labels=lab.clone();
00086     ASSERT(m_train_labels.vlen>0)
00087 
00088     int32_t max_class=m_train_labels[0];
00089     int32_t min_class=m_train_labels[0];
00090 
00091     for (int32_t i=1; i<m_train_labels.vlen; i++)
00092     {
00093         max_class=CMath::max(max_class, m_train_labels[i]);
00094         min_class=CMath::min(min_class, m_train_labels[i]);
00095     }
00096 
00097     for (int32_t i=0; i<m_train_labels.vlen; i++)
00098         m_train_labels[i]-=min_class;
00099 
00100     m_min_label=min_class;
00101     m_num_classes=max_class-min_class+1;
00102 
00103     SG_INFO("m_num_classes: %d (%+d to %+d) num_train: %d\n", m_num_classes,
00104             min_class, max_class, m_train_labels.vlen);
00105 
00106     return true;
00107 }
00108 
00109 SGMatrix<index_t> CKNN::nearest_neighbors()
00110 {
00111     //number of examples to which kNN is applied
00112     int32_t n=distance->get_num_vec_rhs();
00113     //distances to train data
00114     float64_t* dists=SG_MALLOC(float64_t, m_train_labels.vlen);
00115     //indices to train data
00116     index_t* train_idxs=SG_MALLOC(index_t, m_train_labels.vlen);
00117     //pre-allocation of the nearest neighbors
00118     SGMatrix<index_t> NN(m_k, n);
00119 
00120     //for each test example
00121     for (int32_t i=0; i<n && (!CSignal::cancel_computations()); i++)
00122     {
00123         SG_PROGRESS(i, 0, n)
00124 
00125         //lhs idx 0..num train examples-1 (i.e., all train examples) and rhs idx i
00126         distances_lhs(dists,0,m_train_labels.vlen-1,i);
00127 
00128         //fill in an array with 0..num train examples-1
00129         for (int32_t j=0; j<m_train_labels.vlen; j++)
00130             train_idxs[j]=j;
00131 
00132         //sort the distance vector between test example i and all train examples
00133         CMath::qsort_index(dists, train_idxs, m_train_labels.vlen);
00134 
00135 #ifdef DEBUG_KNN
00136         SG_PRINT("\nQuick sort query %d\n", i)
00137         for (int32_t j=0; j<m_k; j++)
00138             SG_PRINT("%d ", train_idxs[j])
00139         SG_PRINT("\n")
00140 #endif
00141 
00142         //fill in the output the indices of the nearest neighbors
00143         for (int32_t j=0; j<m_k; j++)
00144             NN(j,i) = train_idxs[j];
00145     }
00146 
00147     SG_FREE(train_idxs);
00148     SG_FREE(dists);
00149 
00150     return NN;
00151 }
00152 
/** Classify test examples with k-nearest-neighbor majority voting.
 *
 * Two back ends are available: brute-force search via nearest_neighbors()
 * (default) and a JL cover tree (m_use_covertree). With the cover tree the
 * rank weighting q is forced to 1 because neighbors are not returned sorted
 * by distance.
 *
 * @param data optional test features; when given, they become the rhs of the
 *             distance via init_distance()
 * @return newly allocated multiclass labels, one prediction per test example
 *         (caller takes ownership)
 */
CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data)
{
    if (data)
        init_distance(data);

    //redirecting to fast (without sorting) classify if k==1
    if (m_k == 1)
        return classify_NN();

    ASSERT(m_num_classes>0)
    ASSERT(distance)
    ASSERT(distance->get_num_vec_rhs())

    int32_t num_lab=distance->get_num_vec_rhs();
    // k neighbors are drawn from the train (lhs) vectors
    ASSERT(m_k<=distance->get_num_vec_lhs())

    CMulticlassLabels* output=new CMulticlassLabels(num_lab);

    //labels of the k nearest neighbors
    int32_t* train_lab=SG_MALLOC(int32_t, m_k);

    SG_INFO("%d test examples\n", num_lab)
    CSignal::clear_cancel();

    //histogram of classes and returned output
    float64_t* classes=SG_MALLOC(float64_t, m_num_classes);

#ifdef BENCHMARK_KNN
    CTime tstart;
    float64_t tfinish, tparsed, tcreated, tqueried;
#endif

    if ( ! m_use_covertree )
    {
        //get the k nearest neighbors of each example
        SGMatrix<index_t> NN = nearest_neighbors();

        //from the indices to the nearest neighbors, compute the class labels
        for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
        {
            //write the labels of the k nearest neighbors from theirs indices
            for (int32_t j=0; j<m_k; j++)
                train_lab[j] = m_train_labels[ NN(j,i) ];

            //get the index of the 'nearest' class
            int32_t out_idx = choose_class(classes, train_lab);
            //write the label of 'nearest' in the output
            // (+ m_min_label undoes the zero-based shift from train_machine)
            output->set_label(i, out_idx + m_min_label);
        }

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Quick sort applied in %9.4f\n",
                (tfinish = tstart.cur_time_diff(false)));
#endif
    }
    else    // Use cover tree
    {
        // m_q != 1.0 not supported with cover tree because the neighbors
        // are not retrieved in increasing order of distance to the query
        float64_t old_q = m_q;
        if ( old_q != 1.0 )
            SG_INFO("q != 1.0 not supported with cover tree, using q = 1\n")

        // From the sets of features (lhs and rhs) stored in distance,
        // build arrays of cover tree points
        v_array< CJLCoverTreePoint > set_of_points  =
            parse_points(distance, FC_LHS);
        v_array< CJLCoverTreePoint > set_of_queries =
            parse_points(distance, FC_RHS);

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> JL parsed in %9.4f\n",
            ( tparsed = tstart.cur_time_diff(false) ) - tfinish);
#endif
        // Build the cover trees, one for the test vectors (rhs features)
        // and another for the training vectors (lhs features)
        // NOTE(review): the lhs/rhs swap below appears to satisfy the cover
        // tree's expectation of which side of the distance it queries while
        // each tree is built; the original configuration is restored via
        // replace_lhs(l) further down — verify against CDistance semantics.
        CFeatures* r = distance->replace_rhs( distance->get_lhs() );
        node< CJLCoverTreePoint > top = batch_create(set_of_points);
        CFeatures* l = distance->replace_lhs(r);
        distance->replace_rhs(r);
        node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Cover trees created in %9.4f\n",
                (tcreated = tstart.cur_time_diff(false)) - tparsed);
#endif

        // Get the k nearest neighbors to all the test vectors (batch method)
        distance->replace_lhs(l);
        v_array< v_array< CJLCoverTreePoint > > res;
        k_nearest_neighbor(top, top_query, res, m_k);

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> Query finished in %9.4f\n",
                (tqueried = tstart.cur_time_diff(false)) - tcreated);
#endif

#ifdef DEBUG_KNN
        SG_PRINT("\nJL Results:\n")
        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            for ( int32_t j = 0 ; j < res[i].index ; ++j )
            {
                printf("%d ", res[i][j].m_index);
            }
            printf("\n");
        }
        SG_PRINT("\n")
#endif

        for ( int32_t i = 0 ; i < res.index ; ++i )
        {
            // Translate from indices to labels of the nearest neighbors
            for ( int32_t j = 0; j < m_k; ++j )
                // The first index in res[i] points to the test vector
                train_lab[j] = m_train_labels.vector[ res[i][j+1].m_index ];

            // Get the index of the 'nearest' class
            int32_t out_idx = choose_class(classes, train_lab);
            output->set_label(res[i][0].m_index, out_idx+m_min_label);
        }

        m_q = old_q;

#ifdef BENCHMARK_KNN
        SG_PRINT(">>>> JL applied in %9.4f\n", tstart.cur_time_diff(false))
#endif
    }

    SG_FREE(classes);
    SG_FREE(train_lab);

    return output;
}
00287 
00288 CMulticlassLabels* CKNN::classify_NN()
00289 {
00290     ASSERT(distance)
00291     ASSERT(m_num_classes>0)
00292 
00293     int32_t num_lab = distance->get_num_vec_rhs();
00294     ASSERT(num_lab)
00295 
00296     CMulticlassLabels* output = new CMulticlassLabels(num_lab);
00297     float64_t* distances = SG_MALLOC(float64_t, m_train_labels.vlen);
00298 
00299     SG_INFO("%d test examples\n", num_lab)
00300     CSignal::clear_cancel();
00301 
00302     // for each test example
00303     for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
00304     {
00305         SG_PROGRESS(i,0,num_lab)
00306 
00307         // get distances from i-th test example to 0..num_m_train_labels-1 train examples
00308         distances_lhs(distances,0,m_train_labels.vlen-1,i);
00309         int32_t j;
00310 
00311         // assuming 0th train examples as nearest to i-th test example
00312         int32_t out_idx = 0;
00313         float64_t min_dist = distances[0];
00314 
00315         // searching for nearest neighbor by comparing distances
00316         for (j=0; j<m_train_labels.vlen; j++)
00317         {
00318             if (distances[j]<min_dist)
00319             {
00320                 min_dist = distances[j];
00321                 out_idx = j;
00322             }
00323         }
00324 
00325         // label i-th test example with label of nearest neighbor with out_idx index
00326         output->set_label(i,m_train_labels.vector[out_idx]+m_min_label);
00327     }
00328 
00329     SG_FREE(distances);
00330     return output;
00331 }
00332 
00333 SGMatrix<int32_t> CKNN::classify_for_multiple_k()
00334 {
00335     ASSERT(m_num_classes>0)
00336     ASSERT(distance)
00337     ASSERT(distance->get_num_vec_rhs())
00338 
00339     int32_t num_lab=distance->get_num_vec_rhs();
00340     ASSERT(m_k<=num_lab)
00341 
00342     int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);
00343 
00344     //working buffer of m_train_labels
00345     int32_t* train_lab=SG_MALLOC(int32_t, m_k);
00346 
00347     //histogram of classes and returned output
00348     int32_t* classes=SG_MALLOC(int32_t, m_num_classes);
00349 
00350     SG_INFO("%d test examples\n", num_lab)
00351     CSignal::clear_cancel();
00352 
00353     if ( ! m_use_covertree )
00354     {
00355         //get the k nearest neighbors of each example
00356         SGMatrix<index_t> NN = nearest_neighbors();
00357 
00358         for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
00359         {
00360             //write the labels of the k nearest neighbors from theirs indices
00361             for (int32_t j=0; j<m_k; j++)
00362                 train_lab[j] = m_train_labels[ NN(j,i) ];
00363 
00364             choose_class_for_multiple_k(output+i, classes, train_lab, num_lab);
00365         }
00366     }
00367     else    // Use cover tree
00368     {
00369         //allocation for distances to nearest neighbors
00370         float64_t* dists=SG_MALLOC(float64_t, m_k);
00371 
00372         // From the sets of features (lhs and rhs) stored in distance,
00373         // build arrays of cover tree points
00374         v_array< CJLCoverTreePoint > set_of_points  =
00375             parse_points(distance, FC_LHS);
00376         v_array< CJLCoverTreePoint > set_of_queries =
00377             parse_points(distance, FC_RHS);
00378 
00379         // Build the cover trees, one for the test vectors (rhs features)
00380         // and another for the training vectors (lhs features)
00381         CFeatures* r = distance->replace_rhs( distance->get_lhs() );
00382         node< CJLCoverTreePoint > top = batch_create(set_of_points);
00383         CFeatures* l = distance->replace_lhs(r);
00384         distance->replace_rhs(r);
00385         node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);
00386 
00387         // Get the k nearest neighbors to all the test vectors (batch method)
00388         distance->replace_lhs(l);
00389         v_array< v_array< CJLCoverTreePoint > > res;
00390         k_nearest_neighbor(top, top_query, res, m_k);
00391 
00392         for ( int32_t i = 0 ; i < res.index ; ++i )
00393         {
00394             // Handle the fact that cover tree doesn't return neighbors
00395             // ordered by distance
00396 
00397             for ( int32_t j = 0 ; j < m_k ; ++j )
00398             {
00399                 // The first index in res[i] points to the test vector
00400                 dists[j]     = distance->distance(res[i][j+1].m_index,
00401                             res[i][0].m_index);
00402                 train_lab[j] = m_train_labels.vector[
00403                             res[i][j+1].m_index ];
00404             }
00405 
00406             // Now we get the indices to the neighbors sorted by distance
00407             CMath::qsort_index(dists, train_lab, m_k);
00408 
00409             choose_class_for_multiple_k(output+res[i][0].m_index, classes,
00410                     train_lab, num_lab);
00411         }
00412 
00413         SG_FREE(dists);
00414     }
00415 
00416     SG_FREE(train_lab);
00417     SG_FREE(classes);
00418 
00419     return SGMatrix<int32_t>(output,num_lab,m_k,true);
00420 }
00421 
00422 void CKNN::init_distance(CFeatures* data)
00423 {
00424     if (!distance)
00425         SG_ERROR("No distance assigned!\n")
00426     CFeatures* lhs=distance->get_lhs();
00427     if (!lhs || !lhs->get_num_vectors())
00428     {
00429         SG_UNREF(lhs);
00430         SG_ERROR("No vectors on left hand side\n")
00431     }
00432     distance->init(lhs, data);
00433     SG_UNREF(lhs);
00434 }
00435 
/** Load the classifier from a file. Not implemented — always returns false.
 * The locale macros bracket the (empty) body to keep numeric parsing
 * locale-independent, matching the other load/save implementations. */
bool CKNN::load(FILE* srcfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}
00442 
/** Save the classifier to a file. Not implemented — always returns false.
 * The locale macros bracket the (empty) body to keep numeric formatting
 * locale-independent, matching the other load/save implementations. */
bool CKNN::save(FILE* dstfile)
{
    SG_SET_LOCALE_C;
    SG_RESET_LOCALE;
    return false;
}
00449 
/** Replace the distance's lhs with a duplicate of the training features so
 * the model owns its own copy (presumably so it stays valid if the caller's
 * features are modified or freed — confirm against CDistanceMachine docs). */
void CKNN::store_model_features()
{
    CFeatures* d_lhs=distance->get_lhs();
    CFeatures* d_rhs=distance->get_rhs();

    /* copy lhs of underlying distance */
    distance->init(d_lhs->duplicate(), d_rhs);

    // two unrefs on d_lhs are intentional: duplicate() and get_lhs() each
    // added a reference that this function must release
    SG_UNREF(d_lhs); // because of d_lhs->duplicate()
    SG_UNREF(d_lhs); // because of distance->get_lhs()
    SG_UNREF(d_rhs);
}
00462 
00463 int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)
00464 {
00465     memset(classes, 0, sizeof(float64_t)*m_num_classes);
00466 
00467     float64_t multiplier = m_q;
00468     for (int32_t j=0; j<m_k; j++)
00469     {
00470         classes[train_lab[j]]+= multiplier;
00471         multiplier*= multiplier;
00472     }
00473 
00474     //choose the class that got 'outputted' most often
00475     int32_t out_idx=0;
00476     float64_t out_max=0;
00477 
00478     for (int32_t j=0; j<m_num_classes; j++)
00479     {
00480         if (out_max< classes[j])
00481         {
00482             out_idx= j;
00483             out_max= classes[j];
00484         }
00485     }
00486 
00487     return out_idx;
00488 }
00489 
00490 void CKNN::choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step)
00491 {
00492     //compute histogram of class outputs of the first k nearest neighbours
00493     memset(classes, 0, sizeof(int32_t)*m_num_classes);
00494 
00495     for (int32_t j=0; j<m_k; j++)
00496     {
00497         classes[train_lab[j]]++;
00498 
00499         //choose the class that got 'outputted' most often
00500         int32_t out_idx=0;
00501         int32_t out_max=0;
00502 
00503         for (int32_t c=0; c<m_num_classes; c++)
00504         {
00505             if (out_max< classes[c])
00506             {
00507                 out_idx= c;
00508                 out_max= classes[c];
00509             }
00510         }
00511 
00512         output[j*step]=out_idx+m_min_label;
00513     }
00514 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation