SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
KMeans.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/clustering/KMeansLloydImpl.h>
00013 #include "shogun/clustering/KMeansMiniBatchImpl.h"
00014 #include <shogun/clustering/KMeans.h>
00015 #include <shogun/distance/Distance.h>
00016 #include <shogun/labels/Labels.h>
00017 #include <shogun/features/DenseFeatures.h>
00018 #include <shogun/mathematics/Math.h>
00019 #include <shogun/base/Parallel.h>
00020 
00021 #ifdef HAVE_PTHREAD
00022 #include <pthread.h>
00023 #endif
00024 
00025 #define MUSRECALC
00026 
00027 using namespace shogun;
00028 
00029 CKMeans::CKMeans()
00030 : CDistanceMachine()
00031 {
00032     init();
00033 }
00034 
00035 CKMeans::CKMeans(int32_t k_, CDistance* d, EKMeansMethod f)
00036 :CDistanceMachine()
00037 {
00038     init();
00039     k=k_;
00040     set_distance(d);
00041     train_method=f;
00042 }
00043 
00044 CKMeans::CKMeans(int32_t k_, CDistance* d, bool use_kmpp, EKMeansMethod f)
00045 : CDistanceMachine()
00046 {   
00047     init();
00048     k=k_;
00049     set_distance(d);
00050     use_kmeanspp=use_kmpp;
00051     train_method=f;
00052 }
00053 
00054 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, SGMatrix<float64_t> centers_i, EKMeansMethod f)
00055 : CDistanceMachine()
00056 {
00057     init();
00058     k = k_i;
00059     set_distance(d_i);
00060     set_initial_centers(centers_i);
00061     train_method=f;
00062 }
00063 
00064 CKMeans::~CKMeans()
00065 {
00066 }
00067 
00068 void CKMeans::set_initial_centers(SGMatrix<float64_t> centers)
00069 {
00070     dimensions = ((CDenseFeatures<float64_t>*) distance->get_lhs())->get_num_features();
00071     REQUIRE(centers.num_cols == k,
00072             "Expected %d initial cluster centers, got %d", k, centers.num_cols);
00073     REQUIRE(centers.num_rows == dimensions,
00074             "Expected %d dimensionional cluster centers, got %d", dimensions, centers.num_rows);
00075     mus_initial = centers;
00076 }
00077 
00078 void CKMeans::set_random_centers(SGVector<float64_t> weights_set, SGVector<int32_t> ClList, int32_t XSize)
00079 {
00080     CDenseFeatures<float64_t>* lhs=
00081         CDenseFeatures<float64_t>::obtain_from_generic(distance->get_lhs());
00082 
00083     for (int32_t i=0; i<XSize; i++)
00084     {
00085         const int32_t Cl=CMath::random(0, k-1);
00086         weights_set[Cl]+=1.0;
00087         ClList[i]=Cl;
00088 
00089         int32_t vlen=0;
00090         bool vfree=false;
00091         float64_t* vec=lhs->get_feature_vector(i, vlen, vfree);
00092 
00093         for (int32_t j=0; j<dimensions; j++)
00094             mus.matrix[Cl*dimensions+j] += vec[j];
00095 
00096         lhs->free_feature_vector(vec, i, vfree);
00097     }
00098 
00099     SG_UNREF(lhs);
00100 
00101     for (int32_t i=0; i<k; i++)
00102     {
00103         if (weights_set[i]!=0.0)
00104         {
00105             for (int32_t j=0; j<dimensions; j++)
00106                 mus.matrix[i*dimensions+j] /= weights_set[i];
00107         }
00108     }
00109 }
00110 
00111 void CKMeans::set_initial_centers(SGVector<float64_t> weights_set, 
00112                 SGVector<int32_t> ClList, int32_t XSize)
00113 {
00114     ASSERT(mus_initial.matrix);
00115 
00117     CDenseFeatures<float64_t>* rhs_mus=new CDenseFeatures<float64_t>(0);
00118     CFeatures* rhs_cache=distance->replace_rhs(rhs_mus);
00119     rhs_mus->copy_feature_matrix(mus_initial);
00120 
00121     SGVector<float64_t> dists=SGVector<float64_t>(k*XSize);
00122     dists.zero();
00123 
00124     for(int32_t idx=0;idx<XSize;idx++)
00125     {
00126         for(int32_t m=0;m<k;m++)
00127             dists[k*idx+m] = distance->distance(idx,m);
00128     }
00129 
00130     for (int32_t i=0; i<XSize; i++)
00131     {
00132         float64_t mini=dists[i*k];
00133         int32_t Cl = 0, j;
00134 
00135         for (j=1; j<k; j++)
00136         {
00137             if (dists[i*k+j]<mini)
00138             {
00139                 Cl=j;
00140                 mini=dists[i*k+j];
00141             }
00142         }
00143         ClList[i]=Cl;
00144     }
00145 
00146     /* Compute the sum of all points belonging to a cluster
00147      * and count the points */
00148     CDenseFeatures<float64_t>* lhs=
00149             (CDenseFeatures<float64_t>*)distance->get_lhs();
00150     for (int32_t i=0; i<XSize; i++)
00151     {
00152         const int32_t Cl = ClList[i];
00153         weights_set[Cl]+=1.0;
00154 
00155         int32_t vlen=0;
00156         bool vfree=false;
00157         float64_t* vec=lhs->get_feature_vector(i, vlen, vfree);
00158 
00159         for (int32_t j=0; j<dimensions; j++)
00160             mus.matrix[Cl*dimensions+j] += vec[j];
00161 
00162         lhs->free_feature_vector(vec, i, vfree);
00163     }
00164     SG_UNREF(lhs);
00165     distance->replace_rhs(rhs_cache);
00166     delete rhs_mus;
00167 
00168         /* normalization to get the mean */
00169     for (int32_t i=0; i<k; i++)
00170     {
00171         if (weights_set[i]!=0.0)
00172         {
00173             for (int32_t j=0; j<dimensions; j++)
00174                 mus.matrix[i*dimensions+j] /= weights_set[i];
00175         }
00176     }
00177 
00178 }
00179 
00180 void CKMeans::compute_cluster_variances()
00181 {
00182     /* compute the ,,variances'' of the clusters */
00183     for (int32_t i=0; i<k; i++)
00184     {
00185         float64_t rmin1=0;
00186         float64_t rmin2=0;
00187 
00188         bool first_round=true;
00189 
00190         for (int32_t j=0; j<k; j++)
00191         {
00192             if (j!=i)
00193             {
00194                 int32_t l;
00195                 float64_t dist = 0;
00196 
00197                 for (l=0; l<dimensions; l++)
00198                 {
00199                     dist+=CMath::sq(
00200                             mus.matrix[i*dimensions+l]
00201                                     -mus.matrix[j*dimensions+l]);
00202                 }
00203 
00204                 if (first_round)
00205                 {
00206                     rmin1=dist;
00207                     rmin2=dist;
00208                     first_round=false;
00209                 }
00210                 else
00211                 {
00212                     if ((dist<rmin2) && (dist>=rmin1))
00213                         rmin2=dist;
00214 
00215                     if (dist<rmin1)
00216                     {
00217                         rmin2=rmin1;
00218                         rmin1=dist;
00219                     }
00220                 }
00221             }
00222         }
00223 
00224         R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
00225     }
00226 }
00227 
00228 bool CKMeans::train_machine(CFeatures* data)
00229 {
00230     ASSERT(distance && distance->get_feature_type()==F_DREAL)
00231 
00232     if (data)
00233         distance->init(data, data);
00234 
00235     CDenseFeatures<float64_t>* lhs=
00236         CDenseFeatures<float64_t>::obtain_from_generic(distance->get_lhs());
00237 
00238     ASSERT(lhs);
00239     int32_t XSize=lhs->get_num_vectors();
00240     dimensions=lhs->get_num_features();
00241     const int32_t XDimk=dimensions*k;
00242 
00243     ASSERT(XSize>0 && dimensions>0);
00244 
00246     if (use_kmeanspp)
00247         mus_initial=kmeanspp();
00248 
00249     R=SGVector<float64_t>(k);
00250 
00251     mus=SGMatrix<float64_t>(dimensions, k);
00252     /* cluster_centers=zeros(dimensions, k) ; */
00253     memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
00254 
00255     SGVector<int32_t> ClList=SGVector<int32_t>(XSize);
00256     ClList.zero();
00257     SGVector<float64_t> weights_set=SGVector<float64_t>(k);
00258     weights_set.zero();
00259 
00260     if (mus_initial.matrix)
00261         set_initial_centers(weights_set, ClList, XSize);
00262     else
00263         set_random_centers(weights_set, ClList, XSize);
00264     
00265     if (train_method==KMM_MINI_BATCH)
00266     {
00267         CKMeansMiniBatchImpl::minibatch_KMeans(k, distance, batch_size, minib_iter, mus);
00268     }
00269     else
00270     {
00271         CKMeansLloydImpl::Lloyd_KMeans(k, distance, max_iter, mus, ClList, weights_set, fixed_centers);
00272     }
00273 
00274     compute_cluster_variances();
00275     SG_UNREF(lhs);
00276     return true;
00277 }
00278 
00279 bool CKMeans::load(FILE* srcfile)
00280 {
00281     SG_SET_LOCALE_C;
00282     SG_RESET_LOCALE;
00283     return false;
00284 }
00285 
00286 bool CKMeans::save(FILE* dstfile)
00287 {
00288     SG_SET_LOCALE_C;
00289     SG_RESET_LOCALE;
00290     return false;
00291 }
00292 
00293 void CKMeans::set_use_kmeanspp(bool kmpp)
00294 {
00295     use_kmeanspp=kmpp;
00296 }
00297 
00298 bool CKMeans::get_use_kmeanspp() const
00299 {
00300     return use_kmeanspp;
00301 }
00302 
00303 void CKMeans::set_k(int32_t p_k)
00304 {
00305     REQUIRE(p_k>0, "number of clusters should be > 0");
00306     this->k=p_k;
00307 }
00308 
00309 int32_t CKMeans::get_k()
00310 {
00311     return k;
00312 }
00313 
00314 void CKMeans::set_max_iter(int32_t iter)
00315 {
00316     REQUIRE(iter>0, "number of clusters should be > 0");
00317     max_iter=iter;
00318 }
00319 
00320 float64_t CKMeans::get_max_iter()
00321 {
00322     return max_iter;
00323 }
00324 
00325 void CKMeans::set_train_method(EKMeansMethod f)
00326 {
00327     train_method=f;
00328 }
00329 
00330 EKMeansMethod CKMeans::get_train_method() const
00331 {
00332     return train_method;
00333 }
00334 
00335 void CKMeans::set_mbKMeans_batch_size(int32_t b)
00336 {
00337     REQUIRE(b>0, "Parameter bach size should be > 0");
00338     batch_size=b;
00339 }
00340 
00341 int32_t CKMeans::get_mbKMeans_batch_size() const
00342 {
00343     return batch_size;
00344 }
00345 
00346 void CKMeans::set_mbKMeans_iter(int32_t i)
00347 {
00348     REQUIRE(i>0, "Parameter number of iterations should be > 0");
00349     minib_iter=i;
00350 }
00351 
00352 int32_t CKMeans::get_mbKMeans_iter() const
00353 {
00354     return minib_iter;
00355 }
00356 
00357 void CKMeans::set_mbKMeans_params(int32_t b, int32_t t)
00358 {
00359     REQUIRE(b>0, "Parameter bach size should be > 0");
00360     REQUIRE(t>0, "Parameter number of iterations should be > 0");
00361     batch_size=b;
00362     minib_iter=t;
00363 }
00364 
00365 SGVector<float64_t> CKMeans::get_radiuses()
00366 {
00367     return R;
00368 }
00369 
00370 SGMatrix<float64_t> CKMeans::get_cluster_centers()
00371 {
00372     if (!R.vector)
00373         return SGMatrix<float64_t>();
00374 
00375     CDenseFeatures<float64_t>* lhs=
00376         (CDenseFeatures<float64_t>*)distance->get_lhs();
00377     SGMatrix<float64_t> centers=lhs->get_feature_matrix();
00378     SG_UNREF(lhs);
00379     return centers;
00380 }
00381 
00382 int32_t CKMeans::get_dimensions()
00383 {
00384     return dimensions;
00385 }
00386 
00387 void CKMeans::set_fixed_centers(bool fixed)
00388 {
00389     fixed_centers=fixed;
00390 }
00391 
00392 bool CKMeans::get_fixed_centers()
00393 {
00394     return fixed_centers;
00395 }
00396 
00397 void CKMeans::store_model_features()
00398 {
00399     /* set lhs of underlying distance to cluster centers */
00400     CDenseFeatures<float64_t>* cluster_centers=new CDenseFeatures<float64_t>(
00401             mus);
00402 
00403     /* store cluster centers in lhs of distance variable */
00404     CFeatures* rhs=distance->get_rhs();
00405     distance->init(cluster_centers, rhs);
00406     SG_UNREF(rhs);
00407 }
00408 
00409 SGMatrix<float64_t> CKMeans::kmeanspp()
00410 {
00411     int32_t num=distance->get_num_vec_lhs();
00412     SGVector<float64_t> dists=SGVector<float64_t>(num);
00413     SGVector<int32_t> mu_index=SGVector<int32_t>(k);
00414     
00415     /* 1st center */
00416     int32_t mu_1=CMath::random((int32_t) 0,num-1);
00417     mu_index[0]=mu_1;
00418     
00419     /* choose a center - do k-1 times */
00420     int32_t count=0;
00421     while (++count<k)
00422     {
00423         float64_t sum=0.0;
00424         /* for each data point find distance to nearest already chosen center */
00425         for (int32_t point_idx=0;point_idx<num;point_idx++)
00426         {
00427             dists[point_idx]=distance->distance(mu_index[0],point_idx);
00428             int32_t cent_id=1;
00429 
00430             while (cent_id<count)
00431             {
00432                 float64_t dist_temp=distance->distance(mu_index[cent_id],point_idx); 
00433                 if (dists[point_idx]>dist_temp)
00434                     dists[point_idx]=dist_temp; 
00435                 cent_id++;
00436             }
00437 
00438             dists[point_idx]*=dists[point_idx];
00439             sum+=dists[point_idx];
00440         }
00441 
00442         /*random choosing - points weighted by square of distance from nearset center*/
00443         int32_t mu_next=0;
00444         float64_t chosen=CMath::random(0.0,sum);
00445         while ((chosen-=dists[mu_next])>0)
00446             mu_next++;
00447 
00448         mu_index[count]=mu_next;
00449     }
00450 
00451     CDenseFeatures<float64_t>* lhs=(CDenseFeatures<float64_t>*)distance->get_lhs();
00452     int32_t dim=lhs->get_num_features();
00453     SGMatrix<float64_t> mat=SGMatrix<float64_t>(dim,k);
00454     for (int32_t c_m=0;c_m<k;c_m++)
00455     {
00456         SGVector<float64_t> feature=lhs->get_feature_vector(c_m);
00457         for (int32_t r_m=0;r_m<dim;r_m++)
00458             mat(r_m,c_m)=feature[r_m];
00459     }
00460     SG_UNREF(lhs);
00461     return mat;
00462 }
00463 
00464 void CKMeans::init()
00465 {
00466     max_iter=10000;
00467     k=3;
00468     dimensions=0;
00469     fixed_centers=false;
00470     use_kmeanspp=false;
00471     train_method=KMM_LLOYD;
00472     batch_size=-1;
00473     minib_iter=-1;
00474     SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
00475     SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
00476     SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
00477     SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
00478 }
00479 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation