SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 2007-2009 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/clustering/KMeansLloydImpl.h> 00013 #include "shogun/clustering/KMeansMiniBatchImpl.h" 00014 #include <shogun/clustering/KMeans.h> 00015 #include <shogun/distance/Distance.h> 00016 #include <shogun/labels/Labels.h> 00017 #include <shogun/features/DenseFeatures.h> 00018 #include <shogun/mathematics/Math.h> 00019 #include <shogun/base/Parallel.h> 00020 00021 #ifdef HAVE_PTHREAD 00022 #include <pthread.h> 00023 #endif 00024 00025 #define MUSRECALC 00026 00027 using namespace shogun; 00028 00029 CKMeans::CKMeans() 00030 : CDistanceMachine() 00031 { 00032 init(); 00033 } 00034 00035 CKMeans::CKMeans(int32_t k_, CDistance* d, EKMeansMethod f) 00036 :CDistanceMachine() 00037 { 00038 init(); 00039 k=k_; 00040 set_distance(d); 00041 train_method=f; 00042 } 00043 00044 CKMeans::CKMeans(int32_t k_, CDistance* d, bool use_kmpp, EKMeansMethod f) 00045 : CDistanceMachine() 00046 { 00047 init(); 00048 k=k_; 00049 set_distance(d); 00050 use_kmeanspp=use_kmpp; 00051 train_method=f; 00052 } 00053 00054 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, SGMatrix<float64_t> centers_i, EKMeansMethod f) 00055 : CDistanceMachine() 00056 { 00057 init(); 00058 k = k_i; 00059 set_distance(d_i); 00060 set_initial_centers(centers_i); 00061 train_method=f; 00062 } 00063 00064 CKMeans::~CKMeans() 00065 { 00066 } 00067 00068 void CKMeans::set_initial_centers(SGMatrix<float64_t> centers) 00069 { 00070 dimensions = ((CDenseFeatures<float64_t>*) distance->get_lhs())->get_num_features(); 00071 REQUIRE(centers.num_cols == k, 00072 "Expected %d initial cluster centers, got %d", k, centers.num_cols); 00073 REQUIRE(centers.num_rows == dimensions, 00074 "Expected %d dimensionional cluster centers, got %d", dimensions, centers.num_rows); 00075 mus_initial = centers; 00076 } 00077 00078 void CKMeans::set_random_centers(SGVector<float64_t> weights_set, SGVector<int32_t> ClList, int32_t XSize) 00079 { 00080 CDenseFeatures<float64_t>* lhs= 00081 CDenseFeatures<float64_t>::obtain_from_generic(distance->get_lhs()); 00082 00083 for (int32_t i=0; i<XSize; i++) 00084 { 00085 const int32_t Cl=CMath::random(0, k-1); 00086 weights_set[Cl]+=1.0; 00087 ClList[i]=Cl; 00088 00089 int32_t vlen=0; 00090 bool vfree=false; 00091 float64_t* vec=lhs->get_feature_vector(i, vlen, vfree); 00092 00093 for (int32_t j=0; j<dimensions; j++) 00094 mus.matrix[Cl*dimensions+j] += vec[j]; 00095 00096 lhs->free_feature_vector(vec, i, vfree); 00097 } 00098 00099 SG_UNREF(lhs); 00100 00101 for (int32_t i=0; i<k; i++) 00102 { 00103 if (weights_set[i]!=0.0) 00104 { 00105 for (int32_t j=0; j<dimensions; j++) 00106 mus.matrix[i*dimensions+j] /= weights_set[i]; 00107 } 00108 } 00109 } 00110 00111 void CKMeans::set_initial_centers(SGVector<float64_t> weights_set, 00112 SGVector<int32_t> ClList, int32_t XSize) 00113 { 00114 ASSERT(mus_initial.matrix); 00115 00117 CDenseFeatures<float64_t>* rhs_mus=new CDenseFeatures<float64_t>(0); 00118 CFeatures* rhs_cache=distance->replace_rhs(rhs_mus); 00119 rhs_mus->copy_feature_matrix(mus_initial); 00120 00121 SGVector<float64_t> dists=SGVector<float64_t>(k*XSize); 00122 dists.zero(); 00123 00124 for(int32_t idx=0;idx<XSize;idx++) 00125 { 00126 for(int32_t m=0;m<k;m++) 00127 dists[k*idx+m] = distance->distance(idx,m); 00128 } 00129 00130 for (int32_t i=0; i<XSize; i++) 00131 { 00132 float64_t mini=dists[i*k]; 00133 int32_t Cl = 0, j; 00134 00135 for (j=1; j<k; j++) 00136 { 00137 if (dists[i*k+j]<mini) 00138 { 00139 Cl=j; 00140 mini=dists[i*k+j]; 00141 } 00142 } 00143 ClList[i]=Cl; 00144 } 00145 00146 /* Compute the sum of all points belonging to a cluster 00147 * and count the points */ 00148 CDenseFeatures<float64_t>* lhs= 00149 (CDenseFeatures<float64_t>*)distance->get_lhs(); 00150 for (int32_t i=0; i<XSize; i++) 00151 { 00152 const int32_t Cl = ClList[i]; 00153 weights_set[Cl]+=1.0; 00154 00155 int32_t vlen=0; 00156 bool vfree=false; 00157 float64_t* vec=lhs->get_feature_vector(i, vlen, vfree); 00158 00159 for (int32_t j=0; j<dimensions; j++) 00160 mus.matrix[Cl*dimensions+j] += vec[j]; 00161 00162 lhs->free_feature_vector(vec, i, vfree); 00163 } 00164 SG_UNREF(lhs); 00165 distance->replace_rhs(rhs_cache); 00166 delete rhs_mus; 00167 00168 /* normalization to get the mean */ 00169 for (int32_t i=0; i<k; i++) 00170 { 00171 if (weights_set[i]!=0.0) 00172 { 00173 for (int32_t j=0; j<dimensions; j++) 00174 mus.matrix[i*dimensions+j] /= weights_set[i]; 00175 } 00176 } 00177 00178 } 00179 00180 void CKMeans::compute_cluster_variances() 00181 { 00182 /* compute the ,,variances'' of the clusters */ 00183 for (int32_t i=0; i<k; i++) 00184 { 00185 float64_t rmin1=0; 00186 float64_t rmin2=0; 00187 00188 bool first_round=true; 00189 00190 for (int32_t j=0; j<k; j++) 00191 { 00192 if (j!=i) 00193 { 00194 int32_t l; 00195 float64_t dist = 0; 00196 00197 for (l=0; l<dimensions; l++) 00198 { 00199 dist+=CMath::sq( 00200 mus.matrix[i*dimensions+l] 00201 -mus.matrix[j*dimensions+l]); 00202 } 00203 00204 if (first_round) 00205 { 00206 rmin1=dist; 00207 rmin2=dist; 00208 first_round=false; 00209 } 00210 else 00211 { 00212 if ((dist<rmin2) && (dist>=rmin1)) 00213 rmin2=dist; 00214 00215 if (dist<rmin1) 00216 { 00217 rmin2=rmin1; 00218 rmin1=dist; 00219 } 00220 } 00221 } 00222 } 00223 00224 R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2)); 00225 } 00226 } 00227 00228 bool CKMeans::train_machine(CFeatures* data) 00229 { 00230 ASSERT(distance && distance->get_feature_type()==F_DREAL) 00231 00232 if (data) 00233 distance->init(data, data); 00234 00235 CDenseFeatures<float64_t>* lhs= 00236 CDenseFeatures<float64_t>::obtain_from_generic(distance->get_lhs()); 00237 00238 ASSERT(lhs); 00239 int32_t XSize=lhs->get_num_vectors(); 00240 dimensions=lhs->get_num_features(); 00241 const int32_t XDimk=dimensions*k; 00242 00243 ASSERT(XSize>0 && dimensions>0); 00244 00246 if (use_kmeanspp) 00247 mus_initial=kmeanspp(); 00248 00249 R=SGVector<float64_t>(k); 00250 00251 mus=SGMatrix<float64_t>(dimensions, k); 00252 /* cluster_centers=zeros(dimensions, k) ; */ 00253 memset(mus.matrix, 0, sizeof(float64_t)*XDimk); 00254 00255 SGVector<int32_t> ClList=SGVector<int32_t>(XSize); 00256 ClList.zero(); 00257 SGVector<float64_t> weights_set=SGVector<float64_t>(k); 00258 weights_set.zero(); 00259 00260 if (mus_initial.matrix) 00261 set_initial_centers(weights_set, ClList, XSize); 00262 else 00263 set_random_centers(weights_set, ClList, XSize); 00264 00265 if (train_method==KMM_MINI_BATCH) 00266 { 00267 CKMeansMiniBatchImpl::minibatch_KMeans(k, distance, batch_size, minib_iter, mus); 00268 } 00269 else 00270 { 00271 CKMeansLloydImpl::Lloyd_KMeans(k, distance, max_iter, mus, ClList, weights_set, fixed_centers); 00272 } 00273 00274 compute_cluster_variances(); 00275 SG_UNREF(lhs); 00276 return true; 00277 } 00278 00279 bool CKMeans::load(FILE* srcfile) 00280 { 00281 SG_SET_LOCALE_C; 00282 SG_RESET_LOCALE; 00283 return false; 00284 } 00285 00286 bool CKMeans::save(FILE* dstfile) 00287 { 00288 SG_SET_LOCALE_C; 00289 SG_RESET_LOCALE; 00290 return false; 00291 } 00292 00293 void CKMeans::set_use_kmeanspp(bool kmpp) 00294 { 00295 use_kmeanspp=kmpp; 00296 } 00297 00298 bool CKMeans::get_use_kmeanspp() const 00299 { 00300 return use_kmeanspp; 00301 } 00302 00303 void CKMeans::set_k(int32_t p_k) 00304 { 00305 REQUIRE(p_k>0, "number of clusters should be > 0"); 00306 this->k=p_k; 00307 } 00308 00309 int32_t CKMeans::get_k() 00310 { 00311 return k; 00312 } 00313 00314 void CKMeans::set_max_iter(int32_t iter) 00315 { 00316 REQUIRE(iter>0, "number of clusters should be > 0"); 00317 max_iter=iter; 00318 } 00319 00320 float64_t CKMeans::get_max_iter() 00321 { 00322 return max_iter; 00323 } 00324 00325 void CKMeans::set_train_method(EKMeansMethod f) 00326 { 00327 train_method=f; 00328 } 00329 00330 EKMeansMethod CKMeans::get_train_method() const 00331 { 00332 return train_method; 00333 } 00334 00335 void CKMeans::set_mbKMeans_batch_size(int32_t b) 00336 { 00337 REQUIRE(b>0, "Parameter bach size should be > 0"); 00338 batch_size=b; 00339 } 00340 00341 int32_t CKMeans::get_mbKMeans_batch_size() const 00342 { 00343 return batch_size; 00344 } 00345 00346 void CKMeans::set_mbKMeans_iter(int32_t i) 00347 { 00348 REQUIRE(i>0, "Parameter number of iterations should be > 0"); 00349 minib_iter=i; 00350 } 00351 00352 int32_t CKMeans::get_mbKMeans_iter() const 00353 { 00354 return minib_iter; 00355 } 00356 00357 void CKMeans::set_mbKMeans_params(int32_t b, int32_t t) 00358 { 00359 REQUIRE(b>0, "Parameter bach size should be > 0"); 00360 REQUIRE(t>0, "Parameter number of iterations should be > 0"); 00361 batch_size=b; 00362 minib_iter=t; 00363 } 00364 00365 SGVector<float64_t> CKMeans::get_radiuses() 00366 { 00367 return R; 00368 } 00369 00370 SGMatrix<float64_t> CKMeans::get_cluster_centers() 00371 { 00372 if (!R.vector) 00373 return SGMatrix<float64_t>(); 00374 00375 CDenseFeatures<float64_t>* lhs= 00376 (CDenseFeatures<float64_t>*)distance->get_lhs(); 00377 SGMatrix<float64_t> centers=lhs->get_feature_matrix(); 00378 SG_UNREF(lhs); 00379 return centers; 00380 } 00381 00382 int32_t CKMeans::get_dimensions() 00383 { 00384 return dimensions; 00385 } 00386 00387 void CKMeans::set_fixed_centers(bool fixed) 00388 { 00389 fixed_centers=fixed; 00390 } 00391 00392 bool CKMeans::get_fixed_centers() 00393 { 00394 return fixed_centers; 00395 } 00396 00397 void CKMeans::store_model_features() 00398 { 00399 /* set lhs of underlying distance to cluster centers */ 00400 CDenseFeatures<float64_t>* cluster_centers=new CDenseFeatures<float64_t>( 00401 mus); 00402 00403 /* store cluster centers in lhs of distance variable */ 00404 CFeatures* rhs=distance->get_rhs(); 00405 distance->init(cluster_centers, rhs); 00406 SG_UNREF(rhs); 00407 } 00408 00409 SGMatrix<float64_t> CKMeans::kmeanspp() 00410 { 00411 int32_t num=distance->get_num_vec_lhs(); 00412 SGVector<float64_t> dists=SGVector<float64_t>(num); 00413 SGVector<int32_t> mu_index=SGVector<int32_t>(k); 00414 00415 /* 1st center */ 00416 int32_t mu_1=CMath::random((int32_t) 0,num-1); 00417 mu_index[0]=mu_1; 00418 00419 /* choose a center - do k-1 times */ 00420 int32_t count=0; 00421 while (++count<k) 00422 { 00423 float64_t sum=0.0; 00424 /* for each data point find distance to nearest already chosen center */ 00425 for (int32_t point_idx=0;point_idx<num;point_idx++) 00426 { 00427 dists[point_idx]=distance->distance(mu_index[0],point_idx); 00428 int32_t cent_id=1; 00429 00430 while (cent_id<count) 00431 { 00432 float64_t dist_temp=distance->distance(mu_index[cent_id],point_idx); 00433 if (dists[point_idx]>dist_temp) 00434 dists[point_idx]=dist_temp; 00435 cent_id++; 00436 } 00437 00438 dists[point_idx]*=dists[point_idx]; 00439 sum+=dists[point_idx]; 00440 } 00441 00442 /*random choosing - points weighted by square of distance from nearset center*/ 00443 int32_t mu_next=0; 00444 float64_t chosen=CMath::random(0.0,sum); 00445 while ((chosen-=dists[mu_next])>0) 00446 mu_next++; 00447 00448 mu_index[count]=mu_next; 00449 } 00450 00451 CDenseFeatures<float64_t>* lhs=(CDenseFeatures<float64_t>*)distance->get_lhs(); 00452 int32_t dim=lhs->get_num_features(); 00453 SGMatrix<float64_t> mat=SGMatrix<float64_t>(dim,k); 00454 for (int32_t c_m=0;c_m<k;c_m++) 00455 { 00456 SGVector<float64_t> feature=lhs->get_feature_vector(c_m); 00457 for (int32_t r_m=0;r_m<dim;r_m++) 00458 mat(r_m,c_m)=feature[r_m]; 00459 } 00460 SG_UNREF(lhs); 00461 return mat; 00462 } 00463 00464 void CKMeans::init() 00465 { 00466 max_iter=10000; 00467 k=3; 00468 dimensions=0; 00469 fixed_centers=false; 00470 use_kmeanspp=false; 00471 train_method=KMM_LLOYD; 00472 batch_size=-1; 00473 minib_iter=-1; 00474 SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE); 00475 SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE); 00476 SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE); 00477 SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE); 00478 } 00479