SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
Distance.cpp
Go to the documentation of this file.
00001 /*
00002  * this program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Christian Gehl
00008  * Written (W) 2006-2009 Soeren Sonnenburg
00009  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/lib/config.h>
00013 #include <shogun/lib/common.h>
00014 #include <shogun/io/SGIO.h>
00015 #include <shogun/io/File.h>
00016 #include <shogun/lib/Time.h>
00017 #include <shogun/lib/Signal.h>
00018 #include <shogun/base/Parallel.h>
00019 #include <shogun/base/Parameter.h>
00020 
00021 #include <shogun/distance/Distance.h>
00022 #include <shogun/features/Features.h>
00023 
00024 #include <string.h>
00025 #include <unistd.h>
00026 
00027 #ifdef HAVE_PTHREAD
00028 #include <pthread.h>
00029 #endif
00030 
00031 using namespace shogun;
00032 
00034 template <class T> struct D_THREAD_PARAM
00035 {
00037     CDistance* distance;
00039     int32_t start;
00041     int32_t end;
00043     int32_t total_start;
00045     int32_t total_end;
00047     int32_t m;
00049     int32_t n;
00051     T* result;
00053     bool symmetric;
00055     bool verbose;
00056 };
00057 
00058 CDistance::CDistance() : CSGObject()
00059 {
00060     init();
00061 }
00062 
00063 
00064 CDistance::CDistance(CFeatures* p_lhs, CFeatures* p_rhs) : CSGObject()
00065 {
00066     init();
00067     init(p_lhs, p_rhs);
00068 }
00069 
00070 CDistance::~CDistance()
00071 {
00072     SG_FREE(precomputed_matrix);
00073     precomputed_matrix=NULL;
00074 
00075     remove_lhs_and_rhs();
00076 }
00077 
00078 bool CDistance::init(CFeatures* l, CFeatures* r)
00079 {
00080     //make sure features were indeed supplied
00081     ASSERT(l)
00082     ASSERT(r)
00083 
00084     //make sure features are compatible
00085     ASSERT(l->get_feature_class()==r->get_feature_class())
00086     ASSERT(l->get_feature_type()==r->get_feature_type())
00087 
00088     //remove references to previous features
00089     remove_lhs_and_rhs();
00090 
00091     //increase reference counts
00092     SG_REF(l);
00093     SG_REF(r);
00094 
00095     lhs=l;
00096     rhs=r;
00097 
00098     num_lhs=l->get_num_vectors();
00099     num_rhs=r->get_num_vectors();
00100 
00101     SG_FREE(precomputed_matrix);
00102     precomputed_matrix=NULL ;
00103 
00104     return true;
00105 }
00106 
00107 void CDistance::load(CFile* loader)
00108 {
00109     SG_SET_LOCALE_C;
00110     SG_RESET_LOCALE;
00111 }
00112 
00113 void CDistance::save(CFile* writer)
00114 {
00115     SG_SET_LOCALE_C;
00116     SG_RESET_LOCALE;
00117 }
00118 
00119 void CDistance::remove_lhs_and_rhs()
00120 {
00121     SG_UNREF(rhs);
00122     rhs = NULL;
00123     num_rhs=0;
00124 
00125     SG_UNREF(lhs);
00126     lhs = NULL;
00127     num_lhs=0;
00128 }
00129 
00130 void CDistance::remove_lhs()
00131 {
00132     SG_UNREF(lhs);
00133     lhs = NULL;
00134     num_lhs=0;
00135 }
00136 
00138 void CDistance::remove_rhs()
00139 {
00140     SG_UNREF(rhs);
00141     rhs = NULL;
00142     num_rhs=0;
00143 }
00144 
00145 CFeatures* CDistance::replace_rhs(CFeatures* r)
00146 {
00147     //make sure features were indeed supplied
00148     ASSERT(r)
00149 
00150     //make sure features are compatible
00151     ASSERT(lhs->get_feature_class()==r->get_feature_class())
00152     ASSERT(lhs->get_feature_type()==r->get_feature_type())
00153 
00154     //remove references to previous rhs features
00155     CFeatures* tmp=rhs;
00156 
00157     rhs=r;
00158     num_rhs=r->get_num_vectors();
00159 
00160     SG_FREE(precomputed_matrix);
00161     precomputed_matrix=NULL ;
00162 
00163     // return old features including reference count
00164     return tmp;
00165 }
00166 
00167 CFeatures* CDistance::replace_lhs(CFeatures* l)
00168 {
00169     //make sure features were indeed supplied
00170     ASSERT(l)
00171 
00172     //make sure features are compatible
00173     ASSERT(rhs->get_feature_class()==l->get_feature_class())
00174     ASSERT(rhs->get_feature_type()==l->get_feature_type())
00175 
00176     //remove references to previous rhs features
00177     CFeatures* tmp=lhs;
00178 
00179     lhs=l;
00180     num_lhs=l->get_num_vectors();
00181 
00182     SG_FREE(precomputed_matrix);
00183     precomputed_matrix=NULL ;
00184 
00185     // return old features including reference count
00186     return tmp;
00187 }
00188 
00189 float64_t CDistance::distance(int32_t idx_a, int32_t idx_b)
00190 {
00191     REQUIRE(idx_a >= 0 && idx_b >= 0, "In CDistance::distance(int32_t,int32_t), idx_a and "
00192             "idx_b must be positive, %d and %d given instead\n", idx_a, idx_b)
00193 
00194     ASSERT(lhs)
00195     ASSERT(rhs)
00196 
00197     if (lhs==rhs)
00198     {
00199         int32_t num_vectors = lhs->get_num_vectors();
00200 
00201         if (idx_a>=num_vectors)
00202             idx_a=2*num_vectors-1-idx_a;
00203 
00204         if (idx_b>=num_vectors)
00205             idx_b=2*num_vectors-1-idx_b;
00206     }
00207 
00208     REQUIRE(idx_a < lhs->get_num_vectors() && idx_b < rhs->get_num_vectors(),
00209             "In CDistance::distance(int32_t,int32_t), idx_a and idx_b must be less than "
00210             "the number of vectors, but %d >= %d or %d >= %d\n",
00211             idx_a, lhs->get_num_vectors(), idx_b, rhs->get_num_vectors())
00212 
00213     if (precompute_matrix && (precomputed_matrix==NULL) && (lhs==rhs))
00214         do_precompute_matrix() ;
00215 
00216     if (precompute_matrix && (precomputed_matrix!=NULL))
00217     {
00218         if (idx_a>=idx_b)
00219             return precomputed_matrix[idx_a*(idx_a+1)/2+idx_b] ;
00220         else
00221             return precomputed_matrix[idx_b*(idx_b+1)/2+idx_a] ;
00222     }
00223 
00224     return compute(idx_a, idx_b);
00225 }
00226 
00227 void CDistance::do_precompute_matrix()
00228 {
00229     int32_t num_left=lhs->get_num_vectors();
00230     int32_t num_right=rhs->get_num_vectors();
00231     SG_INFO("precomputing distance matrix (%ix%i)\n", num_left, num_right)
00232 
00233     ASSERT(num_left==num_right)
00234     ASSERT(lhs==rhs)
00235     int32_t num=num_left;
00236 
00237     SG_FREE(precomputed_matrix);
00238     precomputed_matrix=SG_MALLOC(float32_t, num*(num+1)/2);
00239 
00240     for (int32_t i=0; i<num; i++)
00241     {
00242         SG_PROGRESS(i*i,0,num*num)
00243         for (int32_t j=0; j<=i; j++)
00244             precomputed_matrix[i*(i+1)/2+j] = compute(i,j) ;
00245     }
00246 
00247     SG_PROGRESS(num*num,0,num*num)
00248     SG_DONE()
00249 }
00250 
00251 void CDistance::init()
00252 {
00253     precomputed_matrix = NULL;
00254     precompute_matrix = false;
00255     lhs = NULL;
00256     rhs = NULL;
00257     num_lhs=0;
00258     num_rhs=0;
00259 
00260     m_parameters->add((CSGObject**) &lhs, "lhs",
00261                       "Feature vectors to occur on left hand side.");
00262     m_parameters->add((CSGObject**) &rhs, "rhs",
00263                       "Feature vectors to occur on right hand side.");
00264 }
00265 
00266 template <class T> void* CDistance::get_distance_matrix_helper(void* p)
00267 {
00268     D_THREAD_PARAM<T>* params= (D_THREAD_PARAM<T>*) p;
00269     int32_t i_start=params->start;
00270     int32_t i_end=params->end;
00271     CDistance* k=params->distance;
00272     T* result=params->result;
00273     bool symmetric=params->symmetric;
00274     int32_t n=params->n;
00275     int32_t m=params->m;
00276     bool verbose=params->verbose;
00277     int64_t total_start=params->total_start;
00278     int64_t total_end=params->total_end;
00279     int64_t total=total_start;
00280 
00281     for (int32_t i=i_start; i<i_end; i++)
00282     {
00283         int32_t j_start=0;
00284 
00285         if (symmetric)
00286             j_start=i;
00287 
00288         for (int32_t j=j_start; j<n; j++)
00289         {
00290             float64_t v=k->distance(i,j);
00291             result[i+j*m]=v;
00292 
00293             if (symmetric && i!=j)
00294                 result[j+i*m]=v;
00295 
00296             if (verbose)
00297             {
00298                 total++;
00299 
00300                 if (symmetric && i!=j)
00301                     total++;
00302 
00303                 if (total%100 == 0)
00304                     SG_OBJ_PROGRESS(k, total, total_start, total_end)
00305 
00306                 if (CSignal::cancel_computations())
00307                     break;
00308             }
00309         }
00310 
00311     }
00312 
00313     return NULL;
00314 }
00315 
00316 template <class T>
00317 SGMatrix<T> CDistance::get_distance_matrix()
00318 {
00319     T* result = NULL;
00320 
00321     REQUIRE(has_features(), "no features assigned to distance\n")
00322 
00323     int32_t m=get_num_vec_lhs();
00324     int32_t n=get_num_vec_rhs();
00325 
00326     int64_t total_num = int64_t(m)*n;
00327 
00328     // if lhs == rhs and sizes match assume k(i,j)=k(j,i)
00329     bool symmetric= (lhs && lhs==rhs && m==n);
00330 
00331     SG_DEBUG("returning distance matrix of size %dx%d\n", m, n)
00332 
00333         result=SG_MALLOC(T, total_num);
00334 
00335     int32_t num_threads=parallel->get_num_threads();
00336     if (num_threads < 2)
00337     {
00338         D_THREAD_PARAM<T> params;
00339         params.distance=this;
00340         params.result=result;
00341         params.start=0;
00342         params.end=m;
00343         params.total_start=0;
00344         params.total_end=total_num;
00345         params.n=n;
00346         params.m=m;
00347         params.symmetric=symmetric;
00348         params.verbose=true;
00349         get_distance_matrix_helper<T>((void*) &params);
00350     }
00351     else
00352     {
00353         pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
00354         D_THREAD_PARAM<T>* params = SG_MALLOC(D_THREAD_PARAM<T>, num_threads);
00355         int64_t step= total_num/num_threads;
00356 
00357         int32_t t;
00358 
00359         num_threads--;
00360         for (t=0; t<num_threads; t++)
00361         {
00362             params[t].distance = this;
00363             params[t].result = result;
00364             params[t].start = compute_row_start(t*step, n, symmetric);
00365             params[t].end = compute_row_start((t+1)*step, n, symmetric);
00366             params[t].total_start=t*step;
00367             params[t].total_end=(t+1)*step;
00368             params[t].n=n;
00369             params[t].m=m;
00370             params[t].symmetric=symmetric;
00371             params[t].verbose=false;
00372 
00373             int code=pthread_create(&threads[t], NULL,
00374                     CDistance::get_distance_matrix_helper<T>, (void*)&params[t]);
00375 
00376             if (code != 0)
00377             {
00378                 SG_WARNING("Thread creation failed (thread %d of %d) "
00379                         "with error:'%s'\n",t, num_threads, strerror(code));
00380                 num_threads=t;
00381                 break;
00382             }
00383         }
00384 
00385         params[t].distance = this;
00386         params[t].result = result;
00387         params[t].start = compute_row_start(t*step, n, symmetric);
00388         params[t].end = m;
00389         params[t].total_start=t*step;
00390         params[t].total_end=total_num;
00391         params[t].n=n;
00392         params[t].m=m;
00393         params[t].symmetric=symmetric;
00394         params[t].verbose=true;
00395         get_distance_matrix_helper<T>(&params[t]);
00396 
00397         for (t=0; t<num_threads; t++)
00398         {
00399             if (pthread_join(threads[t], NULL) != 0)
00400                 SG_WARNING("pthread_join of thread %d/%d failed\n", t, num_threads)
00401         }
00402 
00403         SG_FREE(params);
00404         SG_FREE(threads);
00405     }
00406 
00407     SG_DONE()
00408 
00409     return SGMatrix<T>(result,m,n,true);
00410 }
00411 
00412 template SGMatrix<float64_t> CDistance::get_distance_matrix<float64_t>();
00413 template SGMatrix<float32_t> CDistance::get_distance_matrix<float32_t>();
00414 
00415 template void* CDistance::get_distance_matrix_helper<float64_t>(void* p);
00416 template void* CDistance::get_distance_matrix_helper<float32_t>(void* p);
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation