SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
QuadraticTimeMMD.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012-2013 Heiko Strathmann
00008  */
00009 
00010 #include <shogun/statistics/QuadraticTimeMMD.h>
00011 #include <shogun/features/Features.h>
00012 #include <shogun/mathematics/Statistics.h>
00013 #include <shogun/kernel/Kernel.h>
00014 #include <shogun/kernel/CombinedKernel.h>
00015 #include <shogun/kernel/CustomKernel.h>
00016 
00017 using namespace shogun;
00018 
00019 CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTestStatistic()
00020 {
00021     init();
00022 }
00023 
00024 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q,
00025         index_t m) :
00026         CKernelTwoSampleTestStatistic(kernel, p_and_q, m)
00027 {
00028     init();
00029 
00030     if (p_and_q && m!=p_and_q->get_num_vectors()/2)
00031     {
00032         SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
00033                 "are currently possible\n");
00034     }
00035 }
00036 
00037 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p,
00038         CFeatures* q) : CKernelTwoSampleTestStatistic(kernel, p, q)
00039 {
00040     init();
00041 
00042     if (p && q && p->get_num_vectors()!=q->get_num_vectors())
00043     {
00044         SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors "
00045                 "are currently possible\n");
00046     }
00047 }
00048 
00049 CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) :
00050         CKernelTwoSampleTestStatistic(custom_kernel, NULL, m)
00051 {
00052     init();
00053 }
00054 
00055 CQuadraticTimeMMD::~CQuadraticTimeMMD()
00056 {
00057 
00058 }
00059 
00060 void CQuadraticTimeMMD::init()
00061 {
00062     SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples"
00063             " for spectrum method null-distribution approximation",
00064             MS_NOT_AVAILABLE);
00065     SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of "
00066             " Eigenvalues for spectrum method null-distribution approximation",
00067             MS_NOT_AVAILABLE);
00068     SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type",
00069             "Biased or unbiased MMD", MS_NOT_AVAILABLE);
00070 
00071     m_num_samples_spectrum=0;
00072     m_num_eigenvalues_spectrum=0;
00073     m_statistic_type=UNBIASED;
00074 }
00075 
00076 float64_t CQuadraticTimeMMD::compute_unbiased_statistic()
00077 {
00078     /* split computations into three terms from JLMR paper (see documentation )*/
00079     index_t m=m_m;
00080 
00081     /* init kernel with features */
00082     m_kernel->init(m_p_and_q, m_p_and_q);
00083 
00084     /* first term */
00085     float64_t first=0;
00086     for (index_t i=0; i<m; ++i)
00087     {
00088         /* ensure i!=j while adding up */
00089         for (index_t j=0; j<m; ++j)
00090             first+=i==j ? 0 : m_kernel->kernel(i,j);
00091     }
00092     first/=(m-1);
00093 
00094     /* second term */
00095     float64_t second=0;
00096     for (index_t i=m_m; i<m_m+m; ++i)
00097     {
00098         /* ensure i!=j while adding up */
00099         for (index_t j=m_m; j<m_m+m; ++j)
00100             second+=i==j ? 0 : m_kernel->kernel(i,j);
00101     }
00102     second/=(m-1);
00103 
00104     /* third term */
00105     float64_t third=0;
00106     for (index_t i=0; i<m; ++i)
00107     {
00108         for (index_t j=m_m; j<m_m+m; ++j)
00109             third+=m_kernel->kernel(i,j);
00110     }
00111     third*=2.0/m;
00112 
00113     return first+second-third;
00114 }
00115 
00116 float64_t CQuadraticTimeMMD::compute_biased_statistic()
00117 {
00118     /* split computations into three terms from JLMR paper (see documentation )*/
00119     index_t m=m_m;
00120 
00121     /* init kernel with features */
00122     m_kernel->init(m_p_and_q, m_p_and_q);
00123 
00124     /* first term */
00125     float64_t first=0;
00126     for (index_t i=0; i<m; ++i)
00127     {
00128         for (index_t j=0; j<m; ++j)
00129             first+=m_kernel->kernel(i,j);
00130     }
00131     first/=m;
00132 
00133     /* second term */
00134     float64_t second=0;
00135     for (index_t i=m_m; i<m_m+m; ++i)
00136     {
00137         for (index_t j=m_m; j<m_m+m; ++j)
00138             second+=m_kernel->kernel(i,j);
00139     }
00140     second/=m;
00141 
00142     /* third term */
00143     float64_t third=0;
00144     for (index_t i=0; i<m; ++i)
00145     {
00146         for (index_t j=m_m; j<m_m+m; ++j)
00147             third+=m_kernel->kernel(i,j);
00148     }
00149     third*=2.0/m;
00150 
00151     return first+second-third;
00152 }
00153 
00154 float64_t CQuadraticTimeMMD::compute_statistic()
00155 {
00156     if (!m_kernel)
00157         SG_ERROR("%s::compute_statistic(): No kernel specified!\n", get_name())
00158 
00159     float64_t result=0;
00160     switch (m_statistic_type)
00161     {
00162     case UNBIASED:
00163         result=compute_unbiased_statistic();
00164         break;
00165     case BIASED:
00166         result=compute_biased_statistic();
00167         break;
00168     default:
00169         SG_ERROR("CQuadraticTimeMMD::compute_statistic(): Unknown statistic "
00170                 "type!\n");
00171         break;
00172     }
00173 
00174     return result;
00175 }
00176 
00177 float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic)
00178 {
00179     float64_t result=0;
00180 
00181     switch (m_null_approximation_method)
00182     {
00183     case MMD2_SPECTRUM:
00184     {
00185 #ifdef HAVE_LAPACK
00186         /* get samples from null-distribution and compute p-value of statistic */
00187         SGVector<float64_t> null_samples=sample_null_spectrum(
00188                 m_num_samples_spectrum, m_num_eigenvalues_spectrum);
00189         null_samples.qsort();
00190         index_t pos=null_samples.find_position_to_insert(statistic);
00191         result=1.0-((float64_t)pos)/null_samples.vlen;
00192 #else // HAVE_LAPACK
00193         SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if "
00194                 "shogun is compiled with LAPACK enabled\n");
00195 #endif // HAVE_LAPACK
00196         break;
00197     }
00198 
00199     case MMD2_GAMMA:
00200     {
00201         /* fit gamma and return cdf at statistic */
00202         SGVector<float64_t> params=fit_null_gamma();
00203         result=CStatistics::gamma_cdf(statistic, params[0], params[1]);
00204         break;
00205     }
00206 
00207     default:
00208         result=CKernelTwoSampleTestStatistic::compute_p_value(statistic);
00209         break;
00210     }
00211 
00212     return result;
00213 }
00214 
00215 SGVector<float64_t> CQuadraticTimeMMD::compute_statistic(
00216         bool multiple_kernels)
00217 {
00218     SGVector<float64_t> mmds;
00219     if (!multiple_kernels)
00220     {
00221         /* just one mmd result */
00222         mmds=SGVector<float64_t>(1);
00223         mmds[0]=compute_statistic();
00224     }
00225     else
00226     {
00227         REQUIRE(m_kernel->get_kernel_type()==K_COMBINED,
00228             "%s::compute_statistic: multiple kernels specified,"
00229             "but underlying kernel is not of type K_COMBINED\n", get_name());
00230 
00231         /* cast and allocate memory for results */
00232         CCombinedKernel* combined=(CCombinedKernel*)m_kernel;
00233         SG_REF(combined);
00234         mmds=SGVector<float64_t>(combined->get_num_subkernels());
00235 
00236         /* iterate through all kernels and compute statistic */
00237         /* TODO this might be done in parallel */
00238         for (index_t i=0; i<mmds.vlen; ++i)
00239         {
00240             CKernel* current=combined->get_kernel(i);
00241             /* temporarily replace underlying kernel and compute statistic */
00242             m_kernel=current;
00243             mmds[i]=compute_statistic();
00244 
00245             SG_UNREF(current);
00246         }
00247 
00248         /* restore combined kernel */
00249         m_kernel=combined;
00250         SG_UNREF(combined);
00251     }
00252 
00253     return mmds;
00254 }
00255 
00256 float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha)
00257 {
00258     float64_t result=0;
00259 
00260     switch (m_null_approximation_method)
00261     {
00262     case MMD2_SPECTRUM:
00263     {
00264 #ifdef HAVE_LAPACK
00265         /* get samples from null-distribution and compute threshold */
00266         SGVector<float64_t> null_samples=sample_null_spectrum(
00267                 m_num_samples_spectrum, m_num_eigenvalues_spectrum);
00268         null_samples.qsort();
00269         result=null_samples[index_t(CMath::floor(null_samples.vlen*(1-alpha)))];
00270 #else // HAVE_LAPACK
00271         SG_ERROR("CQuadraticTimeMMD::compute_threshold(): Only possible if "
00272                 "shogun is compiled with LAPACK enabled\n");
00273 #endif // HAVE_LAPACK
00274         break;
00275     }
00276 
00277     case MMD2_GAMMA:
00278     {
00279         /* fit gamma and return inverse cdf at alpha */
00280         SGVector<float64_t> params=fit_null_gamma();
00281         result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]);
00282         break;
00283     }
00284 
00285     default:
00286         /* bootstrapping is handled here */
00287         result=CKernelTwoSampleTestStatistic::compute_threshold(alpha);
00288         break;
00289     }
00290 
00291     return result;
00292 }
00293 
00294 
00295 #ifdef HAVE_LAPACK
00296 SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples,
00297         index_t num_eigenvalues)
00298 {
00299     REQUIRE(m_kernel, "%s::sample_null_spectrum(%d, %d): No kernel set!\n",
00300             get_name(), num_samples, num_eigenvalues);
00301     REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q,
00302             "%s::sample_null_spectrum(%d, %d): No features set and no custom "
00303             "kernel in use!\n", get_name(), num_samples, num_eigenvalues);
00304 
00305     index_t num_data;
00306     if (m_kernel->get_kernel_type()==K_CUSTOM)
00307         num_data=m_kernel->get_num_vec_rhs();
00308     else
00309         num_data=m_p_and_q->get_num_vectors();
00310 
00311     if (m_m!=num_data/2)
00312     {
00313         SG_ERROR("%s::sample_null_spectrum(): Currently, only equal "
00314                 "sample sizes are supported\n", get_name());
00315     }
00316 
00317     if (num_samples<=2)
00318     {
00319         SG_ERROR("%s::sample_null_spectrum(): Number of samples has to be at"
00320                 " least 2, better in the hundreds", get_name());
00321     }
00322 
00323     if (num_eigenvalues>2*m_m-1)
00324     {
00325         SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n",
00326                 get_name());
00327     }
00328 
00329     if (num_eigenvalues<1)
00330     {
00331         SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too small\n",
00332                 get_name());
00333     }
00334 
00335     /* evtl. warn user not to use wrong statistic type */
00336     if (m_statistic_type!=BIASED)
00337     {
00338         SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has "
00339                 "to be BIASED. Please ensure that! To get rid of warning,"
00340                 "call %s::set_statistic_type(BIASED)\n", get_name(),
00341                 get_name());
00342     }
00343 
00344     /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
00345      * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
00346      * works since X and Y are concatenated here */
00347     m_kernel->init(m_p_and_q, m_p_and_q);
00348     SGMatrix<float64_t> K=m_kernel->get_kernel_matrix();
00349 
00350     /* center matrix K=H*K*H */
00351     K.center();
00352 
00353     /* compute eigenvalues and select num_eigenvalues largest ones */
00354     SGVector<float64_t> eigenvalues=
00355             SGMatrix<float64_t>::compute_eigenvectors(K);
00356     SGVector<float64_t> largest_ev(num_eigenvalues);
00357 
00358     /* take largest EV, scale by 1/2/m on the fly and take abs value*/
00359     for (index_t i=0; i<num_eigenvalues; ++i)
00360         largest_ev[i]=CMath::abs(
00361                 1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]);
00362 
00363     /* finally, sample from null distribution */
00364     SGVector<float64_t> null_samples(num_samples);
00365     for (index_t i=0; i<num_samples; ++i)
00366     {
00367         /* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */
00368         null_samples[i]=0;
00369         for (index_t j=0; j<largest_ev.vlen; ++j)
00370             null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2);
00371 
00372         null_samples[i]*=2;
00373     }
00374 
00375     return null_samples;
00376 }
00377 #endif // HAVE_LAPACK
00378 
00379 SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma()
00380 {
00381     REQUIRE(m_kernel, "%s::fit_null_gamma(): No kernel set!\n", get_name());
00382     REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q,
00383             "%s::fit_null_gamma(): No features set and no custom kernel in"
00384             " use!\n", get_name());
00385 
00386     index_t num_data;
00387     if (m_kernel->get_kernel_type()==K_CUSTOM)
00388         num_data=m_kernel->get_num_vec_rhs();
00389     else
00390         num_data=m_p_and_q->get_num_vectors();
00391 
00392     if (m_m!=num_data/2)
00393     {
00394         SG_ERROR("%s::compute_p_value_gamma(): Currently, only equal "
00395                 "sample sizes are supported\n", get_name());
00396     }
00397 
00398     /* evtl. warn user not to use wrong statistic type */
00399     if (m_statistic_type!=BIASED)
00400     {
00401         SG_WARNING("%s::compute_p_value(): Note: provided statistic has "
00402                 "to be BIASED. Please ensure that! To get rid of warning,"
00403                 "call %s::set_statistic_type(BIASED)\n", get_name(),
00404                 get_name());
00405     }
00406 
00407     /* imaginary matrix K=[K KL; KL' L] (MATLAB notation)
00408      * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX
00409      * works since X and Y are concatenated here */
00410     m_kernel->init(m_p_and_q, m_p_and_q);
00411 
00412     /* compute mean under H0 of MMD, which is
00413      * meanMMD  = 2/m * ( 1  - 1/m*sum(diag(KL))  );
00414      * in MATLAB.
00415      * Remove diagonals on the fly */
00416     float64_t mean_mmd=0;
00417     for (index_t i=0; i<m_m; ++i)
00418     {
00419         /* virtual KL matrix is in upper right corner of SHOGUN K matrix
00420          * so this sums the diagonal of the matrix between X and Y*/
00421         mean_mmd+=m_kernel->kernel(i, m_m+i);
00422     }
00423     mean_mmd=2.0/m_m*(1.0-1.0/m_m*mean_mmd);
00424 
00425     /* compute variance under H0 of MMD, which is
00426      * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 ));
00427      * in MATLAB, so sum up all elements */
00428     float64_t var_mmd=0;
00429     for (index_t i=0; i<m_m; ++i)
00430     {
00431         for (index_t j=0; j<m_m; ++j)
00432         {
00433             /* dont add diagonal of all pairs of imaginary kernel matrices */
00434             if (i==j || m_m+i==j || m_m+j==i)
00435                 continue;
00436 
00437             float64_t to_add=m_kernel->kernel(i, j);
00438             to_add+=m_kernel->kernel(m_m+i, m_m+j);
00439             to_add-=m_kernel->kernel(i, m_m+j);
00440             to_add-=m_kernel->kernel(m_m+i, j);
00441             var_mmd+=CMath::pow(to_add, 2);
00442         }
00443     }
00444     var_mmd*=2.0/m_m/(m_m-1)*1.0/m_m/(m_m-1);
00445 
00446     /* parameters for gamma distribution */
00447     float64_t a=CMath::pow(mean_mmd, 2)/var_mmd;
00448     float64_t b=var_mmd*m_m / mean_mmd;
00449 
00450     SGVector<float64_t> result(2);
00451     result[0]=a;
00452     result[1]=b;
00453 
00454     return result;
00455 }
00456 
00457 void CQuadraticTimeMMD::set_num_samples_sepctrum(index_t
00458         num_samples_spectrum)
00459 {
00460     m_num_samples_spectrum=num_samples_spectrum;
00461 }
00462 
00463 void CQuadraticTimeMMD::set_num_eigenvalues_spectrum(
00464         index_t num_eigenvalues_spectrum)
00465 {
00466     m_num_eigenvalues_spectrum=num_eigenvalues_spectrum;
00467 }
00468 
00469 void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType
00470         statistic_type)
00471 {
00472     m_statistic_type=statistic_type;
00473 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation