SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012-2013 Heiko Strathmann 00008 */ 00009 00010 #include <shogun/statistics/QuadraticTimeMMD.h> 00011 #include <shogun/features/Features.h> 00012 #include <shogun/mathematics/Statistics.h> 00013 #include <shogun/kernel/Kernel.h> 00014 #include <shogun/kernel/CombinedKernel.h> 00015 #include <shogun/kernel/CustomKernel.h> 00016 00017 using namespace shogun; 00018 00019 CQuadraticTimeMMD::CQuadraticTimeMMD() : CKernelTwoSampleTestStatistic() 00020 { 00021 init(); 00022 } 00023 00024 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p_and_q, 00025 index_t m) : 00026 CKernelTwoSampleTestStatistic(kernel, p_and_q, m) 00027 { 00028 init(); 00029 00030 if (p_and_q && m!=p_and_q->get_num_vectors()/2) 00031 { 00032 SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors " 00033 "are currently possible\n"); 00034 } 00035 } 00036 00037 CQuadraticTimeMMD::CQuadraticTimeMMD(CKernel* kernel, CFeatures* p, 00038 CFeatures* q) : CKernelTwoSampleTestStatistic(kernel, p, q) 00039 { 00040 init(); 00041 00042 if (p && q && p->get_num_vectors()!=q->get_num_vectors()) 00043 { 00044 SG_ERROR("CQuadraticTimeMMD: Only features with equal number of vectors " 00045 "are currently possible\n"); 00046 } 00047 } 00048 00049 CQuadraticTimeMMD::CQuadraticTimeMMD(CCustomKernel* custom_kernel, index_t m) : 00050 CKernelTwoSampleTestStatistic(custom_kernel, NULL, m) 00051 { 00052 init(); 00053 } 00054 00055 CQuadraticTimeMMD::~CQuadraticTimeMMD() 00056 { 00057 00058 } 00059 00060 void CQuadraticTimeMMD::init() 00061 { 00062 SG_ADD(&m_num_samples_spectrum, "num_samples_spectrum", "Number of samples" 00063 " for spectrum method null-distribution approximation", 00064 MS_NOT_AVAILABLE); 00065 SG_ADD(&m_num_eigenvalues_spectrum, "num_eigenvalues_spectrum", "Number of " 00066 " Eigenvalues for spectrum method null-distribution approximation", 00067 MS_NOT_AVAILABLE); 00068 SG_ADD((machine_int_t*)&m_statistic_type, "statistic_type", 00069 "Biased or unbiased MMD", MS_NOT_AVAILABLE); 00070 00071 m_num_samples_spectrum=0; 00072 m_num_eigenvalues_spectrum=0; 00073 m_statistic_type=UNBIASED; 00074 } 00075 00076 float64_t CQuadraticTimeMMD::compute_unbiased_statistic() 00077 { 00078 /* split computations into three terms from JLMR paper (see documentation )*/ 00079 index_t m=m_m; 00080 00081 /* init kernel with features */ 00082 m_kernel->init(m_p_and_q, m_p_and_q); 00083 00084 /* first term */ 00085 float64_t first=0; 00086 for (index_t i=0; i<m; ++i) 00087 { 00088 /* ensure i!=j while adding up */ 00089 for (index_t j=0; j<m; ++j) 00090 first+=i==j ? 0 : m_kernel->kernel(i,j); 00091 } 00092 first/=(m-1); 00093 00094 /* second term */ 00095 float64_t second=0; 00096 for (index_t i=m_m; i<m_m+m; ++i) 00097 { 00098 /* ensure i!=j while adding up */ 00099 for (index_t j=m_m; j<m_m+m; ++j) 00100 second+=i==j ? 0 : m_kernel->kernel(i,j); 00101 } 00102 second/=(m-1); 00103 00104 /* third term */ 00105 float64_t third=0; 00106 for (index_t i=0; i<m; ++i) 00107 { 00108 for (index_t j=m_m; j<m_m+m; ++j) 00109 third+=m_kernel->kernel(i,j); 00110 } 00111 third*=2.0/m; 00112 00113 return first+second-third; 00114 } 00115 00116 float64_t CQuadraticTimeMMD::compute_biased_statistic() 00117 { 00118 /* split computations into three terms from JLMR paper (see documentation )*/ 00119 index_t m=m_m; 00120 00121 /* init kernel with features */ 00122 m_kernel->init(m_p_and_q, m_p_and_q); 00123 00124 /* first term */ 00125 float64_t first=0; 00126 for (index_t i=0; i<m; ++i) 00127 { 00128 for (index_t j=0; j<m; ++j) 00129 first+=m_kernel->kernel(i,j); 00130 } 00131 first/=m; 00132 00133 /* second term */ 00134 float64_t second=0; 00135 for (index_t i=m_m; i<m_m+m; ++i) 00136 { 00137 for (index_t j=m_m; j<m_m+m; ++j) 00138 second+=m_kernel->kernel(i,j); 00139 } 00140 second/=m; 00141 00142 /* third term */ 00143 float64_t third=0; 00144 for (index_t i=0; i<m; ++i) 00145 { 00146 for (index_t j=m_m; j<m_m+m; ++j) 00147 third+=m_kernel->kernel(i,j); 00148 } 00149 third*=2.0/m; 00150 00151 return first+second-third; 00152 } 00153 00154 float64_t CQuadraticTimeMMD::compute_statistic() 00155 { 00156 if (!m_kernel) 00157 SG_ERROR("%s::compute_statistic(): No kernel specified!\n", get_name()) 00158 00159 float64_t result=0; 00160 switch (m_statistic_type) 00161 { 00162 case UNBIASED: 00163 result=compute_unbiased_statistic(); 00164 break; 00165 case BIASED: 00166 result=compute_biased_statistic(); 00167 break; 00168 default: 00169 SG_ERROR("CQuadraticTimeMMD::compute_statistic(): Unknown statistic " 00170 "type!\n"); 00171 break; 00172 } 00173 00174 return result; 00175 } 00176 00177 float64_t CQuadraticTimeMMD::compute_p_value(float64_t statistic) 00178 { 00179 float64_t result=0; 00180 00181 switch (m_null_approximation_method) 00182 { 00183 case MMD2_SPECTRUM: 00184 { 00185 #ifdef HAVE_LAPACK 00186 /* get samples from null-distribution and compute p-value of statistic */ 00187 SGVector<float64_t> null_samples=sample_null_spectrum( 00188 m_num_samples_spectrum, m_num_eigenvalues_spectrum); 00189 null_samples.qsort(); 00190 index_t pos=null_samples.find_position_to_insert(statistic); 00191 result=1.0-((float64_t)pos)/null_samples.vlen; 00192 #else // HAVE_LAPACK 00193 SG_ERROR("CQuadraticTimeMMD::compute_p_value(): Only possible if " 00194 "shogun is compiled with LAPACK enabled\n"); 00195 #endif // HAVE_LAPACK 00196 break; 00197 } 00198 00199 case MMD2_GAMMA: 00200 { 00201 /* fit gamma and return cdf at statistic */ 00202 SGVector<float64_t> params=fit_null_gamma(); 00203 result=CStatistics::gamma_cdf(statistic, params[0], params[1]); 00204 break; 00205 } 00206 00207 default: 00208 result=CKernelTwoSampleTestStatistic::compute_p_value(statistic); 00209 break; 00210 } 00211 00212 return result; 00213 } 00214 00215 SGVector<float64_t> CQuadraticTimeMMD::compute_statistic( 00216 bool multiple_kernels) 00217 { 00218 SGVector<float64_t> mmds; 00219 if (!multiple_kernels) 00220 { 00221 /* just one mmd result */ 00222 mmds=SGVector<float64_t>(1); 00223 mmds[0]=compute_statistic(); 00224 } 00225 else 00226 { 00227 REQUIRE(m_kernel->get_kernel_type()==K_COMBINED, 00228 "%s::compute_statistic: multiple kernels specified," 00229 "but underlying kernel is not of type K_COMBINED\n", get_name()); 00230 00231 /* cast and allocate memory for results */ 00232 CCombinedKernel* combined=(CCombinedKernel*)m_kernel; 00233 SG_REF(combined); 00234 mmds=SGVector<float64_t>(combined->get_num_subkernels()); 00235 00236 /* iterate through all kernels and compute statistic */ 00237 /* TODO this might be done in parallel */ 00238 for (index_t i=0; i<mmds.vlen; ++i) 00239 { 00240 CKernel* current=combined->get_kernel(i); 00241 /* temporarily replace underlying kernel and compute statistic */ 00242 m_kernel=current; 00243 mmds[i]=compute_statistic(); 00244 00245 SG_UNREF(current); 00246 } 00247 00248 /* restore combined kernel */ 00249 m_kernel=combined; 00250 SG_UNREF(combined); 00251 } 00252 00253 return mmds; 00254 } 00255 00256 float64_t CQuadraticTimeMMD::compute_threshold(float64_t alpha) 00257 { 00258 float64_t result=0; 00259 00260 switch (m_null_approximation_method) 00261 { 00262 case MMD2_SPECTRUM: 00263 { 00264 #ifdef HAVE_LAPACK 00265 /* get samples from null-distribution and compute threshold */ 00266 SGVector<float64_t> null_samples=sample_null_spectrum( 00267 m_num_samples_spectrum, m_num_eigenvalues_spectrum); 00268 null_samples.qsort(); 00269 result=null_samples[index_t(CMath::floor(null_samples.vlen*(1-alpha)))]; 00270 #else // HAVE_LAPACK 00271 SG_ERROR("CQuadraticTimeMMD::compute_threshold(): Only possible if " 00272 "shogun is compiled with LAPACK enabled\n"); 00273 #endif // HAVE_LAPACK 00274 break; 00275 } 00276 00277 case MMD2_GAMMA: 00278 { 00279 /* fit gamma and return inverse cdf at alpha */ 00280 SGVector<float64_t> params=fit_null_gamma(); 00281 result=CStatistics::inverse_gamma_cdf(alpha, params[0], params[1]); 00282 break; 00283 } 00284 00285 default: 00286 /* bootstrapping is handled here */ 00287 result=CKernelTwoSampleTestStatistic::compute_threshold(alpha); 00288 break; 00289 } 00290 00291 return result; 00292 } 00293 00294 00295 #ifdef HAVE_LAPACK 00296 SGVector<float64_t> CQuadraticTimeMMD::sample_null_spectrum(index_t num_samples, 00297 index_t num_eigenvalues) 00298 { 00299 REQUIRE(m_kernel, "%s::sample_null_spectrum(%d, %d): No kernel set!\n", 00300 get_name(), num_samples, num_eigenvalues); 00301 REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q, 00302 "%s::sample_null_spectrum(%d, %d): No features set and no custom " 00303 "kernel in use!\n", get_name(), num_samples, num_eigenvalues); 00304 00305 index_t num_data; 00306 if (m_kernel->get_kernel_type()==K_CUSTOM) 00307 num_data=m_kernel->get_num_vec_rhs(); 00308 else 00309 num_data=m_p_and_q->get_num_vectors(); 00310 00311 if (m_m!=num_data/2) 00312 { 00313 SG_ERROR("%s::sample_null_spectrum(): Currently, only equal " 00314 "sample sizes are supported\n", get_name()); 00315 } 00316 00317 if (num_samples<=2) 00318 { 00319 SG_ERROR("%s::sample_null_spectrum(): Number of samples has to be at" 00320 " least 2, better in the hundreds", get_name()); 00321 } 00322 00323 if (num_eigenvalues>2*m_m-1) 00324 { 00325 SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too large\n", 00326 get_name()); 00327 } 00328 00329 if (num_eigenvalues<1) 00330 { 00331 SG_ERROR("%s::sample_null_spectrum(): Number of Eigenvalues too small\n", 00332 get_name()); 00333 } 00334 00335 /* evtl. warn user not to use wrong statistic type */ 00336 if (m_statistic_type!=BIASED) 00337 { 00338 SG_WARNING("%s::sample_null_spectrum(): Note: provided statistic has " 00339 "to be BIASED. Please ensure that! To get rid of warning," 00340 "call %s::set_statistic_type(BIASED)\n", get_name(), 00341 get_name()); 00342 } 00343 00344 /* imaginary matrix K=[K KL; KL' L] (MATLAB notation) 00345 * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX 00346 * works since X and Y are concatenated here */ 00347 m_kernel->init(m_p_and_q, m_p_and_q); 00348 SGMatrix<float64_t> K=m_kernel->get_kernel_matrix(); 00349 00350 /* center matrix K=H*K*H */ 00351 K.center(); 00352 00353 /* compute eigenvalues and select num_eigenvalues largest ones */ 00354 SGVector<float64_t> eigenvalues= 00355 SGMatrix<float64_t>::compute_eigenvectors(K); 00356 SGVector<float64_t> largest_ev(num_eigenvalues); 00357 00358 /* take largest EV, scale by 1/2/m on the fly and take abs value*/ 00359 for (index_t i=0; i<num_eigenvalues; ++i) 00360 largest_ev[i]=CMath::abs( 00361 1.0/2/m_m*eigenvalues[eigenvalues.vlen-1-i]); 00362 00363 /* finally, sample from null distribution */ 00364 SGVector<float64_t> null_samples(num_samples); 00365 for (index_t i=0; i<num_samples; ++i) 00366 { 00367 /* 2*sum(kEigs.*(randn(length(kEigs),1)).^2); */ 00368 null_samples[i]=0; 00369 for (index_t j=0; j<largest_ev.vlen; ++j) 00370 null_samples[i]+=largest_ev[j]*CMath::pow(CMath::randn_double(), 2); 00371 00372 null_samples[i]*=2; 00373 } 00374 00375 return null_samples; 00376 } 00377 #endif // HAVE_LAPACK 00378 00379 SGVector<float64_t> CQuadraticTimeMMD::fit_null_gamma() 00380 { 00381 REQUIRE(m_kernel, "%s::fit_null_gamma(): No kernel set!\n", get_name()); 00382 REQUIRE(m_kernel->get_kernel_type()==K_CUSTOM || m_p_and_q, 00383 "%s::fit_null_gamma(): No features set and no custom kernel in" 00384 " use!\n", get_name()); 00385 00386 index_t num_data; 00387 if (m_kernel->get_kernel_type()==K_CUSTOM) 00388 num_data=m_kernel->get_num_vec_rhs(); 00389 else 00390 num_data=m_p_and_q->get_num_vectors(); 00391 00392 if (m_m!=num_data/2) 00393 { 00394 SG_ERROR("%s::compute_p_value_gamma(): Currently, only equal " 00395 "sample sizes are supported\n", get_name()); 00396 } 00397 00398 /* evtl. warn user not to use wrong statistic type */ 00399 if (m_statistic_type!=BIASED) 00400 { 00401 SG_WARNING("%s::compute_p_value(): Note: provided statistic has " 00402 "to be BIASED. Please ensure that! To get rid of warning," 00403 "call %s::set_statistic_type(BIASED)\n", get_name(), 00404 get_name()); 00405 } 00406 00407 /* imaginary matrix K=[K KL; KL' L] (MATLAB notation) 00408 * K is matrix for XX, L is matrix for YY, KL is XY, LK is YX 00409 * works since X and Y are concatenated here */ 00410 m_kernel->init(m_p_and_q, m_p_and_q); 00411 00412 /* compute mean under H0 of MMD, which is 00413 * meanMMD = 2/m * ( 1 - 1/m*sum(diag(KL)) ); 00414 * in MATLAB. 00415 * Remove diagonals on the fly */ 00416 float64_t mean_mmd=0; 00417 for (index_t i=0; i<m_m; ++i) 00418 { 00419 /* virtual KL matrix is in upper right corner of SHOGUN K matrix 00420 * so this sums the diagonal of the matrix between X and Y*/ 00421 mean_mmd+=m_kernel->kernel(i, m_m+i); 00422 } 00423 mean_mmd=2.0/m_m*(1.0-1.0/m_m*mean_mmd); 00424 00425 /* compute variance under H0 of MMD, which is 00426 * varMMD = 2/m/(m-1) * 1/m/(m-1) * sum(sum( (K + L - KL - KL').^2 )); 00427 * in MATLAB, so sum up all elements */ 00428 float64_t var_mmd=0; 00429 for (index_t i=0; i<m_m; ++i) 00430 { 00431 for (index_t j=0; j<m_m; ++j) 00432 { 00433 /* dont add diagonal of all pairs of imaginary kernel matrices */ 00434 if (i==j || m_m+i==j || m_m+j==i) 00435 continue; 00436 00437 float64_t to_add=m_kernel->kernel(i, j); 00438 to_add+=m_kernel->kernel(m_m+i, m_m+j); 00439 to_add-=m_kernel->kernel(i, m_m+j); 00440 to_add-=m_kernel->kernel(m_m+i, j); 00441 var_mmd+=CMath::pow(to_add, 2); 00442 } 00443 } 00444 var_mmd*=2.0/m_m/(m_m-1)*1.0/m_m/(m_m-1); 00445 00446 /* parameters for gamma distribution */ 00447 float64_t a=CMath::pow(mean_mmd, 2)/var_mmd; 00448 float64_t b=var_mmd*m_m / mean_mmd; 00449 00450 SGVector<float64_t> result(2); 00451 result[0]=a; 00452 result[1]=b; 00453 00454 return result; 00455 } 00456 00457 void CQuadraticTimeMMD::set_num_samples_sepctrum(index_t 00458 num_samples_spectrum) 00459 { 00460 m_num_samples_spectrum=num_samples_spectrum; 00461 } 00462 00463 void CQuadraticTimeMMD::set_num_eigenvalues_spectrum( 00464 index_t num_eigenvalues_spectrum) 00465 { 00466 m_num_eigenvalues_spectrum=num_eigenvalues_spectrum; 00467 } 00468 00469 void CQuadraticTimeMMD::set_statistic_type(EQuadraticMMDType 00470 statistic_type) 00471 { 00472 m_statistic_type=statistic_type; 00473 }