SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012-2013 Heiko Strathmann 00008 */ 00009 00010 #include <shogun/statistics/LinearTimeMMD.h> 00011 #include <shogun/features/Features.h> 00012 #include <shogun/features/streaming/StreamingFeatures.h> 00013 #include <shogun/mathematics/Statistics.h> 00014 #include <shogun/features/CombinedFeatures.h> 00015 #include <shogun/kernel/CombinedKernel.h> 00016 #include <shogun/lib/List.h> 00017 00018 #include <shogun/lib/external/libqp.h> 00019 00020 using namespace shogun; 00021 00022 CLinearTimeMMD::CLinearTimeMMD() : 00023 CKernelTwoSampleTestStatistic() 00024 { 00025 init(); 00026 } 00027 00028 CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CStreamingFeatures* p, 00029 CStreamingFeatures* q, index_t m, index_t blocksize) : 00030 CKernelTwoSampleTestStatistic(kernel, NULL, m) 00031 { 00032 init(); 00033 00034 m_streaming_p=p; 00035 SG_REF(m_streaming_p); 00036 00037 m_streaming_q=q; 00038 SG_REF(m_streaming_q); 00039 00040 m_blocksize=blocksize; 00041 } 00042 00043 CLinearTimeMMD::~CLinearTimeMMD() 00044 { 00045 SG_UNREF(m_streaming_p); 00046 SG_UNREF(m_streaming_q); 00047 00048 /* m_kernel is SG_UNREFed in base desctructor */ 00049 } 00050 00051 void CLinearTimeMMD::init() 00052 { 00053 SG_ADD((CSGObject**)&m_streaming_p, "streaming_p", "Streaming features p", 00054 MS_NOT_AVAILABLE); 00055 SG_ADD((CSGObject**)&m_streaming_q, "streaming_q", "Streaming features p", 00056 MS_NOT_AVAILABLE); 00057 SG_ADD(&m_blocksize, "blocksize", "Number of elements processed at once", 00058 MS_NOT_AVAILABLE); 00059 SG_ADD(&m_simulate_h0, "simulate_h0", "Whether p and q are mixed", 00060 MS_NOT_AVAILABLE); 00061 00062 m_streaming_p=NULL; 00063 m_streaming_q=NULL; 00064 m_blocksize=10000; 00065 m_simulate_h0=false; 00066 } 00067 00068 void CLinearTimeMMD::compute_statistic_and_variance( 00069 SGVector<float64_t>& statistic, SGVector<float64_t>& variance, 00070 bool multiple_kernels) 00071 { 00072 SG_DEBUG("entering %s::compute_statistic_and_variance()\n", get_name()) 00073 00074 REQUIRE(m_streaming_p, "%s::compute_statistic_and_variance: streaming " 00075 "features p required!\n", get_name()); 00076 REQUIRE(m_streaming_q, "%s::compute_statistic_and_variance: streaming " 00077 "features q required!\n", get_name()); 00078 00079 REQUIRE(m_kernel, "%s::compute_statistic_and_variance: kernel needed!\n", 00080 get_name()); 00081 00082 /* make sure multiple_kernels flag is used only with a combined kernel */ 00083 REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED, 00084 "%s::compute_statistic_and_variance: multiple kernels specified," 00085 "but underlying kernel is not of type K_COMBINED\n", get_name()); 00086 00087 /* m is number of samples from each distribution, m_2 is half of it 00088 * using names from JLMR paper (see class documentation) */ 00089 index_t m_2=m_m/2; 00090 00091 SG_DEBUG("m_m=%d\n", m_m) 00092 00093 /* find out whether single or multiple kernels (cast is safe, check above) */ 00094 index_t num_kernels=1; 00095 if (multiple_kernels) 00096 { 00097 num_kernels=((CCombinedKernel*)m_kernel)->get_num_subkernels(); 00098 SG_DEBUG("computing MMD and variance for %d sub-kernels\n", 00099 num_kernels); 00100 } 00101 00102 /* allocate memory for results if vectors are empty */ 00103 if (!statistic.vector) 00104 statistic=SGVector<float64_t>(num_kernels); 00105 00106 if (!variance.vector) 00107 variance=SGVector<float64_t>(num_kernels); 00108 00109 /* ensure right dimensions */ 00110 REQUIRE(statistic.vlen==num_kernels, "%s::compute_statistic_and_variance: " 00111 "statistic vector size (%d) does not match number of kernels (%d)\n", 00112 get_name(), statistic.vlen, num_kernels); 00113 00114 REQUIRE(variance.vlen==num_kernels, "%s::compute_statistic_and_variance: " 00115 "variance vector size (%d) does not match number of kernels (%d)\n", 00116 get_name(), variance.vlen, num_kernels); 00117 00118 /* temp variable in the algorithm */ 00119 float64_t current; 00120 float64_t delta; 00121 00122 /* initialise statistic and variance since they are cumulative */ 00123 statistic.zero(); 00124 variance.zero(); 00125 00126 /* needed for online mean and variance */ 00127 SGVector<index_t> term_counters(num_kernels); 00128 term_counters.set_const(1); 00129 00130 /* term counter to compute online mean and variance */ 00131 index_t num_examples_processed=0; 00132 while (num_examples_processed<m_2) 00133 { 00134 /* number of example to look at in this iteration */ 00135 index_t num_this_run=CMath::min(m_blocksize, 00136 CMath::max(0, m_2-num_examples_processed)); 00137 SG_DEBUG("processing %d more examples. %d so far processed. Blocksize " 00138 "is %d\n", num_this_run, num_examples_processed, m_blocksize); 00139 00140 /* stream data from both distributions */ 00141 CFeatures* p1=m_streaming_p->get_streamed_features(num_this_run); 00142 CFeatures* p2=m_streaming_p->get_streamed_features(num_this_run); 00143 CFeatures* q1=m_streaming_q->get_streamed_features(num_this_run); 00144 CFeatures* q2=m_streaming_q->get_streamed_features(num_this_run); 00145 00146 /* check whether h0 should be simulated and permute if so */ 00147 if (m_simulate_h0) 00148 { 00149 /* create merged copy of all feature instances to permute */ 00150 CList* list=new CList(); 00151 list->append_element(p2); 00152 list->append_element(q1); 00153 list->append_element(q2); 00154 CFeatures* merged=p1->create_merged_copy(list); 00155 SG_UNREF(list); 00156 00157 /* permute */ 00158 SGVector<index_t> inds(merged->get_num_vectors()); 00159 inds.range_fill(); 00160 inds.permute(); 00161 merged->add_subset(inds); 00162 00163 /* copy back, replacing old features */ 00164 SG_UNREF(p1); 00165 SG_UNREF(p2); 00166 SG_UNREF(q1); 00167 SG_UNREF(q2); 00168 00169 SGVector<index_t> copy(num_this_run); 00170 copy.range_fill(); 00171 p1=merged->copy_subset(copy); 00172 copy.add(num_this_run); 00173 p2=merged->copy_subset(copy); 00174 copy.add(num_this_run); 00175 q1=merged->copy_subset(copy); 00176 copy.add(num_this_run); 00177 q2=merged->copy_subset(copy); 00178 00179 /* clean up and note that copy_subset does a SG_REF */ 00180 SG_UNREF(merged); 00181 } 00182 else 00183 { 00184 /* reference produced features (only if copy_subset was not used) */ 00185 SG_REF(p1); 00186 SG_REF(p2); 00187 SG_REF(q1); 00188 SG_REF(q2); 00189 } 00190 00191 /* if multiple kernels are used, compute all of them on streamed data, 00192 * if multiple kernels flag is false, the above loop will be executed 00193 * only once */ 00194 CKernel* kernel=m_kernel; 00195 if (multiple_kernels) 00196 { 00197 SG_DEBUG("using multiple kernels\n"); 00198 } 00199 00200 /* iterate through all kernels for this data */ 00201 for (index_t i=0; i<num_kernels; ++i) 00202 { 00203 /* if multiple kernels should be computed, set next kernel */ 00204 if (multiple_kernels) 00205 { 00206 kernel=((CCombinedKernel*)m_kernel)->get_kernel(i); 00207 } 00208 00209 /* compute kernel matrix diagonals */ 00210 kernel->init(p1, p2); 00211 SGVector<float64_t> pp=kernel->get_kernel_diagonal(); 00212 00213 kernel->init(q1, q2); 00214 SGVector<float64_t> qq=kernel->get_kernel_diagonal(); 00215 00216 kernel->init(p1, q2); 00217 SGVector<float64_t> pq=kernel->get_kernel_diagonal(); 00218 00219 kernel->init(q1, p2); 00220 SGVector<float64_t> qp=kernel->get_kernel_diagonal(); 00221 00222 /* single variances for all kernels. Update mean and variance 00223 * using Knuth's online variance algorithm. 00224 * C.f. for example Wikipedia */ 00225 for (index_t j=0; j<num_this_run; ++j) 00226 { 00227 /* compute sum of current h terms for current kernel */ 00228 current=pp[j]+qq[j]-pq[j]-qp[j]; 00229 00230 /* D. Knuth's online variance algorithm for current kernel */ 00231 delta=current-statistic[i]; 00232 statistic[i]+=delta/term_counters[i]++; 00233 variance[i]+=delta*(current-statistic[i]); 00234 00235 SG_DEBUG("burst: current=%f, delta=%f, statistic=%f, " 00236 "variance=%f, kernel_idx=%d\n", current, delta, 00237 statistic[i], variance[i], i); 00238 } 00239 00240 if (multiple_kernels) 00241 { 00242 SG_UNREF(kernel); 00243 } 00244 } 00245 00246 /* clean up streamed data */ 00247 SG_UNREF(p1); 00248 SG_UNREF(p2); 00249 SG_UNREF(q1); 00250 SG_UNREF(q2); 00251 00252 /* add number of processed examples for this run */ 00253 num_examples_processed+=num_this_run; 00254 } 00255 SG_DEBUG("Done compouting statistic, processed 2*%d examples.\n", 00256 num_examples_processed); 00257 00258 /* mean of sum all traces is linear time mmd, copy entries for all kernels */ 00259 if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG) 00260 statistic.display_vector("statistics"); 00261 00262 /* variance of terms can be computed using mean (statistic). 00263 * Note that the variance needs to be divided by m_2 in order to get 00264 * variance of null-distribution */ 00265 for (index_t i=0; i<num_kernels; ++i) 00266 variance[i]=variance[i]/(m_2-1)/m_2; 00267 00268 if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG) 00269 variance.display_vector("variances"); 00270 00271 SG_DEBUG("leaving %s::compute_statistic_and_variance()\n", get_name()) 00272 } 00273 00274 void CLinearTimeMMD::compute_statistic_and_Q( 00275 SGVector<float64_t>& statistic, SGMatrix<float64_t>& Q) 00276 { 00277 SG_DEBUG("entering %s::compute_statistic_and_Q()\n", get_name()) 00278 00279 REQUIRE(m_streaming_p, "%s::compute_statistic_and_Q: streaming " 00280 "features p required!\n", get_name()); 00281 REQUIRE(m_streaming_q, "%s::compute_statistic_and_Q: streaming " 00282 "features q required!\n", get_name()); 00283 00284 REQUIRE(m_kernel, "%s::compute_statistic_and_Q: kernel needed!\n", 00285 get_name()); 00286 00287 /* make sure multiple_kernels flag is used only with a combined kernel */ 00288 REQUIRE(m_kernel->get_kernel_type()==K_COMBINED, 00289 "%s::compute_statistic_and_Q: underlying kernel is not of " 00290 "type K_COMBINED\n", get_name()); 00291 00292 /* cast combined kernel */ 00293 CCombinedKernel* combined=(CCombinedKernel*)m_kernel; 00294 00295 /* m is number of samples from each distribution, m_4 is quarter of it */ 00296 REQUIRE(m_m>=4, "%s::compute_statistic_and_Q: Need at least m>=4\n", 00297 get_name()); 00298 index_t m_4=m_m/4; 00299 00300 SG_DEBUG("m_m=%d\n", m_m) 00301 00302 /* find out whether single or multiple kernels (cast is safe, check above) */ 00303 index_t num_kernels=combined->get_num_subkernels(); 00304 REQUIRE(num_kernels>0, "%s::compute_statistic_and_Q: At least one kernel " 00305 "is needed\n", get_name()); 00306 00307 /* allocate memory for results if vectors are empty */ 00308 if (!statistic.vector) 00309 statistic=SGVector<float64_t>(num_kernels); 00310 00311 if (!Q.matrix) 00312 Q=SGMatrix<float64_t>(num_kernels, num_kernels); 00313 00314 /* ensure right dimensions */ 00315 REQUIRE(statistic.vlen==num_kernels, "%s::compute_statistic_and_variance: " 00316 "statistic vector size (%d) does not match number of kernels (%d)\n", 00317 get_name(), statistic.vlen, num_kernels); 00318 00319 REQUIRE(Q.num_rows==num_kernels, "%s::compute_statistic_and_variance: " 00320 "Q number of rows does (%d) not match number of kernels (%d)\n", 00321 get_name(), Q.num_rows, num_kernels); 00322 00323 REQUIRE(Q.num_cols==num_kernels, "%s::compute_statistic_and_variance: " 00324 "Q number of columns (%d) does not match number of kernels (%d)\n", 00325 get_name(), Q.num_cols, num_kernels); 00326 00327 /* initialise statistic and variance since they are cumulative */ 00328 statistic.zero(); 00329 Q.zero(); 00330 00331 /* produce two kernel lists to iterate doubly nested */ 00332 CList* list_i=new CList(); 00333 CList* list_j=new CList(); 00334 00335 for (index_t k_idx=0; k_idx<combined->get_num_kernels(); k_idx++) 00336 { 00337 CKernel* kernel = combined->get_kernel(k_idx); 00338 list_i->append_element(kernel); 00339 list_j->append_element(kernel); 00340 SG_UNREF(kernel); 00341 } 00342 00343 /* needed for online mean and variance */ 00344 SGVector<index_t> term_counters_statistic(num_kernels); 00345 SGMatrix<index_t> term_counters_Q(num_kernels, num_kernels); 00346 term_counters_statistic.set_const(1); 00347 term_counters_Q.set_const(1); 00348 00349 index_t num_examples_processed=0; 00350 while (num_examples_processed<m_4) 00351 { 00352 /* number of example to look at in this iteration */ 00353 index_t num_this_run=CMath::min(m_blocksize, 00354 CMath::max(0, m_4-num_examples_processed)); 00355 SG_DEBUG("processing %d more examples. %d so far processed. Blocksize " 00356 "is %d\n", num_this_run, num_examples_processed, m_blocksize); 00357 00358 /* stream data from both distributions */ 00359 CFeatures* p1a=m_streaming_p->get_streamed_features(num_this_run); 00360 CFeatures* p1b=m_streaming_p->get_streamed_features(num_this_run); 00361 CFeatures* p2a=m_streaming_p->get_streamed_features(num_this_run); 00362 CFeatures* p2b=m_streaming_p->get_streamed_features(num_this_run); 00363 CFeatures* q1a=m_streaming_q->get_streamed_features(num_this_run); 00364 CFeatures* q1b=m_streaming_q->get_streamed_features(num_this_run); 00365 CFeatures* q2a=m_streaming_q->get_streamed_features(num_this_run); 00366 CFeatures* q2b=m_streaming_q->get_streamed_features(num_this_run); 00367 00368 /* check whether h0 should be simulated and permute if so */ 00369 if (m_simulate_h0) 00370 { 00371 /* create merged copy of all feature instances to permute */ 00372 CList* list=new CList(); 00373 list->append_element(p1b); 00374 list->append_element(p2a); 00375 list->append_element(p2b); 00376 list->append_element(q1a); 00377 list->append_element(q1b); 00378 list->append_element(q2a); 00379 list->append_element(q2b); 00380 CFeatures* merged=p1a->create_merged_copy(list); 00381 SG_UNREF(list); 00382 00383 /* permute */ 00384 SGVector<index_t> inds(merged->get_num_vectors()); 00385 inds.range_fill(); 00386 inds.permute(); 00387 merged->add_subset(inds); 00388 00389 /* copy back, replacing old features */ 00390 SG_UNREF(p1a); 00391 SG_UNREF(p1b); 00392 SG_UNREF(p2a); 00393 SG_UNREF(p2b); 00394 SG_UNREF(q1a); 00395 SG_UNREF(q1b); 00396 SG_UNREF(q2a); 00397 SG_UNREF(q2b); 00398 00399 SGVector<index_t> copy(num_this_run); 00400 copy.range_fill(); 00401 p1a=merged->copy_subset(copy); 00402 copy.add(num_this_run); 00403 p1b=merged->copy_subset(copy); 00404 copy.add(num_this_run); 00405 p2a=merged->copy_subset(copy); 00406 copy.add(num_this_run); 00407 p2b=merged->copy_subset(copy); 00408 copy.add(num_this_run); 00409 q1a=merged->copy_subset(copy); 00410 copy.add(num_this_run); 00411 q1b=merged->copy_subset(copy); 00412 copy.add(num_this_run); 00413 q2a=merged->copy_subset(copy); 00414 copy.add(num_this_run); 00415 q2b=merged->copy_subset(copy); 00416 00417 /* clean up and note that copy_subset does a SG_REF */ 00418 SG_UNREF(merged); 00419 } 00420 else 00421 { 00422 /* reference the produced features (only if copy subset was not used) */ 00423 SG_REF(p1a); 00424 SG_REF(p1b); 00425 SG_REF(p2a); 00426 SG_REF(p2b); 00427 SG_REF(q1a); 00428 SG_REF(q1b); 00429 SG_REF(q2a); 00430 SG_REF(q2b); 00431 } 00432 00433 /* now for each of these streamed data instances, iterate through all 00434 * kernels and update Q matrix while also computing MMD statistic */ 00435 00436 /* preallocate some memory for faster processing */ 00437 SGVector<float64_t> pp(num_this_run); 00438 SGVector<float64_t> qq(num_this_run); 00439 SGVector<float64_t> pq(num_this_run); 00440 SGVector<float64_t> qp(num_this_run); 00441 SGVector<float64_t> h_i_a(num_this_run); 00442 SGVector<float64_t> h_i_b(num_this_run); 00443 SGVector<float64_t> h_j_a(num_this_run); 00444 SGVector<float64_t> h_j_b(num_this_run); 00445 00446 /* iterate through Q matrix and update values, compute mmd */ 00447 CKernel* kernel_i=(CKernel*)list_i->get_first_element(); 00448 for (index_t i=0; i<num_kernels; ++i) 00449 { 00450 /* compute all necessary 8 h-vectors for this burst. 00451 * h_delta-terms for each kernel, expression 7 of NIPS paper 00452 * first kernel */ 00453 00454 /* first kernel, a-part */ 00455 kernel_i->init(p1a, p2a); 00456 pp=kernel_i->get_kernel_diagonal(pp); 00457 kernel_i->init(q1a, q2a); 00458 qq=kernel_i->get_kernel_diagonal(qq); 00459 kernel_i->init(p1a, q2a); 00460 pq=kernel_i->get_kernel_diagonal(pq); 00461 kernel_i->init(q1a, p2a); 00462 qp=kernel_i->get_kernel_diagonal(qp); 00463 for (index_t it=0; it<num_this_run; ++it) 00464 h_i_a[it]=pp[it]+qq[it]-pq[it]-qp[it]; 00465 00466 /* first kernel, b-part */ 00467 kernel_i->init(p1b, p2b); 00468 pp=kernel_i->get_kernel_diagonal(pp); 00469 kernel_i->init(q1b, q2b); 00470 qq=kernel_i->get_kernel_diagonal(qq); 00471 kernel_i->init(p1b, q2b); 00472 pq=kernel_i->get_kernel_diagonal(pq); 00473 kernel_i->init(q1b, p2b); 00474 qp=kernel_i->get_kernel_diagonal(qp); 00475 for (index_t it=0; it<num_this_run; ++it) 00476 h_i_b[it]=pp[it]+qq[it]-pq[it]-qp[it]; 00477 00478 /* iterate through j, but use symmetry in order to save half of the 00479 * computations */ 00480 CKernel* kernel_j=(CKernel*)list_j->get_first_element(); 00481 for (index_t j=0; j<=i; ++j) 00482 { 00483 /* compute all necessary 8 h-vectors for this burst. 00484 * h_delta-terms for each kernel, expression 7 of NIPS paper 00485 * second kernel */ 00486 00487 /* second kernel, a-part */ 00488 kernel_j->init(p1a, p2a); 00489 pp=kernel_j->get_kernel_diagonal(pp); 00490 kernel_j->init(q1a, q2a); 00491 qq=kernel_j->get_kernel_diagonal(qq); 00492 kernel_j->init(p1a, q2a); 00493 pq=kernel_j->get_kernel_diagonal(pq); 00494 kernel_j->init(q1a, p2a); 00495 qp=kernel_j->get_kernel_diagonal(qp); 00496 for (index_t it=0; it<num_this_run; ++it) 00497 h_j_a[it]=pp[it]+qq[it]-pq[it]-qp[it]; 00498 00499 /* second kernel, b-part */ 00500 kernel_j->init(p1b, p2b); 00501 pp=kernel_j->get_kernel_diagonal(pp); 00502 kernel_j->init(q1b, q2b); 00503 qq=kernel_j->get_kernel_diagonal(qq); 00504 kernel_j->init(p1b, q2b); 00505 pq=kernel_j->get_kernel_diagonal(pq); 00506 kernel_j->init(q1b, p2b); 00507 qp=kernel_j->get_kernel_diagonal(qp); 00508 for (index_t it=0; it<num_this_run; ++it) 00509 h_j_b[it]=pp[it]+qq[it]-pq[it]-qp[it]; 00510 00511 float64_t term; 00512 for (index_t it=0; it<num_this_run; ++it) 00513 { 00514 /* current term of expression 7 of NIPS paper */ 00515 term=(h_i_a[it]-h_i_b[it])*(h_j_a[it]-h_j_b[it]); 00516 00517 /* update covariance element for the current burst. This is a 00518 * running average of the product of the h_delta terms of each 00519 * kernel */ 00520 Q(i, j)+=(term-Q(i, j))/term_counters_Q(i, j)++; 00521 } 00522 00523 /* use symmetry */ 00524 Q(j, i)=Q(i, j); 00525 00526 /* next kernel j */ 00527 kernel_j=(CKernel*)list_j->get_next_element(); 00528 } 00529 00530 /* update MMD statistic online computation for kernel i, using 00531 * vectors that were computed above */ 00532 SGVector<float64_t> h(num_this_run*2); 00533 for (index_t it=0; it<num_this_run; ++it) 00534 { 00535 /* update statistic for kernel i (outer loop) and update using 00536 * all elements of the h_i_a, h_i_b vectors (iterate over it) */ 00537 statistic[i]=statistic[i]+ 00538 (h_i_a[it]-statistic[i])/term_counters_statistic[i]++; 00539 00540 /* Make sure to use all data, i.e. part a and b */ 00541 statistic[i]=statistic[i]+ 00542 (h_i_b[it]-statistic[i])/(term_counters_statistic[i]++); 00543 } 00544 00545 /* next kernel i */ 00546 kernel_i=(CKernel*)list_i->get_next_element(); 00547 } 00548 00549 /* clean up streamed data */ 00550 SG_UNREF(p1a); 00551 SG_UNREF(p1b); 00552 SG_UNREF(p2a); 00553 SG_UNREF(p2b); 00554 SG_UNREF(q1a); 00555 SG_UNREF(q1b); 00556 SG_UNREF(q2a); 00557 SG_UNREF(q2b); 00558 00559 /* add number of processed examples for this run */ 00560 num_examples_processed+=num_this_run; 00561 } 00562 00563 /* clean up */ 00564 SG_UNREF(list_i); 00565 SG_UNREF(list_j); 00566 00567 SG_DEBUG("Done compouting statistic, processed 4*%d examples.\n", 00568 num_examples_processed); 00569 00570 SG_DEBUG("leaving %s::compute_statistic_and_Q()\n", get_name()) 00571 } 00572 00573 float64_t CLinearTimeMMD::compute_statistic() 00574 { 00575 /* use wrapper method and compute for single kernel */ 00576 SGVector<float64_t> statistic; 00577 SGVector<float64_t> variance; 00578 compute_statistic_and_variance(statistic, variance, false); 00579 00580 return statistic[0]; 00581 } 00582 00583 SGVector<float64_t> CLinearTimeMMD::compute_statistic( 00584 bool multiple_kernels) 00585 { 00586 /* make sure multiple_kernels flag is used only with a combined kernel */ 00587 REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED, 00588 "%s::compute_statistic: multiple kernels specified," 00589 "but underlying kernel is not of type K_COMBINED\n", get_name()); 00590 00591 SGVector<float64_t> statistic; 00592 SGVector<float64_t> variance; 00593 compute_statistic_and_variance(statistic, variance, multiple_kernels); 00594 00595 return statistic; 00596 } 00597 00598 float64_t CLinearTimeMMD::compute_variance_estimate() 00599 { 00600 /* use wrapper method and compute for single kernel */ 00601 SGVector<float64_t> statistic; 00602 SGVector<float64_t> variance; 00603 compute_statistic_and_variance(statistic, variance, false); 00604 00605 return variance[0]; 00606 } 00607 00608 float64_t CLinearTimeMMD::compute_p_value(float64_t statistic) 00609 { 00610 float64_t result=0; 00611 00612 switch (m_null_approximation_method) 00613 { 00614 case MMD1_GAUSSIAN: 00615 { 00616 /* compute variance and use to estimate Gaussian distribution */ 00617 float64_t std_dev=CMath::sqrt(compute_variance_estimate()); 00618 result=1.0-CStatistics::normal_cdf(statistic, std_dev); 00619 } 00620 break; 00621 00622 default: 00623 /* bootstrapping is handled here */ 00624 result=CKernelTwoSampleTestStatistic::compute_p_value(statistic); 00625 break; 00626 } 00627 00628 return result; 00629 } 00630 00631 float64_t CLinearTimeMMD::compute_threshold(float64_t alpha) 00632 { 00633 float64_t result=0; 00634 00635 switch (m_null_approximation_method) 00636 { 00637 case MMD1_GAUSSIAN: 00638 { 00639 /* compute variance and use to estimate Gaussian distribution */ 00640 float64_t std_dev=CMath::sqrt(compute_variance_estimate()); 00641 result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev); 00642 } 00643 break; 00644 00645 default: 00646 /* bootstrapping is handled here */ 00647 result=CKernelTwoSampleTestStatistic::compute_threshold(alpha); 00648 break; 00649 } 00650 00651 return result; 00652 } 00653 00654 float64_t CLinearTimeMMD::perform_test() 00655 { 00656 float64_t result=0; 00657 00658 switch (m_null_approximation_method) 00659 { 00660 case MMD1_GAUSSIAN: 00661 { 00662 /* compute variance and use to estimate Gaussian distribution, use 00663 * wrapper method and compute for single kernel */ 00664 SGVector<float64_t> statistic; 00665 SGVector<float64_t> variance; 00666 compute_statistic_and_variance(statistic, variance, false); 00667 00668 /* estimate Gaussian distribution */ 00669 result=1.0-CStatistics::normal_cdf(statistic[0], 00670 CMath::sqrt(variance[0])); 00671 } 00672 break; 00673 00674 default: 00675 /* bootstrapping can be done separately in superclass */ 00676 result=CTestStatistic::perform_test(); 00677 break; 00678 } 00679 00680 return result; 00681 } 00682 00683 SGVector<float64_t> CLinearTimeMMD::bootstrap_null() 00684 { 00685 SGVector<float64_t> samples(m_bootstrap_iterations); 00686 00687 /* instead of permutating samples, just samples new data all the time. */ 00688 CStreamingFeatures* p=m_streaming_p; 00689 CStreamingFeatures* q=m_streaming_q; 00690 SG_REF(p); 00691 SG_REF(q); 00692 00693 bool old=m_simulate_h0; 00694 set_simulate_h0(true); 00695 for (index_t i=0; i<m_bootstrap_iterations; ++i) 00696 { 00697 /* compute statistic for this permutation of mixed samples */ 00698 samples[i]=compute_statistic(); 00699 } 00700 set_simulate_h0(old); 00701 m_streaming_p=p; 00702 m_streaming_q=q; 00703 SG_UNREF(p); 00704 SG_UNREF(q); 00705 00706 return samples; 00707 } 00708 00709 void CLinearTimeMMD::set_p_and_q(CFeatures* p_and_q) 00710 { 00711 SG_ERROR("%s::set_p_and_q(): Method not implemented since linear time mmd" 00712 " is based on streaming features\n", get_name()); 00713 } 00714 00715 CFeatures* CLinearTimeMMD::get_p_and_q() 00716 { 00717 SG_ERROR("%s::get_p_and_q(): Method not implemented since linear time mmd" 00718 " is based on streaming features\n", get_name()); 00719 return NULL; 00720 } 00721 00722 CStreamingFeatures* CLinearTimeMMD::get_streaming_p() 00723 { 00724 SG_REF(m_streaming_p); 00725 return m_streaming_p; 00726 } 00727 00728 CStreamingFeatures* CLinearTimeMMD::get_streaming_q() 00729 { 00730 SG_REF(m_streaming_q); 00731 return m_streaming_q; 00732 } 00733