SHOGUN v3.2.0
LinearTimeMMD.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2012-2013 Heiko Strathmann
 */

#include <shogun/statistics/LinearTimeMMD.h>
#include <shogun/features/Features.h>
#include <shogun/features/streaming/StreamingFeatures.h>
#include <shogun/mathematics/Statistics.h>
#include <shogun/features/CombinedFeatures.h>
#include <shogun/kernel/CombinedKernel.h>
#include <shogun/lib/List.h>

#include <shogun/lib/external/libqp.h>

using namespace shogun;

CLinearTimeMMD::CLinearTimeMMD() :
        CKernelTwoSampleTestStatistic()
{
    init();
}

CLinearTimeMMD::CLinearTimeMMD(CKernel* kernel, CStreamingFeatures* p,
        CStreamingFeatures* q, index_t m, index_t blocksize) :
        CKernelTwoSampleTestStatistic(kernel, NULL, m)
{
    init();

    m_streaming_p=p;
    SG_REF(m_streaming_p);

    m_streaming_q=q;
    SG_REF(m_streaming_q);

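    /* the blocksize determines how many examples are streamed and processed
     * per burst in the compute_* methods below; a larger value means fewer
     * calls to the streaming features at the cost of more memory per burst */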
    m_blocksize=blocksize;
}

CLinearTimeMMD::~CLinearTimeMMD()
{
    SG_UNREF(m_streaming_p);
    SG_UNREF(m_streaming_q);

    /* m_kernel is SG_UNREFed in base destructor */
}

void CLinearTimeMMD::init()
{
    SG_ADD((CSGObject**)&m_streaming_p, "streaming_p", "Streaming features p",
                MS_NOT_AVAILABLE);
    SG_ADD((CSGObject**)&m_streaming_q, "streaming_q", "Streaming features q",
                MS_NOT_AVAILABLE);
    SG_ADD(&m_blocksize, "blocksize", "Number of elements processed at once",
                MS_NOT_AVAILABLE);
    SG_ADD(&m_simulate_h0, "simulate_h0", "Whether p and q are mixed",
                MS_NOT_AVAILABLE);

    m_streaming_p=NULL;
    m_streaming_q=NULL;
    m_blocksize=10000;
    m_simulate_h0=false;
}

void CLinearTimeMMD::compute_statistic_and_variance(
        SGVector<float64_t>& statistic, SGVector<float64_t>& variance,
        bool multiple_kernels)
{
    SG_DEBUG("entering %s::compute_statistic_and_variance()\n", get_name())

    REQUIRE(m_streaming_p, "%s::compute_statistic_and_variance: streaming "
            "features p required!\n", get_name());
    REQUIRE(m_streaming_q, "%s::compute_statistic_and_variance: streaming "
            "features q required!\n", get_name());

    REQUIRE(m_kernel, "%s::compute_statistic_and_variance: kernel needed!\n",
            get_name());

    /* make sure multiple_kernels flag is used only with a combined kernel */
    REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED,
            "%s::compute_statistic_and_variance: multiple kernels specified, "
            "but underlying kernel is not of type K_COMBINED\n", get_name());

    /* m is the number of samples from each distribution, m_2 is half of it,
     * using names from the JMLR paper (see class documentation) */
    index_t m_2=m_m/2;
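    /* the linear time estimate averages independent h-terms over m_2 pairs:
     *   h_i = k(x_{2i-1},x_{2i}) + k(y_{2i-1},y_{2i})
     *       - k(x_{2i-1},y_{2i}) - k(x_{2i},y_{2i-1})
     *   MMD_l^2 = 1/m_2 * sum_i h_i
     * which is exactly what the streaming loop below accumulates */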

    SG_DEBUG("m_m=%d\n", m_m)

    /* find out whether single or multiple kernels (cast is safe, check above) */
    index_t num_kernels=1;
    if (multiple_kernels)
    {
        num_kernels=((CCombinedKernel*)m_kernel)->get_num_subkernels();
        SG_DEBUG("computing MMD and variance for %d sub-kernels\n",
                num_kernels);
    }

    /* allocate memory for results if vectors are empty */
    if (!statistic.vector)
        statistic=SGVector<float64_t>(num_kernels);

    if (!variance.vector)
        variance=SGVector<float64_t>(num_kernels);

    /* ensure right dimensions */
    REQUIRE(statistic.vlen==num_kernels, "%s::compute_statistic_and_variance: "
            "statistic vector size (%d) does not match number of kernels (%d)\n",
             get_name(), statistic.vlen, num_kernels);

    REQUIRE(variance.vlen==num_kernels, "%s::compute_statistic_and_variance: "
            "variance vector size (%d) does not match number of kernels (%d)\n",
             get_name(), variance.vlen, num_kernels);

    /* temporary variables used in the online updates */
    float64_t current;
    float64_t delta;

    /* initialise statistic and variance since they are cumulative */
    statistic.zero();
    variance.zero();

    /* needed for online mean and variance */
    SGVector<index_t> term_counters(num_kernels);
    term_counters.set_const(1);

    /* number of examples processed so far */
    index_t num_examples_processed=0;
    while (num_examples_processed<m_2)
    {
        /* number of examples to look at in this iteration */
        index_t num_this_run=CMath::min(m_blocksize,
                CMath::max(0, m_2-num_examples_processed));
        SG_DEBUG("processing %d more examples. %d so far processed. Blocksize "
                "is %d\n", num_this_run, num_examples_processed, m_blocksize);

        /* stream data from both distributions */
        CFeatures* p1=m_streaming_p->get_streamed_features(num_this_run);
        CFeatures* p2=m_streaming_p->get_streamed_features(num_this_run);
        CFeatures* q1=m_streaming_q->get_streamed_features(num_this_run);
        CFeatures* q2=m_streaming_q->get_streamed_features(num_this_run);

        /* check whether h0 should be simulated and permute if so */
        if (m_simulate_h0)
        {
            /* create merged copy of all feature instances to permute */
            CList* list=new CList();
            list->append_element(p2);
            list->append_element(q1);
            list->append_element(q2);
            CFeatures* merged=p1->create_merged_copy(list);
            SG_UNREF(list);

            /* permute */
            SGVector<index_t> inds(merged->get_num_vectors());
            inds.range_fill();
            inds.permute();
            merged->add_subset(inds);

            /* copy back, replacing old features */
            SG_UNREF(p1);
            SG_UNREF(p2);
            SG_UNREF(q1);
            SG_UNREF(q2);

            SGVector<index_t> copy(num_this_run);
            copy.range_fill();
            p1=merged->copy_subset(copy);
            copy.add(num_this_run);
            p2=merged->copy_subset(copy);
            copy.add(num_this_run);
            q1=merged->copy_subset(copy);
            copy.add(num_this_run);
            q2=merged->copy_subset(copy);

            /* clean up and note that copy_subset does a SG_REF */
            SG_UNREF(merged);
        }
        else
        {
            /* reference produced features (only if copy_subset was not used) */
            SG_REF(p1);
            SG_REF(p2);
            SG_REF(q1);
            SG_REF(q2);
        }

        /* if multiple kernels are used, compute all of them on the streamed
         * data; if the multiple kernels flag is false, the loop below is
         * executed only once */
        CKernel* kernel=m_kernel;
        if (multiple_kernels)
        {
            SG_DEBUG("using multiple kernels\n");
        }

        /* iterate through all kernels for this data */
        for (index_t i=0; i<num_kernels; ++i)
        {
            /* if multiple kernels should be computed, set next kernel */
            if (multiple_kernels)
            {
                kernel=((CCombinedKernel*)m_kernel)->get_kernel(i);
            }

            /* compute kernel matrix diagonals */
            kernel->init(p1, p2);
            SGVector<float64_t> pp=kernel->get_kernel_diagonal();

            kernel->init(q1, q2);
            SGVector<float64_t> qq=kernel->get_kernel_diagonal();

            kernel->init(p1, q2);
            SGVector<float64_t> pq=kernel->get_kernel_diagonal();

            kernel->init(q1, p2);
            SGVector<float64_t> qp=kernel->get_kernel_diagonal();

            /* update mean and variance of the h-terms for each kernel using
             * Knuth's online variance algorithm,
             * c.f. for example Wikipedia */
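            /* with n = term_counters[i], the updates below are
             *   mean_n = mean_{n-1} + (current - mean_{n-1}) / n
             *   M2_n   = M2_{n-1} + (current - mean_{n-1}) * (current - mean_n)
             * so statistic[i] holds the running mean of the h-terms and
             * variance[i] holds the running sum of squared deviations M2 */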
            for (index_t j=0; j<num_this_run; ++j)
            {
                /* compute sum of current h terms for current kernel */
                current=pp[j]+qq[j]-pq[j]-qp[j];

                /* D. Knuth's online variance algorithm for current kernel */
                delta=current-statistic[i];
                statistic[i]+=delta/term_counters[i]++;
                variance[i]+=delta*(current-statistic[i]);

                SG_DEBUG("burst: current=%f, delta=%f, statistic=%f, "
                        "variance=%f, kernel_idx=%d\n", current, delta,
                        statistic[i], variance[i], i);
            }

            if (multiple_kernels)
            {
                SG_UNREF(kernel);
            }
        }

        /* clean up streamed data */
        SG_UNREF(p1);
        SG_UNREF(p2);
        SG_UNREF(q1);
        SG_UNREF(q2);

        /* add number of processed examples for this run */
        num_examples_processed+=num_this_run;
    }
    SG_DEBUG("Done computing statistic, processed 2*%d examples.\n",
            num_examples_processed);

    /* the running mean of the h-terms is the linear time MMD, one entry per kernel */
    if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
        statistic.display_vector("statistics");

    /* the variance of the h-terms is computed from the running sums; note that
     * it needs to be divided by m_2 in order to get the variance of the
     * null distribution */
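    /* i.e. estimate var(MMD_l) = s_h^2 / m_2, where s_h^2 = M2 / (m_2 - 1) is
     * the unbiased sample variance of the h-terms accumulated above */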
    for (index_t i=0; i<num_kernels; ++i)
        variance[i]=variance[i]/(m_2-1)/m_2;

    if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
        variance.display_vector("variances");

    SG_DEBUG("leaving %s::compute_statistic_and_variance()\n", get_name())
}

void CLinearTimeMMD::compute_statistic_and_Q(
        SGVector<float64_t>& statistic, SGMatrix<float64_t>& Q)
{
    SG_DEBUG("entering %s::compute_statistic_and_Q()\n", get_name())

    REQUIRE(m_streaming_p, "%s::compute_statistic_and_Q: streaming "
            "features p required!\n", get_name());
    REQUIRE(m_streaming_q, "%s::compute_statistic_and_Q: streaming "
            "features q required!\n", get_name());

    REQUIRE(m_kernel, "%s::compute_statistic_and_Q: kernel needed!\n",
            get_name());

    /* this method requires a combined kernel */
    REQUIRE(m_kernel->get_kernel_type()==K_COMBINED,
            "%s::compute_statistic_and_Q: underlying kernel is not of "
            "type K_COMBINED\n", get_name());

    /* cast combined kernel */
    CCombinedKernel* combined=(CCombinedKernel*)m_kernel;

    /* m is the number of samples from each distribution, m_4 is a quarter of it */
    REQUIRE(m_m>=4, "%s::compute_statistic_and_Q: Need at least m>=4\n",
            get_name());
    index_t m_4=m_m/4;
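    /* Q estimates the covariance between the h-terms of every pair of
     * sub-kernels, cf. expression 7 in Gretton et al., "Optimal kernel choice
     * for large-scale two-sample tests" (NIPS 2012). It is used downstream,
     * e.g. for selecting combined kernel weights via a quadratic program. */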

    SG_DEBUG("m_m=%d\n", m_m)

    /* get the number of sub-kernels (cast is safe, checked above) */
    index_t num_kernels=combined->get_num_subkernels();
    REQUIRE(num_kernels>0, "%s::compute_statistic_and_Q: At least one kernel "
            "is needed\n", get_name());

    /* allocate memory for results if vectors are empty */
    if (!statistic.vector)
        statistic=SGVector<float64_t>(num_kernels);

    if (!Q.matrix)
        Q=SGMatrix<float64_t>(num_kernels, num_kernels);

    /* ensure right dimensions */
    REQUIRE(statistic.vlen==num_kernels, "%s::compute_statistic_and_Q: "
            "statistic vector size (%d) does not match number of kernels (%d)\n",
             get_name(), statistic.vlen, num_kernels);

    REQUIRE(Q.num_rows==num_kernels, "%s::compute_statistic_and_Q: "
            "Q number of rows (%d) does not match number of kernels (%d)\n",
             get_name(), Q.num_rows, num_kernels);

    REQUIRE(Q.num_cols==num_kernels, "%s::compute_statistic_and_Q: "
            "Q number of columns (%d) does not match number of kernels (%d)\n",
             get_name(), Q.num_cols, num_kernels);

    /* initialise statistic and Q since they are cumulative */
    statistic.zero();
    Q.zero();

    /* build two kernel lists for the doubly nested iteration */
    CList* list_i=new CList();
    CList* list_j=new CList();

    for (index_t k_idx=0; k_idx<combined->get_num_kernels(); k_idx++)
    {
        CKernel* kernel = combined->get_kernel(k_idx);
        list_i->append_element(kernel);
        list_j->append_element(kernel);
        SG_UNREF(kernel);
    }

    /* needed for online mean and variance */
    SGVector<index_t> term_counters_statistic(num_kernels);
    SGMatrix<index_t> term_counters_Q(num_kernels, num_kernels);
    term_counters_statistic.set_const(1);
    term_counters_Q.set_const(1);

    index_t num_examples_processed=0;
    while (num_examples_processed<m_4)
    {
        /* number of examples to look at in this iteration */
        index_t num_this_run=CMath::min(m_blocksize,
                CMath::max(0, m_4-num_examples_processed));
        SG_DEBUG("processing %d more examples. %d so far processed. Blocksize "
                "is %d\n", num_this_run, num_examples_processed, m_blocksize);

        /* stream data from both distributions */
        CFeatures* p1a=m_streaming_p->get_streamed_features(num_this_run);
        CFeatures* p1b=m_streaming_p->get_streamed_features(num_this_run);
        CFeatures* p2a=m_streaming_p->get_streamed_features(num_this_run);
        CFeatures* p2b=m_streaming_p->get_streamed_features(num_this_run);
        CFeatures* q1a=m_streaming_q->get_streamed_features(num_this_run);
        CFeatures* q1b=m_streaming_q->get_streamed_features(num_this_run);
        CFeatures* q2a=m_streaming_q->get_streamed_features(num_this_run);
        CFeatures* q2b=m_streaming_q->get_streamed_features(num_this_run);

        /* check whether h0 should be simulated and permute if so */
        if (m_simulate_h0)
        {
            /* create merged copy of all feature instances to permute */
            CList* list=new CList();
            list->append_element(p1b);
            list->append_element(p2a);
            list->append_element(p2b);
            list->append_element(q1a);
            list->append_element(q1b);
            list->append_element(q2a);
            list->append_element(q2b);
            CFeatures* merged=p1a->create_merged_copy(list);
            SG_UNREF(list);

            /* permute */
            SGVector<index_t> inds(merged->get_num_vectors());
            inds.range_fill();
            inds.permute();
            merged->add_subset(inds);

            /* copy back, replacing old features */
            SG_UNREF(p1a);
            SG_UNREF(p1b);
            SG_UNREF(p2a);
            SG_UNREF(p2b);
            SG_UNREF(q1a);
            SG_UNREF(q1b);
            SG_UNREF(q2a);
            SG_UNREF(q2b);

            SGVector<index_t> copy(num_this_run);
            copy.range_fill();
            p1a=merged->copy_subset(copy);
            copy.add(num_this_run);
            p1b=merged->copy_subset(copy);
            copy.add(num_this_run);
            p2a=merged->copy_subset(copy);
            copy.add(num_this_run);
            p2b=merged->copy_subset(copy);
            copy.add(num_this_run);
            q1a=merged->copy_subset(copy);
            copy.add(num_this_run);
            q1b=merged->copy_subset(copy);
            copy.add(num_this_run);
            q2a=merged->copy_subset(copy);
            copy.add(num_this_run);
            q2b=merged->copy_subset(copy);

            /* clean up and note that copy_subset does a SG_REF */
            SG_UNREF(merged);
        }
        else
        {
            /* reference the produced features (only if copy_subset was not used) */
            SG_REF(p1a);
            SG_REF(p1b);
            SG_REF(p2a);
            SG_REF(p2b);
            SG_REF(q1a);
            SG_REF(q1b);
            SG_REF(q2a);
            SG_REF(q2b);
        }

        /* now for each of these streamed data instances, iterate through all
         * kernels and update the Q matrix while also computing the MMD statistic */

        /* preallocate some memory for faster processing */
        SGVector<float64_t> pp(num_this_run);
        SGVector<float64_t> qq(num_this_run);
        SGVector<float64_t> pq(num_this_run);
        SGVector<float64_t> qp(num_this_run);
        SGVector<float64_t> h_i_a(num_this_run);
        SGVector<float64_t> h_i_b(num_this_run);
        SGVector<float64_t> h_j_a(num_this_run);
        SGVector<float64_t> h_j_b(num_this_run);

        /* iterate through the Q matrix, update its values and compute the MMD */
        CKernel* kernel_i=(CKernel*)list_i->get_first_element();
        for (index_t i=0; i<num_kernels; ++i)
        {
            /* compute the necessary h-vectors for this burst: the h_delta
             * terms of expression 7 of the NIPS paper, for the first kernel */

            /* first kernel, a-part */
            kernel_i->init(p1a, p2a);
            pp=kernel_i->get_kernel_diagonal(pp);
            kernel_i->init(q1a, q2a);
            qq=kernel_i->get_kernel_diagonal(qq);
            kernel_i->init(p1a, q2a);
            pq=kernel_i->get_kernel_diagonal(pq);
            kernel_i->init(q1a, p2a);
            qp=kernel_i->get_kernel_diagonal(qp);
            for (index_t it=0; it<num_this_run; ++it)
                h_i_a[it]=pp[it]+qq[it]-pq[it]-qp[it];

            /* first kernel, b-part */
            kernel_i->init(p1b, p2b);
            pp=kernel_i->get_kernel_diagonal(pp);
            kernel_i->init(q1b, q2b);
            qq=kernel_i->get_kernel_diagonal(qq);
            kernel_i->init(p1b, q2b);
            pq=kernel_i->get_kernel_diagonal(pq);
            kernel_i->init(q1b, p2b);
            qp=kernel_i->get_kernel_diagonal(qp);
            for (index_t it=0; it<num_this_run; ++it)
                h_i_b[it]=pp[it]+qq[it]-pq[it]-qp[it];

            /* iterate through j, but use symmetry in order to save half of the
             * computations */
            CKernel* kernel_j=(CKernel*)list_j->get_first_element();
            for (index_t j=0; j<=i; ++j)
            {
                /* compute the necessary h-vectors for this burst: the h_delta
                 * terms of expression 7 of the NIPS paper, for the second kernel */

                /* second kernel, a-part */
                kernel_j->init(p1a, p2a);
                pp=kernel_j->get_kernel_diagonal(pp);
                kernel_j->init(q1a, q2a);
                qq=kernel_j->get_kernel_diagonal(qq);
                kernel_j->init(p1a, q2a);
                pq=kernel_j->get_kernel_diagonal(pq);
                kernel_j->init(q1a, p2a);
                qp=kernel_j->get_kernel_diagonal(qp);
                for (index_t it=0; it<num_this_run; ++it)
                    h_j_a[it]=pp[it]+qq[it]-pq[it]-qp[it];

                /* second kernel, b-part */
                kernel_j->init(p1b, p2b);
                pp=kernel_j->get_kernel_diagonal(pp);
                kernel_j->init(q1b, q2b);
                qq=kernel_j->get_kernel_diagonal(qq);
                kernel_j->init(p1b, q2b);
                pq=kernel_j->get_kernel_diagonal(pq);
                kernel_j->init(q1b, p2b);
                qp=kernel_j->get_kernel_diagonal(qp);
                for (index_t it=0; it<num_this_run; ++it)
                    h_j_b[it]=pp[it]+qq[it]-pq[it]-qp[it];

                float64_t term;
                for (index_t it=0; it<num_this_run; ++it)
                {
                    /* current term of expression 7 of the NIPS paper */
                    term=(h_i_a[it]-h_i_b[it])*(h_j_a[it]-h_j_b[it]);

                    /* update covariance element for the current burst. This is a
                     * running average of the product of the h_delta terms of each
                     * kernel */
                    Q(i, j)+=(term-Q(i, j))/term_counters_Q(i, j)++;
                }

                /* use symmetry */
                Q(j, i)=Q(i, j);

                /* next kernel j */
                kernel_j=(CKernel*)list_j->get_next_element();
            }

            /* update the online MMD statistic computation for kernel i, using
             * the h-vectors that were computed above */
            for (index_t it=0; it<num_this_run; ++it)
            {
                /* update statistic for kernel i (outer loop) and update using
                 * all elements of the h_i_a, h_i_b vectors (iterate over it) */
                statistic[i]=statistic[i]+
                        (h_i_a[it]-statistic[i])/term_counters_statistic[i]++;

                /* make sure to use all data, i.e. parts a and b */
                statistic[i]=statistic[i]+
                        (h_i_b[it]-statistic[i])/(term_counters_statistic[i]++);
            }

            /* next kernel i */
            kernel_i=(CKernel*)list_i->get_next_element();
        }

        /* clean up streamed data */
        SG_UNREF(p1a);
        SG_UNREF(p1b);
        SG_UNREF(p2a);
        SG_UNREF(p2b);
        SG_UNREF(q1a);
        SG_UNREF(q1b);
        SG_UNREF(q2a);
        SG_UNREF(q2b);

        /* add number of processed examples for this run */
        num_examples_processed+=num_this_run;
    }

    /* clean up */
    SG_UNREF(list_i);
    SG_UNREF(list_j);

    SG_DEBUG("Done computing statistic, processed 4*%d examples.\n",
            num_examples_processed);

    SG_DEBUG("leaving %s::compute_statistic_and_Q()\n", get_name())
}

float64_t CLinearTimeMMD::compute_statistic()
{
    /* use wrapper method and compute for single kernel */
    SGVector<float64_t> statistic;
    SGVector<float64_t> variance;
    compute_statistic_and_variance(statistic, variance, false);

    return statistic[0];
}

SGVector<float64_t> CLinearTimeMMD::compute_statistic(
        bool multiple_kernels)
{
    /* make sure multiple_kernels flag is used only with a combined kernel */
    REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED,
            "%s::compute_statistic: multiple kernels specified, "
            "but underlying kernel is not of type K_COMBINED\n", get_name());

    SGVector<float64_t> statistic;
    SGVector<float64_t> variance;
    compute_statistic_and_variance(statistic, variance, multiple_kernels);

    return statistic;
}

float64_t CLinearTimeMMD::compute_variance_estimate()
{
    /* use wrapper method and compute for single kernel */
    SGVector<float64_t> statistic;
    SGVector<float64_t> variance;
    compute_statistic_and_variance(statistic, variance, false);

    return variance[0];
}

float64_t CLinearTimeMMD::compute_p_value(float64_t statistic)
{
    float64_t result=0;

    switch (m_null_approximation_method)
    {
    case MMD1_GAUSSIAN:
        {
            /* compute variance and use it to estimate the Gaussian distribution */
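            /* under H0 the linear time MMD is approximately zero-mean Gaussian,
             * so the p-value is the right tail 1-Phi(statistic; 0, std_dev) */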
            float64_t std_dev=CMath::sqrt(compute_variance_estimate());
            result=1.0-CStatistics::normal_cdf(statistic, std_dev);
        }
        break;

    default:
        /* bootstrapping is handled in the superclass */
        result=CKernelTwoSampleTestStatistic::compute_p_value(statistic);
        break;
    }

    return result;
}

float64_t CLinearTimeMMD::compute_threshold(float64_t alpha)
{
    float64_t result=0;

    switch (m_null_approximation_method)
    {
    case MMD1_GAUSSIAN:
        {
            /* compute variance and use it to estimate the Gaussian distribution */
            float64_t std_dev=CMath::sqrt(compute_variance_estimate());
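            /* the threshold is the (1-alpha) quantile of the N(0, std_dev^2)
             * null approximation */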
            result=CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
        }
        break;

    default:
        /* bootstrapping is handled in the superclass */
        result=CKernelTwoSampleTestStatistic::compute_threshold(alpha);
        break;
    }

    return result;
}

float64_t CLinearTimeMMD::perform_test()
{
    float64_t result=0;

    switch (m_null_approximation_method)
    {
    case MMD1_GAUSSIAN:
        {
            /* compute variance and use it to estimate the Gaussian distribution;
             * use the wrapper method and compute for a single kernel */
            SGVector<float64_t> statistic;
            SGVector<float64_t> variance;
            compute_statistic_and_variance(statistic, variance, false);

            /* estimate Gaussian distribution */
            result=1.0-CStatistics::normal_cdf(statistic[0],
                    CMath::sqrt(variance[0]));
        }
        break;

    default:
        /* bootstrapping can be done separately in the superclass */
        result=CTestStatistic::perform_test();
        break;
    }

    return result;
}

SGVector<float64_t> CLinearTimeMMD::bootstrap_null()
{
    SGVector<float64_t> samples(m_bootstrap_iterations);

    /* instead of permuting a fixed sample, simply sample new data each time */
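    /* each iteration streams fresh blocks from p and q; with m_simulate_h0
     * enabled, compute_statistic() merges and permutes those blocks before
     * evaluating, so the returned values are samples from the null
     * distribution without the data ever being stored */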
    CStreamingFeatures* p=m_streaming_p;
    CStreamingFeatures* q=m_streaming_q;
    SG_REF(p);
    SG_REF(q);

    bool old=m_simulate_h0;
    set_simulate_h0(true);
    for (index_t i=0; i<m_bootstrap_iterations; ++i)
    {
        /* compute statistic for this permutation of mixed samples */
        samples[i]=compute_statistic();
    }
    set_simulate_h0(old);
    m_streaming_p=p;
    m_streaming_q=q;
    SG_UNREF(p);
    SG_UNREF(q);

    return samples;
}

void CLinearTimeMMD::set_p_and_q(CFeatures* p_and_q)
{
    SG_ERROR("%s::set_p_and_q(): Method not implemented since the linear time MMD"
            " is based on streaming features\n", get_name());
}

CFeatures* CLinearTimeMMD::get_p_and_q()
{
    SG_ERROR("%s::get_p_and_q(): Method not implemented since the linear time MMD"
            " is based on streaming features\n", get_name());
    return NULL;
}

CStreamingFeatures* CLinearTimeMMD::get_streaming_p()
{
    SG_REF(m_streaming_p);
    return m_streaming_p;
}

CStreamingFeatures* CLinearTimeMMD::get_streaming_q()
{
    SG_REF(m_streaming_q);
    return m_streaming_q;
}
