SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TwoDistributionsTestStatistic.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2012-2013 Heiko Strathmann
00008  */
00009 
00010 #include <shogun/statistics/TwoDistributionsTestStatistic.h>
00011 #include <shogun/features/Features.h>
00012 
00013 using namespace shogun;
00014 
00015 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic() :
00016         CTestStatistic()
00017 {
00018     init();
00019 }
00020 
00021 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic(
00022         CFeatures* p_and_q,
00023         index_t m) : CTestStatistic()
00024 {
00025     init();
00026 
00027     m_p_and_q=p_and_q;
00028     SG_REF(m_p_and_q);
00029 
00030     m_m=m;
00031 }
00032 
00033 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic(
00034         CFeatures* p, CFeatures* q) :
00035         CTestStatistic()
00036 {
00037     init();
00038 
00039     m_p_and_q=p->create_merged_copy(q);
00040     SG_REF(m_p_and_q);
00041 
00042     m_m=p->get_num_vectors();
00043 }
00044 
00045 CTwoDistributionsTestStatistic::~CTwoDistributionsTestStatistic()
00046 {
00047     SG_UNREF(m_p_and_q);
00048 }
00049 
00050 void CTwoDistributionsTestStatistic::init()
00051 {
00052     SG_ADD((CSGObject**)&m_p_and_q, "p_and_q", "Concatenated samples p and q",
00053             MS_NOT_AVAILABLE);
00054     SG_ADD(&m_m, "m", "Index of first sample of q",
00055             MS_NOT_AVAILABLE);
00056 
00057     m_p_and_q=NULL;
00058     m_m=0;
00059 }
00060 
00061 SGVector<float64_t> CTwoDistributionsTestStatistic::bootstrap_null()
00062 {
00063     SG_DEBUG("entering CTwoDistributionsTestStatistic::bootstrap_null()\n")
00064 
00065     REQUIRE(m_p_and_q, "CTwoDistributionsTestStatistic::bootstrap_null(): "
00066             "No appended features p and q!\n");
00067 
00068     /* compute bootstrap statistics for null distribution */
00069     SGVector<float64_t> results(m_bootstrap_iterations);
00070 
00071     /* memory for index permutations. Adding of subset has to happen
00072      * inside the loop since it may be copied if there already is one set */
00073     SGVector<index_t> ind_permutation(2*m_m);
00074     ind_permutation.range_fill();
00075 
00076     for (index_t i=0; i<m_bootstrap_iterations; ++i)
00077     {
00078         /* idea: merge features of p and q, shuffle, and compute statistic.
00079          * This is done using subsets here */
00080 
00081         /* create index permutation and add as subset. This will mix samples
00082          * from p and q */
00083         SGVector<int32_t>::permute_vector(ind_permutation);
00084 
00085         /* compute statistic for this permutation of mixed samples */
00086         m_p_and_q->add_subset(ind_permutation);
00087         results[i]=compute_statistic();
00088         m_p_and_q->remove_subset();
00089     }
00090 
00091     SG_DEBUG("leaving CTwoDistributionsTestStatistic::bootstrap_null()\n")
00092     return results;
00093 }
00094 
00095 float64_t CTwoDistributionsTestStatistic::compute_p_value(
00096         float64_t statistic)
00097 {
00098     float64_t result=0;
00099 
00100     if (m_null_approximation_method==BOOTSTRAP)
00101     {
00102         /* bootstrap a bunch of MMD values from null distribution */
00103         SGVector<float64_t> values=bootstrap_null();
00104 
00105         /* find out percentile of parameter "statistic" in null distribution */
00106         values.qsort();
00107         float64_t i=values.find_position_to_insert(statistic);
00108 
00109         /* return corresponding p-value */
00110         result=1.0-i/values.vlen;
00111     }
00112     else
00113     {
00114         SG_ERROR("CTwoDistributionsTestStatistics::compute_p_value(): Unknown"
00115                 " method to approximate null distribution!\n");
00116     }
00117 
00118     return result;
00119 }
00120 
00121 float64_t CTwoDistributionsTestStatistic::compute_threshold(
00122         float64_t alpha)
00123 {
00124     float64_t result=0;
00125 
00126     if (m_null_approximation_method==BOOTSTRAP)
00127     {
00128         /* bootstrap a bunch of MMD values from null distribution */
00129         SGVector<float64_t> values=bootstrap_null();
00130 
00131         /* return value of (1-alpha) quantile */
00132         result=values[index_t(CMath::floor(values.vlen*(1-alpha)))];
00133     }
00134     else
00135     {
00136         SG_ERROR("CTwoDistributionsTestStatistics::compute_threshold():"
00137                 "Unknown method to approximate null distribution!\n");
00138     }
00139 
00140     return result;
00141 }
00142 
00143 void CTwoDistributionsTestStatistic::set_p_and_q(CFeatures* p_and_q)
00144 {
00145     /* ref before unref to avoid problems when instances are equal */
00146     SG_REF(p_and_q);
00147     SG_UNREF(m_p_and_q);
00148     m_p_and_q=p_and_q;
00149 }
00150 
00151 CFeatures* CTwoDistributionsTestStatistic::get_p_and_q()
00152 {
00153     SG_REF(m_p_and_q);
00154     return m_p_and_q;
00155 }
00156 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation