SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2012-2013 Heiko Strathmann 00008 */ 00009 00010 #include <shogun/statistics/TwoDistributionsTestStatistic.h> 00011 #include <shogun/features/Features.h> 00012 00013 using namespace shogun; 00014 00015 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic() : 00016 CTestStatistic() 00017 { 00018 init(); 00019 } 00020 00021 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic( 00022 CFeatures* p_and_q, 00023 index_t m) : CTestStatistic() 00024 { 00025 init(); 00026 00027 m_p_and_q=p_and_q; 00028 SG_REF(m_p_and_q); 00029 00030 m_m=m; 00031 } 00032 00033 CTwoDistributionsTestStatistic::CTwoDistributionsTestStatistic( 00034 CFeatures* p, CFeatures* q) : 00035 CTestStatistic() 00036 { 00037 init(); 00038 00039 m_p_and_q=p->create_merged_copy(q); 00040 SG_REF(m_p_and_q); 00041 00042 m_m=p->get_num_vectors(); 00043 } 00044 00045 CTwoDistributionsTestStatistic::~CTwoDistributionsTestStatistic() 00046 { 00047 SG_UNREF(m_p_and_q); 00048 } 00049 00050 void CTwoDistributionsTestStatistic::init() 00051 { 00052 SG_ADD((CSGObject**)&m_p_and_q, "p_and_q", "Concatenated samples p and q", 00053 MS_NOT_AVAILABLE); 00054 SG_ADD(&m_m, "m", "Index of first sample of q", 00055 MS_NOT_AVAILABLE); 00056 00057 m_p_and_q=NULL; 00058 m_m=0; 00059 } 00060 00061 SGVector<float64_t> CTwoDistributionsTestStatistic::bootstrap_null() 00062 { 00063 SG_DEBUG("entering CTwoDistributionsTestStatistic::bootstrap_null()\n") 00064 00065 REQUIRE(m_p_and_q, "CTwoDistributionsTestStatistic::bootstrap_null(): " 00066 "No appended features p and q!\n"); 00067 00068 /* compute bootstrap statistics for null distribution */ 00069 SGVector<float64_t> results(m_bootstrap_iterations); 00070 00071 /* memory for index permutations. Adding of subset has to happen 00072 * inside the loop since it may be copied if there already is one set */ 00073 SGVector<index_t> ind_permutation(2*m_m); 00074 ind_permutation.range_fill(); 00075 00076 for (index_t i=0; i<m_bootstrap_iterations; ++i) 00077 { 00078 /* idea: merge features of p and q, shuffle, and compute statistic. 00079 * This is done using subsets here */ 00080 00081 /* create index permutation and add as subset. This will mix samples 00082 * from p and q */ 00083 SGVector<int32_t>::permute_vector(ind_permutation); 00084 00085 /* compute statistic for this permutation of mixed samples */ 00086 m_p_and_q->add_subset(ind_permutation); 00087 results[i]=compute_statistic(); 00088 m_p_and_q->remove_subset(); 00089 } 00090 00091 SG_DEBUG("leaving CTwoDistributionsTestStatistic::bootstrap_null()\n") 00092 return results; 00093 } 00094 00095 float64_t CTwoDistributionsTestStatistic::compute_p_value( 00096 float64_t statistic) 00097 { 00098 float64_t result=0; 00099 00100 if (m_null_approximation_method==BOOTSTRAP) 00101 { 00102 /* bootstrap a bunch of MMD values from null distribution */ 00103 SGVector<float64_t> values=bootstrap_null(); 00104 00105 /* find out percentile of parameter "statistic" in null distribution */ 00106 values.qsort(); 00107 float64_t i=values.find_position_to_insert(statistic); 00108 00109 /* return corresponding p-value */ 00110 result=1.0-i/values.vlen; 00111 } 00112 else 00113 { 00114 SG_ERROR("CTwoDistributionsTestStatistics::compute_p_value(): Unknown" 00115 " method to approximate null distribution!\n"); 00116 } 00117 00118 return result; 00119 } 00120 00121 float64_t CTwoDistributionsTestStatistic::compute_threshold( 00122 float64_t alpha) 00123 { 00124 float64_t result=0; 00125 00126 if (m_null_approximation_method==BOOTSTRAP) 00127 { 00128 /* bootstrap a bunch of MMD values from null distribution */ 00129 SGVector<float64_t> values=bootstrap_null(); 00130 00131 /* return value of (1-alpha) quantile */ 00132 result=values[index_t(CMath::floor(values.vlen*(1-alpha)))]; 00133 } 00134 else 00135 { 00136 SG_ERROR("CTwoDistributionsTestStatistics::compute_threshold():" 00137 "Unknown method to approximate null distribution!\n"); 00138 } 00139 00140 return result; 00141 } 00142 00143 void CTwoDistributionsTestStatistic::set_p_and_q(CFeatures* p_and_q) 00144 { 00145 /* ref before unref to avoid problems when instances are equal */ 00146 SG_REF(p_and_q); 00147 SG_UNREF(m_p_and_q); 00148 m_p_and_q=p_and_q; 00149 } 00150 00151 CFeatures* CTwoDistributionsTestStatistic::get_p_and_q() 00152 { 00153 SG_REF(m_p_and_q); 00154 return m_p_and_q; 00155 } 00156