/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2008 Gunnar Raetsch
 * Written (W) 1999-2009 Soeren Sonnenburg
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 */

#include <shogun/preprocessor/PruneVarSubMean.h>
#include <shogun/preprocessor/DensePreprocessor.h>
#include <shogun/features/Features.h>
#include <shogun/io/SGIO.h>
#include <shogun/mathematics/Math.h>

using namespace shogun;

CPruneVarSubMean::CPruneVarSubMean(bool divide)
: CDensePreprocessor<float64_t>()
{
    init();
    register_parameters();
    m_divide_by_std = divide;
}

CPruneVarSubMean::~CPruneVarSubMean()
{
    cleanup();
}

/** initialize the preprocessor from a dense real-valued feature object:
 * computes the per-feature mean and variance and keeps only those features
 * whose variance exceeds the pruning threshold */
bool CPruneVarSubMean::init(CFeatures* features)
{
    if (!m_initialized)
    {
        ASSERT(features->get_feature_class()==C_DENSE)
        ASSERT(features->get_feature_type()==F_DREAL)

        CDenseFeatures<float64_t>* simple_features=(CDenseFeatures<float64_t>*) features;
        int32_t num_examples = simple_features->get_num_vectors();
        int32_t num_features = simple_features->get_num_features();

        m_mean = SGVector<float64_t>();
        m_idx = SGVector<int32_t>();
        m_std = SGVector<float64_t>();

        m_mean.resize_vector(num_features);
        float64_t* var=SG_MALLOC(float64_t, num_features);
        int32_t i,j;

        memset(var, 0, num_features*sizeof(float64_t));
        m_mean.zero();

        SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();

        // compute mean
        for (i=0; i<num_examples; i++)
        {
            for (j=0; j<num_features; j++)
                m_mean[j]+=feature_matrix.matrix[i*num_features+j];
        }

        for (j=0; j<num_features; j++)
            m_mean[j]/=num_examples;

        // compute variance
        for (i=0; i<num_examples; i++)
        {
            for (j=0; j<num_features; j++)
                var[j]+=CMath::sq(m_mean[j]-feature_matrix.matrix[i*num_features+j]);
        }

        // keep only features whose variance is above the pruning threshold
        int32_t num_ok=0;
        int32_t* idx_ok=SG_MALLOC(int32_t, num_features);

        for (j=0; j<num_features; j++)
        {
            var[j]/=num_examples;

            if (var[j]>=1e-14)
            {
                idx_ok[num_ok]=j;
                num_ok++;
            }
        }

        SG_INFO("Reducing number of features from %i to %i\n", num_features, num_ok)

        m_idx.resize_vector(num_ok);
        SGVector<float64_t> new_mean(num_ok);
        m_std.resize_vector(num_ok);

        for (j=0; j<num_ok; j++)
        {
            m_idx[j]=idx_ok[j];
            new_mean[j]=m_mean[idx_ok[j]];
            m_std[j]=CMath::sqrt(var[idx_ok[j]]);
        }
        m_num_idx = num_ok;
        SG_FREE(idx_ok);
        SG_FREE(var);
        m_mean = new_mean;

        m_initialized = true;
        return true;
    }
    else
        return false;
}

/** release the learned state and mark the preprocessor as uninitialized */
void CPruneVarSubMean::cleanup()
{
    m_idx=SGVector<int32_t>();
    m_mean=SGVector<float64_t>();
    m_std=SGVector<float64_t>();
    m_initialized = false;
}
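/** apply the learned transform to all vectors of a dense feature matrix
 * (in place): for each kept feature j the output is
 *     y[j] = x[m_idx[j]] - m_mean[j]
 * and, if m_divide_by_std is set,
 *     y[j] = (x[m_idx[j]] - m_mean[j]) / m_std[j].
 * Features whose training variance fell below 1e-14 were dropped in init(),
 * so the resulting vectors have m_num_idx dimensions. */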
SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features)
{
    ASSERT(m_initialized)

    int32_t num_vectors=0;
    int32_t num_features=0;
    float64_t* m=((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);

    SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features)
    SG_INFO("Preprocessing feature matrix\n")
    for (int32_t vec=0; vec<num_vectors; vec++)
    {
        float64_t* v_src=&m[num_features*vec];
        float64_t* v_dst=&m[m_num_idx*vec];

        if (m_divide_by_std)
        {
            for (int32_t feat=0; feat<m_num_idx; feat++)
                v_dst[feat]=(v_src[m_idx[feat]]-m_mean[feat])/m_std[feat];
        }
        else
        {
            for (int32_t feat=0; feat<m_num_idx; feat++)
                v_dst[feat]=(v_src[m_idx[feat]]-m_mean[feat]);
        }
    }

    ((CDenseFeatures<float64_t>*) features)->set_num_features(m_num_idx);
    ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
    SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features)

    return ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
}

/** apply the learned transform to a single feature vector; if the
 * preprocessor has not been initialized, an unmodified copy is returned */
SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector)
{
    float64_t* ret=NULL;

    if (m_initialized)
    {
        ret=SG_MALLOC(float64_t, m_num_idx);

        if (m_divide_by_std)
        {
            for (int32_t i=0; i<m_num_idx; i++)
                ret[i]=(vector.vector[m_idx[i]]-m_mean[i])/m_std[i];
        }
        else
        {
            for (int32_t i=0; i<m_num_idx; i++)
                ret[i]=(vector.vector[m_idx[i]]-m_mean[i]);
        }

        return SGVector<float64_t>(ret, m_num_idx);
    }
    else
    {
        // not initialized: return a copy with the original length
        ret=SG_MALLOC(float64_t, vector.vlen);
        for (int32_t i=0; i<vector.vlen; i++)
            ret[i]=vector.vector[i];

        return SGVector<float64_t>(ret, vector.vlen);
    }
}

void CPruneVarSubMean::init()
{
    m_initialized = false;
    m_divide_by_std = false;
    m_num_idx = 0;
    m_idx = SGVector<int32_t>();
    m_mean = SGVector<float64_t>();
    m_std = SGVector<float64_t>();
}

void CPruneVarSubMean::register_parameters()
{
    SG_ADD(&m_initialized, "initialized", "The preprocessor is initialized", MS_NOT_AVAILABLE);
    SG_ADD(&m_divide_by_std, "divide_by_std", "Divide by standard deviation", MS_AVAILABLE);
    SG_ADD(&m_num_idx, "num_idx", "Number of elements in idx_vec", MS_NOT_AVAILABLE);
    SG_ADD(&m_std, "std_vec", "Standard dev vector", MS_NOT_AVAILABLE);
    SG_ADD(&m_mean, "mean_vec", "Mean vector", MS_NOT_AVAILABLE);
    SG_ADD(&m_idx, "idx_vec", "Index vector", MS_NOT_AVAILABLE);
}
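/*
 * Minimal usage sketch, assuming the Shogun 3.x C++ API as used above.
 * Matrix dimensions and variable names are placeholders; the typical flow is
 * to learn the statistics with init() and then transform the features in place.
 *
 *     SGMatrix<float64_t> mat(num_features, num_vectors);   // one example per column
 *     // ... fill mat with training data ...
 *     CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(mat);
 *     CPruneVarSubMean* pre = new CPruneVarSubMean(true);   // true: also divide by std
 *     pre->init(feats);                     // learn mean/std, prune near-constant features
 *     pre->apply_to_feature_matrix(feats);  // subtract mean (and scale) in place
 *     SG_UNREF(pre);
 *     SG_UNREF(feats);
 */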