SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
PruneVarSubMean.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2009 Soeren Sonnenburg
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include <shogun/preprocessor/PruneVarSubMean.h>
00013 #include <shogun/preprocessor/DensePreprocessor.h>
00014 #include <shogun/features/Features.h>
00015 #include <shogun/io/SGIO.h>
00016 #include <shogun/mathematics/Math.h>
00017 
00018 using namespace shogun;
00019 
00020 CPruneVarSubMean::CPruneVarSubMean(bool divide)
00021 : CDensePreprocessor<float64_t>()
00022 {
00023     init();
00024     register_parameters();
00025     m_divide_by_std = divide;
00026 }
00027 
00028 CPruneVarSubMean::~CPruneVarSubMean()
00029 {
00030     cleanup();
00031 }
00032 
00034 bool CPruneVarSubMean::init(CFeatures* features)
00035 {
00036     if (!m_initialized)
00037     {
00038         ASSERT(features->get_feature_class()==C_DENSE)
00039         ASSERT(features->get_feature_type()==F_DREAL)
00040 
00041         CDenseFeatures<float64_t>* simple_features=(CDenseFeatures<float64_t>*) features;
00042         int32_t num_examples = simple_features->get_num_vectors();
00043         int32_t num_features = simple_features->get_num_features();
00044 
00045         m_mean = SGVector<float64_t>();
00046         m_idx = SGVector<int32_t>();
00047         m_std = SGVector<float64_t>();;
00048 
00049         m_mean.resize_vector(num_features);
00050         float64_t* var=SG_MALLOC(float64_t, num_features);
00051         int32_t i,j;
00052 
00053         memset(var, 0, num_features*sizeof(float64_t));
00054         m_mean.zero();
00055 
00056         SGMatrix<float64_t> feature_matrix = simple_features->get_feature_matrix();
00057 
00058         // compute mean
00059         for (i=0; i<num_examples; i++)
00060         {
00061             for (j=0; j<num_features; j++)
00062                 m_mean[j]+=feature_matrix.matrix[i*num_features+j];
00063         }
00064 
00065         for (j=0; j<num_features; j++)
00066             m_mean[j]/=num_examples;
00067 
00068         // compute var
00069         for (i=0; i<num_examples; i++)
00070         {
00071             for (j=0; j<num_features; j++)
00072                 var[j]+=CMath::sq(m_mean[j]-feature_matrix.matrix[i*num_features+j]);
00073         }
00074 
00075         int32_t num_ok=0;
00076         int32_t* idx_ok=SG_MALLOC(int32_t, num_features);
00077 
00078         for (j=0; j<num_features; j++)
00079         {
00080             var[j]/=num_examples;
00081 
00082             if (var[j]>=1e-14)
00083             {
00084                 idx_ok[num_ok]=j;
00085                 num_ok++ ;
00086             }
00087         }
00088 
00089         SG_INFO("Reducing number of features from %i to %i\n", num_features, num_ok)
00090 
00091         m_idx.resize_vector(num_ok);
00092         SGVector<float64_t> new_mean(num_ok);
00093         m_std.resize_vector(num_ok);
00094 
00095         for (j=0; j<num_ok; j++)
00096         {
00097             m_idx[j]=idx_ok[j] ;
00098             new_mean[j]=m_mean[idx_ok[j]];
00099             m_std[j]=CMath::sqrt(var[idx_ok[j]]);
00100         }
00101         m_num_idx = num_ok;
00102         SG_FREE(idx_ok);
00103         SG_FREE(var);
00104         m_mean = new_mean;
00105 
00106         m_initialized = true;
00107         return true;
00108     }
00109     else
00110         return false;
00111 }
00112 
00114 void CPruneVarSubMean::cleanup()
00115 {
00116     m_idx=SGVector<int32_t>();
00117     m_mean=SGVector<float64_t>();
00118     m_std=SGVector<float64_t>();
00119     m_initialized = false;
00120 }
00121 
00125 SGMatrix<float64_t> CPruneVarSubMean::apply_to_feature_matrix(CFeatures* features)
00126 {
00127     ASSERT(m_initialized)
00128 
00129     int32_t num_vectors=0;
00130     int32_t num_features=0;
00131     float64_t* m=((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00132 
00133     SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features)
00134     SG_INFO("Preprocessing feature matrix\n")
00135     for (int32_t vec=0; vec<num_vectors; vec++)
00136     {
00137         float64_t* v_src=&m[num_features*vec];
00138         float64_t* v_dst=&m[m_num_idx*vec];
00139 
00140         if (m_divide_by_std)
00141         {
00142             for (int32_t feat=0; feat<m_num_idx; feat++)
00143                 v_dst[feat]=(v_src[m_idx[feat]]-m_mean[feat])/m_std[feat];
00144         }
00145         else
00146         {
00147             for (int32_t feat=0; feat<m_num_idx; feat++)
00148                 v_dst[feat]=(v_src[m_idx[feat]]-m_mean[feat]);
00149         }
00150     }
00151 
00152     ((CDenseFeatures<float64_t>*) features)->set_num_features(m_num_idx);
00153     ((CDenseFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
00154     SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features)
00155 
00156     return ((CDenseFeatures<float64_t>*) features)->get_feature_matrix();
00157 }
00158 
00161 SGVector<float64_t> CPruneVarSubMean::apply_to_feature_vector(SGVector<float64_t> vector)
00162 {
00163     float64_t* ret=NULL;
00164 
00165     if (m_initialized)
00166     {
00167         ret=SG_MALLOC(float64_t, m_num_idx);
00168 
00169         if (m_divide_by_std)
00170         {
00171             for (int32_t i=0; i<m_num_idx; i++)
00172                 ret[i]=(vector.vector[m_idx[i]]-m_mean[i])/m_std[i];
00173         }
00174         else
00175         {
00176             for (int32_t i=0; i<m_num_idx; i++)
00177                 ret[i]=(vector.vector[m_idx[i]]-m_mean[i]);
00178         }
00179     }
00180     else
00181     {
00182         ret=SG_MALLOC(float64_t, vector.vlen);
00183         for (int32_t i=0; i<vector.vlen; i++)
00184             ret[i]=vector.vector[i];
00185     }
00186 
00187     return SGVector<float64_t>(ret,m_num_idx);
00188 }
00189 
00190 void CPruneVarSubMean::init()
00191 {
00192     m_initialized = false;
00193     m_divide_by_std = false;
00194     m_num_idx = 0;
00195     m_idx = SGVector<int32_t>();
00196     m_mean = SGVector<float64_t>();
00197     m_std = SGVector<float64_t>();
00198 }
00199 
00200 void CPruneVarSubMean::register_parameters()
00201 {
00202     SG_ADD(&m_initialized, "initialized", "The prerpocessor is initialized",  MS_NOT_AVAILABLE);
00203     SG_ADD(&m_divide_by_std, "divide_by_std", "Divide by standard deviation", MS_AVAILABLE);
00204     SG_ADD(&m_num_idx, "num_idx", "Number of elements in idx_vec", MS_NOT_AVAILABLE);
00205     SG_ADD(&m_std, "std_vec", "Standard dev vector", MS_NOT_AVAILABLE);
00206     SG_ADD(&m_mean, "mean_vec", "Mean vector", MS_NOT_AVAILABLE);
00207     SG_ADD(&m_idx, "idx_vec", "Index vector", MS_NOT_AVAILABLE);
00208 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation