SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
SparsePolyFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2010 Soeren Sonnenburg
00008  * Copyright (C) 2010 Berlin Institute of Technology
00009  */
00010 #include <shogun/features/SparsePolyFeatures.h>
00011 #include <shogun/lib/Hash.h>
00012 
00013 using namespace shogun;
00014 
00015 CSparsePolyFeatures::CSparsePolyFeatures()
00016 {
00017     SG_UNSTABLE("CSparsePolyFeatures::CSparsePolyFeatures()",
00018                 "\n");
00019 
00020     m_feat = NULL;
00021     m_degree = 0;
00022     m_normalize = false;
00023     m_input_dimensions = 0;
00024     m_output_dimensions = 0;
00025     m_normalization_values = NULL;
00026     mask = 0;
00027     m_hash_bits = 0;
00028 }
00029 
00030 CSparsePolyFeatures::CSparsePolyFeatures(CSparseFeatures<float64_t>* feat, int32_t degree, bool normalize, int32_t hash_bits)
00031     : CDotFeatures(), m_normalization_values(NULL)
00032 {
00033     ASSERT(feat)
00034 
00035     m_feat = feat;
00036     SG_REF(m_feat);
00037     m_degree=degree;
00038     m_normalize=normalize;
00039     m_hash_bits=hash_bits;
00040     mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
00041     m_output_dimensions=1<<m_hash_bits;
00042     m_input_dimensions=feat->get_num_features();
00043 
00044     if (m_normalize)
00045         store_normalization_values();
00046 }
00047 
00048 CSparsePolyFeatures::~CSparsePolyFeatures()
00049 {
00050     SG_FREE(m_normalization_values);
00051     SG_UNREF(m_feat);
00052 }
00053 
00054 CSparsePolyFeatures::CSparsePolyFeatures(const CSparsePolyFeatures & orig)
00055 {
00056     SG_PRINT("CSparsePolyFeatures:\n")
00057     SG_NOTIMPLEMENTED
00058 }
00059 
00060 int32_t CSparsePolyFeatures::get_dim_feature_space() const
00061 {
00062     return m_output_dimensions;
00063 }
00064 
00065 int32_t CSparsePolyFeatures::get_nnz_features_for_vector(int32_t num)
00066 {
00067     int32_t vlen;
00068     SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(num);
00069     vlen=vec.num_feat_entries;
00070     m_feat->free_feature_vector(num);
00071     return vlen*(vlen+1)/2;
00072 }
00073 
00074 EFeatureType CSparsePolyFeatures::get_feature_type() const
00075 {
00076     return F_UNKNOWN;
00077 }
00078 
00079 EFeatureClass CSparsePolyFeatures::get_feature_class() const
00080 {
00081     return C_POLY;
00082 }
00083 
00084 int32_t CSparsePolyFeatures::get_num_vectors() const
00085 {
00086     if (m_feat)
00087         return m_feat->get_num_vectors();
00088     else
00089         return 0;
00090 
00091 }
00092 
00093 void* CSparsePolyFeatures::get_feature_iterator(int32_t vector_index)
00094 {
00095     SG_NOTIMPLEMENTED
00096     return NULL;
00097 }
00098 
00099 bool CSparsePolyFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
00100 {
00101     SG_NOTIMPLEMENTED
00102     return false;
00103 }
00104 
00105 void CSparsePolyFeatures::free_feature_iterator(void* iterator)
00106 {
00107     SG_NOTIMPLEMENTED
00108 }
00109 
00110 float64_t CSparsePolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00111 {
00112     ASSERT(df)
00113     ASSERT(df->get_feature_type() == get_feature_type())
00114     ASSERT(df->get_feature_class() == get_feature_class())
00115 
00116     CSparsePolyFeatures* pf=(CSparsePolyFeatures*) df;
00117 
00118     SGSparseVector<float64_t> vec1=m_feat->get_sparse_feature_vector(vec_idx1);
00119     SGSparseVector<float64_t> vec2=pf->m_feat->get_sparse_feature_vector(
00120             vec_idx2);
00121 
00122     float64_t result=SGSparseVector<float64_t>::sparse_dot(vec1, vec2);
00123     result=CMath::pow(result, m_degree);
00124 
00125     m_feat->free_feature_vector(vec_idx1);
00126     pf->m_feat->free_feature_vector(vec_idx2);
00127 
00128     return result;
00129 }
00130 
00131 float64_t CSparsePolyFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len)
00132 {
00133     if (vec2_len != m_output_dimensions)
00134         SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions)
00135 
00136     SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00137 
00138     float64_t result=0;
00139 
00140     if (vec.features)
00141     {
00142         if (m_degree==2)
00143         {
00144             /* (a+b)^2 = a^2 + 2ab +b^2 */
00145             for (int32_t i=0; i<vec.num_feat_entries; i++)
00146             {
00147                 float64_t v1=vec.features[i].entry;
00148                 uint32_t seed=CHash::MurmurHash3(
00149                         (uint8_t*)&(vec.features[i].feat_index),
00150                         sizeof(int32_t), 0xDEADBEAF);
00151 
00152                 for (int32_t j=i; j<vec.num_feat_entries; j++)
00153                 {
00154                     float64_t v2=vec.features[j].entry;
00155                     uint32_t h=CHash::MurmurHash3(
00156                             (uint8_t*)&(vec.features[j].feat_index),
00157                             sizeof(int32_t), seed)&mask;
00158                     float64_t v;
00159 
00160                     if (i==j)
00161                         v=v1*v1;
00162                     else
00163                         v=CMath::sqrt(2.0)*v1*v2;
00164 
00165                     result+=v*vec2[h];
00166                 }
00167             }
00168         }
00169         else if (m_degree==3)
00170             SG_NOTIMPLEMENTED
00171     }
00172 
00173     if (m_normalize)
00174         result/=m_normalization_values[vec_idx1];
00175 
00176     m_feat->free_feature_vector(vec_idx1);
00177     return result;
00178 }
00179 
00180 void CSparsePolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00181 {
00182     if (vec2_len!=m_output_dimensions)
00183         SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions)
00184 
00185     SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1);
00186 
00187     float64_t norm_val=1.0;
00188     if (m_normalize)
00189         norm_val = m_normalization_values[vec_idx1];
00190     alpha/=norm_val;
00191 
00192     if (m_degree==2)
00193     {
00194         /* (a+b)^2 = a^2 + 2ab +b^2 */
00195         for (int32_t i=0; i<vec.num_feat_entries; i++)
00196         {
00197             float64_t v1=vec.features[i].entry;
00198             uint32_t seed=CHash::MurmurHash3(
00199                     (uint8_t*)&(vec.features[i].feat_index), sizeof(int32_t),
00200                     0xDEADBEAF);
00201 
00202             for (int32_t j=i; j<vec.num_feat_entries; j++)
00203             {
00204                 float64_t v2=vec.features[j].entry;
00205                 uint32_t h=CHash::MurmurHash3(
00206                         (uint8_t*)&(vec.features[j].feat_index),
00207                         sizeof(int32_t), seed)&mask;
00208                 float64_t v;
00209 
00210                 if (i==j)
00211                     v=alpha*v1*v1;
00212                 else
00213                     v=alpha*CMath::sqrt(2.0)*v1*v2;
00214 
00215                 if (abs_val)
00216                     vec2[h]+=CMath::abs(v);
00217                 else
00218                     vec2[h]+=v;
00219             }
00220         }
00221     }
00222     else if (m_degree==3)
00223         SG_NOTIMPLEMENTED
00224 
00225     m_feat->free_feature_vector(vec_idx1);
00226 }
00227 
00228 void CSparsePolyFeatures::store_normalization_values()
00229 {
00230     SG_FREE(m_normalization_values);
00231 
00232     m_normalization_values_len = this->get_num_vectors();
00233 
00234     m_normalization_values=SG_MALLOC(float64_t, m_normalization_values_len);
00235     for (int i=0; i<m_normalization_values_len; i++)
00236     {
00237         float64_t val = CMath::sqrt(dot(i, this,i));
00238         if (val==0)
00239             // trap division by zero
00240             m_normalization_values[i]=1.0;
00241         else
00242             m_normalization_values[i]=val;
00243     }
00244 
00245 }
00246 
00247 CFeatures* CSparsePolyFeatures::duplicate() const
00248 {
00249     return new CSparsePolyFeatures(*this);
00250 }
00251 
00252 void CSparsePolyFeatures::init()
00253 {
00254     m_parameters->add((CSGObject**) &m_feat, "features",
00255             "Features in original space.");
00256     m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel.");
00257     m_parameters->add(&m_normalize, "normalize", "Normalize");
00258     m_parameters->add(&m_input_dimensions, "input_dimensions",
00259             "Dimensions of the input space.");
00260     m_parameters->add(&m_output_dimensions, "output_dimensions",
00261             "Dimensions of the feature space of the polynomial kernel.");
00262     m_normalization_values_len = get_num_vectors();
00263     m_parameters->add_vector(&m_normalization_values, &m_normalization_values_len,
00264             "m_normalization_values", "Norm of each training example");
00265     m_parameters->add(&mask, "mask", "Mask.");
00266     m_parameters->add(&m_hash_bits, "m_hash_bits", "Number of bits in hash");
00267 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation