SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 #include <shogun/features/SparsePolyFeatures.h> 00011 #include <shogun/lib/Hash.h> 00012 00013 using namespace shogun; 00014 00015 CSparsePolyFeatures::CSparsePolyFeatures() 00016 { 00017 SG_UNSTABLE("CSparsePolyFeatures::CSparsePolyFeatures()", 00018 "\n"); 00019 00020 m_feat = NULL; 00021 m_degree = 0; 00022 m_normalize = false; 00023 m_input_dimensions = 0; 00024 m_output_dimensions = 0; 00025 m_normalization_values = NULL; 00026 mask = 0; 00027 m_hash_bits = 0; 00028 } 00029 00030 CSparsePolyFeatures::CSparsePolyFeatures(CSparseFeatures<float64_t>* feat, int32_t degree, bool normalize, int32_t hash_bits) 00031 : CDotFeatures(), m_normalization_values(NULL) 00032 { 00033 ASSERT(feat) 00034 00035 m_feat = feat; 00036 SG_REF(m_feat); 00037 m_degree=degree; 00038 m_normalize=normalize; 00039 m_hash_bits=hash_bits; 00040 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00041 m_output_dimensions=1<<m_hash_bits; 00042 m_input_dimensions=feat->get_num_features(); 00043 00044 if (m_normalize) 00045 store_normalization_values(); 00046 } 00047 00048 CSparsePolyFeatures::~CSparsePolyFeatures() 00049 { 00050 SG_FREE(m_normalization_values); 00051 SG_UNREF(m_feat); 00052 } 00053 00054 CSparsePolyFeatures::CSparsePolyFeatures(const CSparsePolyFeatures & orig) 00055 { 00056 SG_PRINT("CSparsePolyFeatures:\n") 00057 SG_NOTIMPLEMENTED 00058 } 00059 00060 int32_t CSparsePolyFeatures::get_dim_feature_space() const 00061 { 00062 return m_output_dimensions; 00063 } 00064 00065 int32_t CSparsePolyFeatures::get_nnz_features_for_vector(int32_t num) 00066 { 00067 int32_t vlen; 00068 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(num); 00069 vlen=vec.num_feat_entries; 00070 m_feat->free_feature_vector(num); 00071 return vlen*(vlen+1)/2; 00072 } 00073 00074 EFeatureType CSparsePolyFeatures::get_feature_type() const 00075 { 00076 return F_UNKNOWN; 00077 } 00078 00079 EFeatureClass CSparsePolyFeatures::get_feature_class() const 00080 { 00081 return C_POLY; 00082 } 00083 00084 int32_t CSparsePolyFeatures::get_num_vectors() const 00085 { 00086 if (m_feat) 00087 return m_feat->get_num_vectors(); 00088 else 00089 return 0; 00090 00091 } 00092 00093 void* CSparsePolyFeatures::get_feature_iterator(int32_t vector_index) 00094 { 00095 SG_NOTIMPLEMENTED 00096 return NULL; 00097 } 00098 00099 bool CSparsePolyFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00100 { 00101 SG_NOTIMPLEMENTED 00102 return false; 00103 } 00104 00105 void CSparsePolyFeatures::free_feature_iterator(void* iterator) 00106 { 00107 SG_NOTIMPLEMENTED 00108 } 00109 00110 float64_t CSparsePolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00111 { 00112 ASSERT(df) 00113 ASSERT(df->get_feature_type() == get_feature_type()) 00114 ASSERT(df->get_feature_class() == get_feature_class()) 00115 00116 CSparsePolyFeatures* pf=(CSparsePolyFeatures*) df; 00117 00118 SGSparseVector<float64_t> vec1=m_feat->get_sparse_feature_vector(vec_idx1); 00119 SGSparseVector<float64_t> vec2=pf->m_feat->get_sparse_feature_vector( 00120 vec_idx2); 00121 00122 float64_t result=SGSparseVector<float64_t>::sparse_dot(vec1, vec2); 00123 result=CMath::pow(result, m_degree); 00124 00125 m_feat->free_feature_vector(vec_idx1); 00126 pf->m_feat->free_feature_vector(vec_idx2); 00127 00128 return result; 00129 } 00130 00131 float64_t CSparsePolyFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00132 { 00133 if (vec2_len != m_output_dimensions) 00134 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions) 00135 00136 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1); 00137 00138 float64_t result=0; 00139 00140 if (vec.features) 00141 { 00142 if (m_degree==2) 00143 { 00144 /* (a+b)^2 = a^2 + 2ab +b^2 */ 00145 for (int32_t i=0; i<vec.num_feat_entries; i++) 00146 { 00147 float64_t v1=vec.features[i].entry; 00148 uint32_t seed=CHash::MurmurHash3( 00149 (uint8_t*)&(vec.features[i].feat_index), 00150 sizeof(int32_t), 0xDEADBEAF); 00151 00152 for (int32_t j=i; j<vec.num_feat_entries; j++) 00153 { 00154 float64_t v2=vec.features[j].entry; 00155 uint32_t h=CHash::MurmurHash3( 00156 (uint8_t*)&(vec.features[j].feat_index), 00157 sizeof(int32_t), seed)&mask; 00158 float64_t v; 00159 00160 if (i==j) 00161 v=v1*v1; 00162 else 00163 v=CMath::sqrt(2.0)*v1*v2; 00164 00165 result+=v*vec2[h]; 00166 } 00167 } 00168 } 00169 else if (m_degree==3) 00170 SG_NOTIMPLEMENTED 00171 } 00172 00173 if (m_normalize) 00174 result/=m_normalization_values[vec_idx1]; 00175 00176 m_feat->free_feature_vector(vec_idx1); 00177 return result; 00178 } 00179 00180 void CSparsePolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00181 { 00182 if (vec2_len!=m_output_dimensions) 00183 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions) 00184 00185 SGSparseVector<float64_t> vec=m_feat->get_sparse_feature_vector(vec_idx1); 00186 00187 float64_t norm_val=1.0; 00188 if (m_normalize) 00189 norm_val = m_normalization_values[vec_idx1]; 00190 alpha/=norm_val; 00191 00192 if (m_degree==2) 00193 { 00194 /* (a+b)^2 = a^2 + 2ab +b^2 */ 00195 for (int32_t i=0; i<vec.num_feat_entries; i++) 00196 { 00197 float64_t v1=vec.features[i].entry; 00198 uint32_t seed=CHash::MurmurHash3( 00199 (uint8_t*)&(vec.features[i].feat_index), sizeof(int32_t), 00200 0xDEADBEAF); 00201 00202 for (int32_t j=i; j<vec.num_feat_entries; j++) 00203 { 00204 float64_t v2=vec.features[j].entry; 00205 uint32_t h=CHash::MurmurHash3( 00206 (uint8_t*)&(vec.features[j].feat_index), 00207 sizeof(int32_t), seed)&mask; 00208 float64_t v; 00209 00210 if (i==j) 00211 v=alpha*v1*v1; 00212 else 00213 v=alpha*CMath::sqrt(2.0)*v1*v2; 00214 00215 if (abs_val) 00216 vec2[h]+=CMath::abs(v); 00217 else 00218 vec2[h]+=v; 00219 } 00220 } 00221 } 00222 else if (m_degree==3) 00223 SG_NOTIMPLEMENTED 00224 00225 m_feat->free_feature_vector(vec_idx1); 00226 } 00227 00228 void CSparsePolyFeatures::store_normalization_values() 00229 { 00230 SG_FREE(m_normalization_values); 00231 00232 m_normalization_values_len = this->get_num_vectors(); 00233 00234 m_normalization_values=SG_MALLOC(float64_t, m_normalization_values_len); 00235 for (int i=0; i<m_normalization_values_len; i++) 00236 { 00237 float64_t val = CMath::sqrt(dot(i, this,i)); 00238 if (val==0) 00239 // trap division by zero 00240 m_normalization_values[i]=1.0; 00241 else 00242 m_normalization_values[i]=val; 00243 } 00244 00245 } 00246 00247 CFeatures* CSparsePolyFeatures::duplicate() const 00248 { 00249 return new CSparsePolyFeatures(*this); 00250 } 00251 00252 void CSparsePolyFeatures::init() 00253 { 00254 m_parameters->add((CSGObject**) &m_feat, "features", 00255 "Features in original space."); 00256 m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel."); 00257 m_parameters->add(&m_normalize, "normalize", "Normalize"); 00258 m_parameters->add(&m_input_dimensions, "input_dimensions", 00259 "Dimensions of the input space."); 00260 m_parameters->add(&m_output_dimensions, "output_dimensions", 00261 "Dimensions of the feature space of the polynomial kernel."); 00262 m_normalization_values_len = get_num_vectors(); 00263 m_parameters->add_vector(&m_normalization_values, &m_normalization_values_len, 00264 "m_normalization_values", "Norm of each training example"); 00265 m_parameters->add(&mask, "mask", "Mask."); 00266 m_parameters->add(&m_hash_bits, "m_hash_bits", "Number of bits in hash"); 00267 }