SHOGUN
v3.2.0
|
00001 #include <shogun/features/PolyFeatures.h> 00002 00003 using namespace shogun; 00004 00005 CPolyFeatures::CPolyFeatures() :CDotFeatures() 00006 { 00007 m_feat=NULL; 00008 m_degree=0; 00009 m_normalize=false; 00010 m_input_dimensions=0; 00011 m_multi_index=NULL; 00012 m_multinomial_coefficients=NULL; 00013 m_normalization_values=NULL; 00014 m_output_dimensions=0; 00015 00016 register_parameters(); 00017 } 00018 00019 CPolyFeatures::CPolyFeatures(CDenseFeatures<float64_t>* feat, int32_t degree, bool normalize) 00020 : CDotFeatures(), m_multi_index(NULL), m_multinomial_coefficients(NULL), 00021 m_normalization_values(NULL) 00022 { 00023 ASSERT(feat) 00024 00025 m_feat = feat; 00026 SG_REF(m_feat); 00027 m_degree=degree; 00028 m_normalize=normalize; 00029 m_input_dimensions=feat->get_num_features(); 00030 m_output_dimensions=calc_feature_space_dimensions(m_input_dimensions, m_degree); 00031 00032 store_multi_index(); 00033 store_multinomial_coefficients(); 00034 if (m_normalize) 00035 store_normalization_values(); 00036 00037 register_parameters(); 00038 } 00039 00040 00041 CPolyFeatures::~CPolyFeatures() 00042 { 00043 SG_FREE(m_multi_index); 00044 SG_FREE(m_multinomial_coefficients); 00045 SG_FREE(m_normalization_values); 00046 SG_UNREF(m_feat); 00047 } 00048 00049 CPolyFeatures::CPolyFeatures(const CPolyFeatures & orig) 00050 { 00051 SG_PRINT("CPolyFeatures:\n") 00052 SG_NOTIMPLEMENTED 00053 }; 00054 00055 int32_t CPolyFeatures::get_dim_feature_space() const 00056 { 00057 return m_output_dimensions; 00058 } 00059 00060 int32_t CPolyFeatures::get_nnz_features_for_vector(int32_t num) 00061 { 00062 return m_output_dimensions; 00063 } 00064 00065 EFeatureType CPolyFeatures::get_feature_type() const 00066 { 00067 return F_UNKNOWN; 00068 } 00069 00070 EFeatureClass CPolyFeatures::get_feature_class() const 00071 { 00072 return C_POLY; 00073 } 00074 00075 int32_t CPolyFeatures::get_num_vectors() const 00076 { 00077 if (m_feat) 00078 return m_feat->get_num_vectors(); 00079 else 00080 return 0; 00081 00082 } 00083 00084 void* CPolyFeatures::get_feature_iterator(int32_t vector_index) 00085 { 00086 SG_NOTIMPLEMENTED 00087 return NULL; 00088 } 00089 00090 bool CPolyFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00091 { 00092 SG_NOTIMPLEMENTED 00093 return false; 00094 } 00095 00096 void CPolyFeatures::free_feature_iterator(void* iterator) 00097 { 00098 SG_NOTIMPLEMENTED 00099 } 00100 00101 00102 00103 float64_t CPolyFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00104 { 00105 ASSERT(df) 00106 ASSERT(df->get_feature_type() == get_feature_type()) 00107 ASSERT(df->get_feature_class() == get_feature_class()) 00108 00109 CPolyFeatures* pf=(CPolyFeatures*) df; 00110 00111 int32_t len1; 00112 bool do_free1; 00113 float64_t* vec1 = m_feat->get_feature_vector(vec_idx1, len1, do_free1); 00114 00115 int32_t len2; 00116 bool do_free2; 00117 float64_t* vec2 = pf->m_feat->get_feature_vector(vec_idx2, len2, do_free2); 00118 00119 float64_t sum=0; 00120 int cnt=0; 00121 for (int j=0; j<m_output_dimensions; j++) 00122 { 00123 float64_t out1=m_multinomial_coefficients[j]; 00124 float64_t out2=m_multinomial_coefficients[j]; 00125 for (int k=0; k<m_degree; k++) 00126 { 00127 out1*=vec1[m_multi_index[cnt]]; 00128 out2*=vec2[m_multi_index[cnt]]; 00129 cnt++; 00130 } 00131 sum+=out1*out2; 00132 } 00133 m_feat->free_feature_vector(vec1, len1, do_free1); 00134 pf->m_feat->free_feature_vector(vec2, len2, do_free2); 00135 00136 return sum; 00137 } 00138 00139 float64_t CPolyFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00140 { 00141 if (vec2_len != m_output_dimensions) 00142 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions) 00143 00144 int32_t len; 00145 bool do_free; 00146 float64_t* vec = m_feat->get_feature_vector(vec_idx1, len, do_free); 00147 00148 00149 int cnt=0; 00150 float64_t sum=0; 00151 for (int j=0; j<vec2_len; j++) 00152 { 00153 float64_t output=m_multinomial_coefficients[j]; 00154 for (int k=0; k<m_degree; k++) 00155 { 00156 output*=vec[m_multi_index[cnt]]; 00157 cnt++; 00158 } 00159 sum+=output*vec2[j]; 00160 } 00161 if (m_normalize) 00162 sum = sum/m_normalization_values[vec_idx1]; 00163 00164 m_feat->free_feature_vector(vec, len, do_free); 00165 return sum; 00166 } 00167 void CPolyFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00168 { 00169 if (vec2_len != m_output_dimensions) 00170 SG_ERROR("Dimensions don't match, vec2_dim=%d, m_output_dimensions=%d\n", vec2_len, m_output_dimensions) 00171 00172 int32_t len; 00173 bool do_free; 00174 float64_t* vec = m_feat->get_feature_vector(vec_idx1, len, do_free); 00175 00176 00177 int cnt=0; 00178 float32_t norm_val=1; 00179 if (m_normalize) 00180 norm_val = m_normalization_values[vec_idx1]; 00181 alpha/=norm_val; 00182 for (int j=0; j<vec2_len; j++) 00183 { 00184 float64_t output=m_multinomial_coefficients[j]; 00185 for (int k=0; k<m_degree; k++) 00186 { 00187 output*=vec[m_multi_index[cnt]]; 00188 cnt++; 00189 } 00190 if (abs_val) 00191 output=CMath::abs(output); 00192 00193 vec2[j]+=alpha*output; 00194 } 00195 m_feat->free_feature_vector(vec, len, do_free); 00196 } 00197 void CPolyFeatures::store_normalization_values() 00198 { 00199 SG_FREE(m_normalization_values); 00200 00201 int32_t num_vec = this->get_num_vectors(); 00202 00203 m_normalization_values=SG_MALLOC(float32_t, num_vec); 00204 for (int i=0; i<num_vec; i++) 00205 { 00206 float64_t tmp = CMath::sqrt(dot(i, this,i)); 00207 if (tmp==0) 00208 // trap division by zero 00209 m_normalization_values[i]=1; 00210 else 00211 m_normalization_values[i]=tmp; 00212 } 00213 00214 } 00215 00216 void CPolyFeatures::store_multi_index() 00217 { 00218 SG_FREE(m_multi_index); 00219 00220 m_multi_index=SG_MALLOC(uint16_t, m_output_dimensions*m_degree); 00221 00222 uint16_t* exponents = SG_MALLOC(uint16_t, m_input_dimensions); 00223 if (!exponents) 00224 SG_ERROR("Error allocating mem \n") 00225 /*copy adress: otherwise it will be overwritten in recursion*/ 00226 uint16_t* index = m_multi_index; 00227 enumerate_multi_index(0, &index, exponents, m_degree); 00228 00229 SG_FREE(exponents); 00230 } 00231 00232 void CPolyFeatures::enumerate_multi_index(const int32_t feat_idx, uint16_t** index, uint16_t* exponents, const int32_t degree) 00233 { 00234 if (feat_idx==m_input_dimensions-1 || degree==0) 00235 { 00236 if (feat_idx==m_input_dimensions-1) 00237 exponents[feat_idx] = degree; 00238 if (degree==0) 00239 exponents[feat_idx] = 0; 00240 int32_t i, j; 00241 for (j=0; j<feat_idx+1; j++) 00242 for (i=0; i<exponents[j]; i++) 00243 { 00244 **index = j; 00245 (*index)++; 00246 } 00247 exponents[feat_idx] = 0; 00248 return; 00249 } 00250 int32_t k; 00251 for (k=0; k<=degree; k++) 00252 { 00253 exponents[feat_idx] = k; 00254 enumerate_multi_index(feat_idx+1, index, exponents, degree-k); 00255 } 00256 return; 00257 00258 } 00259 00260 void CPolyFeatures::store_multinomial_coefficients() 00261 { 00262 SG_FREE(m_multinomial_coefficients); 00263 00264 m_multinomial_coefficients = SG_MALLOC(float64_t, m_output_dimensions); 00265 int32_t* exponents = SG_MALLOC(int32_t, m_input_dimensions); 00266 if (!exponents) 00267 SG_ERROR("Error allocating mem \n") 00268 int32_t j=0; 00269 for (j=0; j<m_input_dimensions; j++) 00270 exponents[j] = 0; 00271 int32_t k, cnt=0; 00272 for (j=0; j<m_output_dimensions; j++) 00273 { 00274 for (k=0; k<m_degree; k++) 00275 { 00276 exponents[m_multi_index[cnt]] ++; 00277 cnt++; 00278 } 00279 m_multinomial_coefficients[j] = sqrt((double) multinomialcoef(exponents, m_input_dimensions)); 00280 for (k=0; k<m_input_dimensions; k++) 00281 { 00282 exponents[k]=0; 00283 } 00284 } 00285 SG_FREE(exponents); 00286 } 00287 00288 int32_t CPolyFeatures::bico2(int32_t n, int32_t k) 00289 { 00290 00291 /* for this problem k is usually small (<=degree), 00292 * thus it is efficient to 00293 * to use recursion and prune end recursions*/ 00294 if (n<k) 00295 return 0; 00296 if (k>n/2) 00297 k = n-k; 00298 if (k<0) 00299 return 0; 00300 if (k==0) 00301 return 1; 00302 if (k==1) 00303 return n; 00304 if (k<4) 00305 return bico2(n-1, k-1)+bico2(n-1, k); 00306 00307 /* call function as implemented in numerical recipies: 00308 * much more efficient for large binomial coefficients*/ 00309 return bico(n, k); 00310 00311 } 00312 00313 int32_t CPolyFeatures::calc_feature_space_dimensions(int32_t N, int32_t D) 00314 { 00315 if (N==1) 00316 return 1; 00317 if (D==0) 00318 return 1; 00319 int32_t d; 00320 int32_t ret = 0; 00321 for (d=0; d<=D; d++) 00322 ret += calc_feature_space_dimensions(N-1, d); 00323 00324 return ret; 00325 } 00326 00327 int32_t CPolyFeatures::multinomialcoef(int32_t* exps, int32_t len) 00328 { 00329 int32_t ret = 1, i; 00330 int32_t n = 0; 00331 for (i=0; i<len; i++) 00332 { 00333 n += exps[i]; 00334 ret *= bico2(n, exps[i]); 00335 } 00336 return ret; 00337 } 00338 00339 /* gammln as implemented in the 00340 * second edition of Numerical Recipes in C */ 00341 float64_t CPolyFeatures::gammln(float64_t xx) 00342 { 00343 float64_t x,y,tmp,ser; 00344 static float64_t cof[6]={76.18009172947146, -86.50532032941677, 00345 24.01409824083091, -1.231739572450155, 00346 0.1208650973866179e-2,-0.5395239384953e-5}; 00347 int32_t j; 00348 00349 y=x=xx; 00350 tmp=x+5.5; 00351 tmp -= (x+0.5)*log(tmp); 00352 ser=1.000000000190015; 00353 for (j=0;j<=5;j++) ser += cof[j]/++y; 00354 return -tmp+log(2.5066282746310005*ser/x); 00355 } 00356 00357 float64_t CPolyFeatures::factln(int32_t n) 00358 { 00359 static float64_t a[101]; 00360 00361 if (n < 0) SG_ERROR("Negative factorial in routine factln\n") 00362 if (n <= 1) return 0.0; 00363 if (n <= 100) return a[n] ? a[n] : (a[n]=gammln(n+1.0)); 00364 else return gammln(n+1.0); 00365 } 00366 00367 int32_t CPolyFeatures::bico(int32_t n, int32_t k) 00368 { 00369 /* use floor to clean roundoff errors*/ 00370 return (int32_t) floor(0.5+exp(factln(n)-factln(k)-factln(n-k))); 00371 } 00372 CFeatures* CPolyFeatures::duplicate() const 00373 { 00374 return new CPolyFeatures(*this); 00375 } 00376 00377 void CPolyFeatures::register_parameters() 00378 { 00379 m_parameters->add((CSGObject**) &m_feat, "features", 00380 "Features in original space."); 00381 m_parameters->add(&m_degree, "degree", "Degree of the polynomial kernel."); 00382 m_parameters->add(&m_normalize, "normalize", "Normalize?"); 00383 m_parameters->add(&m_input_dimensions, "input_dimensions", 00384 "Dimensions of the input space."); 00385 m_parameters->add(&m_output_dimensions, "output_dimensions", 00386 "Dimensions of the feature space of the polynomial kernel."); 00387 00388 multi_index_length=m_output_dimensions*m_degree; 00389 m_parameters->add_vector( 00390 &m_multi_index, 00391 &multi_index_length, 00392 "multi_index", 00393 "Flattened matrix of all multi indices that sum do the" 00394 " degree of the polynomial kernel."); 00395 00396 multinomial_coefficients_length=m_output_dimensions; 00397 m_parameters->add_vector(&m_multinomial_coefficients, 00398 &multinomial_coefficients_length, "multinomial_coefficients", 00399 "Multinomial coefficients for all multi-indices."); 00400 00401 normalization_values_length=get_num_vectors(); 00402 m_parameters->add_vector(&m_normalization_values, 00403 &normalization_values_length, "normalization_values", 00404 "Norm of each training example."); 00405 }