// SHOGUN v3.2.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <shogun/features/WDFeatures.h> 00012 #include <shogun/io/SGIO.h> 00013 00014 using namespace shogun; 00015 00016 CWDFeatures::CWDFeatures() :CDotFeatures() 00017 { 00018 SG_UNSTABLE("CWDFeatures::CWDFeatures() :CDotFeatures()", 00019 "\n"); 00020 00021 strings = NULL; 00022 00023 degree = 0; 00024 from_degree = 0; 00025 string_length = 0; 00026 num_strings = 0; 00027 alphabet_size = 0; 00028 w_dim = 0; 00029 wd_weights = NULL; 00030 normalization_const = 0.0; 00031 } 00032 00033 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str, 00034 int32_t order, int32_t from_order) : CDotFeatures() 00035 { 00036 ASSERT(str) 00037 ASSERT(str->have_same_length()) 00038 SG_REF(str); 00039 00040 strings=str; 00041 string_length=str->get_max_vector_length(); 00042 num_strings=str->get_num_vectors(); 00043 CAlphabet* alpha=str->get_alphabet(); 00044 alphabet_size=alpha->get_num_symbols(); 00045 SG_UNREF(alpha); 00046 00047 degree=order; 00048 from_degree=from_order; 00049 wd_weights=NULL; 00050 set_wd_weights(); 00051 set_normalization_const(); 00052 00053 } 00054 00055 CWDFeatures::CWDFeatures(const CWDFeatures& orig) 00056 : CDotFeatures(orig), strings(orig.strings), 00057 degree(orig.degree), from_degree(orig.from_degree), 00058 normalization_const(orig.normalization_const) 00059 { 00060 SG_REF(strings); 00061 00062 if (strings) 00063 { 00064 string_length=strings->get_max_vector_length(); 00065 num_strings=strings->get_num_vectors(); 00066 CAlphabet* alpha=strings->get_alphabet(); 00067 alphabet_size=alpha->get_num_symbols(); 00068 
SG_UNREF(alpha); 00069 } 00070 else 00071 { 00072 string_length = 0; 00073 num_strings = 0; 00074 alphabet_size = 0; 00075 } 00076 00077 wd_weights=NULL; 00078 if (degree>0) 00079 set_wd_weights(); 00080 } 00081 00082 CWDFeatures::~CWDFeatures() 00083 { 00084 SG_UNREF(strings); 00085 SG_FREE(wd_weights); 00086 } 00087 00088 float64_t CWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00089 { 00090 ASSERT(df) 00091 ASSERT(df->get_feature_type() == get_feature_type()) 00092 ASSERT(df->get_feature_class() == get_feature_class()) 00093 CWDFeatures* wdf = (CWDFeatures*) df; 00094 00095 int32_t len1, len2; 00096 bool free_vec1, free_vec2; 00097 00098 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00099 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2); 00100 00101 ASSERT(len1==len2) 00102 00103 float64_t sum=0.0; 00104 00105 for (int32_t i=0; i<len1; i++) 00106 { 00107 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00108 { 00109 if (vec1[i+j]!=vec2[i+j]) 00110 break ; 00111 sum += wd_weights[j]*wd_weights[j]; 00112 } 00113 } 00114 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00115 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00116 return sum/CMath::sq(normalization_const); 00117 } 00118 00119 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00120 { 00121 if (vec2_len != w_dim) 00122 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim) 00123 00124 float64_t sum=0; 00125 int32_t lim=CMath::min(degree, string_length); 00126 int32_t len; 00127 bool free_vec1; 00128 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00129 int32_t* val=SG_MALLOC(int32_t, len); 00130 SGVector<int32_t>::fill_vector(val, len, 0); 00131 00132 int32_t asize=alphabet_size; 00133 int32_t asizem1=1; 00134 int32_t offs=0; 00135 00136 for (int32_t k=0; k<lim; k++) 00137 { 00138 float64_t wd = wd_weights[k]; 00139 00140 int32_t 
o=offs; 00141 for (int32_t i=0; i+k < len; i++) 00142 { 00143 val[i]+=asizem1*vec[i+k]; 00144 sum+=vec2[val[i]+o]*wd; 00145 o+=asize; 00146 } 00147 offs+=asize*len; 00148 asize*=alphabet_size; 00149 asizem1*=alphabet_size; 00150 } 00151 SG_FREE(val); 00152 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00153 00154 return sum/normalization_const; 00155 } 00156 00157 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00158 { 00159 if (vec2_len != w_dim) 00160 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim) 00161 00162 int32_t lim=CMath::min(degree, string_length); 00163 int32_t len; 00164 bool free_vec1; 00165 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00166 int32_t* val=SG_MALLOC(int32_t, len); 00167 SGVector<int32_t>::fill_vector(val, len, 0); 00168 00169 int32_t asize=alphabet_size; 00170 int32_t asizem1=1; 00171 int32_t offs=0; 00172 00173 for (int32_t k=0; k<lim; k++) 00174 { 00175 float64_t wd = alpha*wd_weights[k]/normalization_const; 00176 00177 if (abs_val) 00178 wd=CMath::abs(wd); 00179 00180 int32_t o=offs; 00181 for (int32_t i=0; i+k < len; i++) 00182 { 00183 val[i]+=asizem1*vec[i+k]; 00184 vec2[val[i]+o]+=wd; 00185 o+=asize; 00186 } 00187 offs+=asize*len; 00188 asize*=alphabet_size; 00189 asizem1*=alphabet_size; 00190 } 00191 SG_FREE(val); 00192 00193 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00194 } 00195 00196 void CWDFeatures::set_wd_weights() 00197 { 00198 ASSERT(degree>0 && degree<=8) 00199 SG_FREE(wd_weights); 00200 wd_weights=SG_MALLOC(float64_t, degree); 00201 w_dim=0; 00202 00203 for (int32_t i=0; i<degree; i++) 00204 { 00205 w_dim+=CMath::pow(alphabet_size, i+1)*string_length; 00206 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00207 } 00208 SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, 
num_strings, string_length) 00209 } 00210 00211 00212 void CWDFeatures::set_normalization_const(float64_t n) 00213 { 00214 if (n==0) 00215 { 00216 normalization_const=0; 00217 for (int32_t i=0; i<degree; i++) 00218 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00219 00220 normalization_const=CMath::sqrt(normalization_const); 00221 } 00222 else 00223 normalization_const=n; 00224 00225 SG_DEBUG("normalization_const:%f\n", normalization_const) 00226 } 00227 00228 void* CWDFeatures::get_feature_iterator(int32_t vector_index) 00229 { 00230 if (vector_index>=num_strings) 00231 { 00232 SG_ERROR("Index out of bounds (number of strings %d, you " 00233 "requested %d)\n", num_strings, vector_index); 00234 } 00235 00236 wd_feature_iterator* it=SG_MALLOC(wd_feature_iterator, 1); 00237 00238 it->lim=CMath::min(degree, string_length); 00239 it->vec= strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00240 it->vidx=vector_index; 00241 00242 it->vec = strings->get_feature_vector(vector_index, it->vlen, it->vfree); 00243 it->val=SG_MALLOC(int32_t, it->vlen); 00244 SGVector<int32_t>::fill_vector(it->val, it->vlen, 0); 00245 00246 it->asize=alphabet_size; 00247 it->asizem1=1; 00248 it->offs=0; 00249 it->k=0; 00250 it->i=0; 00251 it->o=0; 00252 00253 return it; 00254 } 00255 00256 bool CWDFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00257 { 00258 wd_feature_iterator* it=(wd_feature_iterator*) iterator; 00259 00260 if (it->i + it->k >= it->vlen) 00261 { 00262 if (it->k < it->lim-1) 00263 { 00264 it->offs+=it->asize*it->vlen; 00265 it->asize*=alphabet_size; 00266 it->asizem1*=alphabet_size; 00267 it->k++; 00268 it->i=0; 00269 it->o=it->offs; 00270 } 00271 else 00272 return false; 00273 } 00274 00275 int32_t i=it->i; 00276 int32_t k=it->k; 00277 #ifdef DEBUG_WDFEATURES 00278 SG_PRINT("i=%d k=%d offs=%d o=%d asize=%d asizem1=%d\n", i, k, it->offs, it->o, it->asize, it->asizem1) 00279 #endif 00280 00281 
it->val[i]+=it->asizem1*it->vec[i+k]; 00282 value=wd_weights[k]/normalization_const; 00283 index=it->val[i]+it->o; 00284 #ifdef DEBUG_WDFEATURES 00285 SG_PRINT("index=%d val=%f w_size=%d lim=%d vlen=%d\n", index, value, w_dim, it->lim, it->vlen) 00286 #endif 00287 00288 it->o+=it->asize; 00289 it->i=i+1; 00290 00291 return true; 00292 } 00293 00294 void CWDFeatures::free_feature_iterator(void* iterator) 00295 { 00296 ASSERT(iterator) 00297 wd_feature_iterator* it=(wd_feature_iterator*) iterator; 00298 strings->free_feature_vector(it->vec, it->vidx, it->vfree); 00299 SG_FREE(it->val); 00300 SG_FREE(it); 00301 } 00302 00303 CFeatures* CWDFeatures::duplicate() const 00304 { 00305 return new CWDFeatures(*this); 00306 } 00307 00308 int32_t CWDFeatures::get_dim_feature_space() const 00309 { 00310 return w_dim; 00311 } 00312 00313 int32_t CWDFeatures::get_nnz_features_for_vector(int32_t num) 00314 { 00315 int32_t vlen=-1; 00316 bool free_vec; 00317 uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec); 00318 strings->free_feature_vector(vec, num, free_vec); 00319 return degree*vlen; 00320 } 00321 00322 EFeatureType CWDFeatures::get_feature_type() const 00323 { 00324 return F_UNKNOWN; 00325 } 00326 00327 EFeatureClass CWDFeatures::get_feature_class() const 00328 { 00329 return C_WD; 00330 } 00331 00332 int32_t CWDFeatures::get_num_vectors() const 00333 { 00334 return num_strings; 00335 } 00336 00337 float64_t CWDFeatures::get_normalization_const() 00338 { 00339 return normalization_const; 00340 } 00341 00342 void CWDFeatures::set_wd_weights(SGVector<float64_t> weights) 00343 { 00344 ASSERT(weights.vlen==degree) 00345 00346 for (int32_t i=0; i<degree; i++) 00347 wd_weights[i]=weights.vector[i]; 00348 } 00349