SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 00011 #include <shogun/features/HashedWDFeatures.h> 00012 #include <shogun/io/SGIO.h> 00013 00014 using namespace shogun; 00015 00016 CHashedWDFeatures::CHashedWDFeatures() :CDotFeatures() 00017 { 00018 SG_UNSTABLE("CHashedWDFeatures::CHashedWDFeatures()", "\n") 00019 00020 strings = NULL; 00021 00022 degree = 0; 00023 start_degree = 0; 00024 from_degree = 0; 00025 string_length = 0; 00026 num_strings = 0; 00027 alphabet_size = 0; 00028 w_dim = 0; 00029 partial_w_dim = 0; 00030 wd_weights = NULL; 00031 mask = 0; 00032 m_hash_bits = 0; 00033 00034 normalization_const = 0.0; 00035 } 00036 00037 CHashedWDFeatures::CHashedWDFeatures(CStringFeatures<uint8_t>* str, 00038 int32_t start_order, int32_t order, int32_t from_order, 00039 int32_t hash_bits) : CDotFeatures() 00040 { 00041 ASSERT(start_order>=0) 00042 ASSERT(start_order<order) 00043 ASSERT(order<=from_order) 00044 ASSERT(hash_bits>0) 00045 ASSERT(str) 00046 ASSERT(str->have_same_length()) 00047 SG_REF(str); 00048 00049 strings=str; 00050 string_length=str->get_max_vector_length(); 00051 num_strings=str->get_num_vectors(); 00052 CAlphabet* alpha=str->get_alphabet(); 00053 alphabet_size=alpha->get_num_symbols(); 00054 SG_UNREF(alpha); 00055 00056 degree=order; 00057 start_degree=start_order; 00058 from_degree=from_order; 00059 m_hash_bits=hash_bits; 00060 set_wd_weights(); 00061 set_normalization_const(); 00062 } 00063 00064 CHashedWDFeatures::CHashedWDFeatures(const CHashedWDFeatures& orig) 00065 : CDotFeatures(orig), strings(orig.strings), 00066 degree(orig.degree), start_degree(orig.start_degree), 00067 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits), 00068 normalization_const(orig.normalization_const) 00069 { 00070 00071 00072 SG_REF(strings); 00073 if (strings) 00074 { 00075 string_length=strings->get_max_vector_length(); 00076 num_strings=strings->get_num_vectors(); 00077 CAlphabet* alpha=strings->get_alphabet(); 00078 alphabet_size=alpha->get_num_symbols(); 00079 SG_UNREF(alpha); 00080 } 00081 else 00082 { 00083 string_length = 0; 00084 num_strings = 0; 00085 alphabet_size = 0; 00086 } 00087 00088 if (degree>0) 00089 set_wd_weights(); 00090 } 00091 00092 CHashedWDFeatures::~CHashedWDFeatures() 00093 { 00094 SG_UNREF(strings); 00095 SG_FREE(wd_weights); 00096 } 00097 00098 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00099 { 00100 ASSERT(df) 00101 ASSERT(df->get_feature_type() == get_feature_type()) 00102 ASSERT(df->get_feature_class() == get_feature_class()) 00103 CHashedWDFeatures* wdf = (CHashedWDFeatures*) df; 00104 00105 int32_t len1, len2; 00106 bool free_vec1, free_vec2; 00107 00108 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00109 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2); 00110 00111 ASSERT(len1==len2) 00112 00113 float64_t sum=0.0; 00114 00115 for (int32_t i=0; i<len1; i++) 00116 { 00117 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00118 { 00119 if (vec1[i+j]!=vec2[i+j]) 00120 break; 00121 if (j>=start_degree) 00122 sum += wd_weights[j]*wd_weights[j]; 00123 } 00124 } 00125 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00126 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00127 return sum/CMath::sq(normalization_const); 00128 } 00129 00130 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len) 00131 { 00132 if (vec2_len != w_dim) 00133 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim) 00134 00135 float64_t sum=0; 00136 int32_t lim=CMath::min(degree, string_length); 00137 int32_t len; 00138 bool free_vec1; 00139 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00140 uint32_t* val=SG_MALLOC(uint32_t, len); 00141 00142 uint32_t offs=0; 00143 00144 if (start_degree>0) 00145 { 00146 // compute hash for strings of length start_degree-1 00147 for (int32_t i=0; i+start_degree < len; i++) 00148 val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF); 00149 } 00150 else 00151 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF); 00152 00153 for (int32_t k=start_degree; k<lim; k++) 00154 { 00155 float64_t wd = wd_weights[k]; 00156 00157 uint32_t o=offs; 00158 uint32_t carry = 0; 00159 uint32_t chunk = 0; 00160 00161 for (int32_t i=0; i+k < len; i++) 00162 { 00163 chunk++; 00164 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1); 00165 uint32_t h = 00166 CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk); 00167 #ifdef DEBUG_HASHEDWD 00168 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o) 00169 #endif 00170 sum+=vec2[o+(h & mask)]*wd; 00171 val[i] = h; 00172 o+=partial_w_dim; 00173 } 00174 val[len-k-1] = 00175 CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk); 00176 offs+=partial_w_dim*len; 00177 } 00178 SG_FREE(val); 00179 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00180 00181 return sum/normalization_const; 00182 } 00183 00184 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00185 { 00186 if (vec2_len != w_dim) 00187 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim) 00188 00189 int32_t lim=CMath::min(degree, string_length); 00190 int32_t len; 00191 bool free_vec1; 00192 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00193 uint32_t* val=SG_MALLOC(uint32_t, len); 00194 00195 uint32_t offs=0; 00196 00197 if (start_degree>0) 00198 { 00199 // compute hash for strings of length start_degree-1 00200 for (int32_t i=0; i+start_degree < len; i++) 00201 val[i]=CHash::MurmurHash3(&vec[i], start_degree, 0xDEADBEAF); 00202 } 00203 else 00204 SGVector<uint32_t>::fill_vector(val, len, 0xDEADBEAF); 00205 00206 for (int32_t k=start_degree; k<lim; k++) 00207 { 00208 float64_t wd = alpha*wd_weights[k]/normalization_const; 00209 00210 if (abs_val) 00211 wd=CMath::abs(wd); 00212 00213 uint32_t o=offs; 00214 uint32_t carry = 0; 00215 uint32_t chunk = 0; 00216 00217 for (int32_t i=0; i+k < len; i++) 00218 { 00219 chunk++; 00220 CHash::IncrementalMurmurHash3(&(val[i]), &carry, &(vec[i+k]), 1); 00221 uint32_t h = CHash::FinalizeIncrementalMurmurHash3(val[i], carry, chunk); 00222 00223 #ifdef DEBUG_HASHEDWD 00224 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h) 00225 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d\n", vec[i], k,offs, o) 00226 #endif 00227 vec2[o+(h & mask)]+=wd; 00228 val[i] = h; 00229 o+=partial_w_dim; 00230 } 00231 val[len-k-1] = 00232 CHash::FinalizeIncrementalMurmurHash3(val[len-k-1], carry, chunk); 00233 00234 offs+=partial_w_dim*len; 00235 } 00236 00237 SG_FREE(val); 00238 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00239 } 00240 00241 void CHashedWDFeatures::set_wd_weights() 00242 { 00243 ASSERT(degree>0) 00244 00245 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00246 partial_w_dim=1<<m_hash_bits; 00247 w_dim=partial_w_dim*string_length*(degree-start_degree); 00248 00249 wd_weights=SG_MALLOC(float64_t, degree); 00250 00251 for (int32_t i=0; i<degree; i++) 00252 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00253 00254 SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, " 00255 "dim=%d partial_dim=%d num=%d, len=%d\n", 00256 degree, from_degree, alphabet_size, 00257 w_dim, partial_w_dim, num_strings, string_length); 00258 } 00259 00260 00261 void CHashedWDFeatures::set_normalization_const(float64_t n) 00262 { 00263 if (n==0) 00264 { 00265 normalization_const=0; 00266 for (int32_t i=0; i<degree; i++) 00267 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00268 00269 normalization_const=CMath::sqrt(normalization_const); 00270 } 00271 else 00272 normalization_const=n; 00273 00274 SG_DEBUG("normalization_const:%f\n", normalization_const) 00275 } 00276 00277 CFeatures* CHashedWDFeatures::duplicate() const 00278 { 00279 return new CHashedWDFeatures(*this); 00280 } 00281 00282 00283 int32_t CHashedWDFeatures::get_nnz_features_for_vector(int32_t num) 00284 { 00285 int32_t vlen=-1; 00286 bool free_vec; 00287 uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec); 00288 strings->free_feature_vector(vec, num, free_vec); 00289 return degree*vlen; 00290 } 00291 00292 void* CHashedWDFeatures::get_feature_iterator(int32_t vector_index) 00293 { 00294 SG_NOTIMPLEMENTED 00295 return NULL; 00296 } 00297 00298 bool CHashedWDFeatures::get_next_feature(int32_t& index, float64_t& value, 00299 void* iterator) 00300 { 00301 SG_NOTIMPLEMENTED 00302 return false; 00303 } 00304 00305 void CHashedWDFeatures::free_feature_iterator(void* iterator) 00306 { 00307 SG_NOTIMPLEMENTED 00308 }