SHOGUN
v3.2.0
|
00001 #include <shogun/lib/SGSparseVector.h> 00002 #include <shogun/lib/SGVector.h> 00003 #include <shogun/mathematics/Math.h> 00004 #include <shogun/io/File.h> 00005 00006 namespace shogun { 00007 00008 template <class T> 00009 SGSparseVector<T>::SGSparseVector() : SGReferencedData() 00010 { 00011 init_data(); 00012 } 00013 00014 template <class T> 00015 SGSparseVector<T>::SGSparseVector(SGSparseVectorEntry<T>* feats, index_t num_entries, 00016 bool ref_counting) : 00017 SGReferencedData(ref_counting), 00018 num_feat_entries(num_entries), features(feats) 00019 { 00020 } 00021 00022 template <class T> 00023 SGSparseVector<T>::SGSparseVector(index_t num_entries, bool ref_counting) : 00024 SGReferencedData(ref_counting), 00025 num_feat_entries(num_entries) 00026 { 00027 features = SG_MALLOC(SGSparseVectorEntry<T>, num_feat_entries); 00028 } 00029 00030 template <class T> 00031 SGSparseVector<T>::SGSparseVector(const SGSparseVector& orig) : 00032 SGReferencedData(orig) 00033 { 00034 copy_data(orig); 00035 } 00036 00037 template <class T> 00038 SGSparseVector<T>::~SGSparseVector() 00039 { 00040 unref(); 00041 } 00042 00043 template <class T> 00044 T SGSparseVector<T>::dense_dot(T alpha, T* vec, int32_t dim, T b) 00045 { 00046 ASSERT(vec) 00047 T result=b; 00048 00049 if (features) 00050 { 00051 for (int32_t i=0; i<num_feat_entries; i++) 00052 { 00053 if (features[i].feat_index<dim) 00054 result+=alpha*vec[features[i].feat_index]*features[i].entry; 00055 } 00056 } 00057 00058 return result; 00059 } 00060 00061 template <class T> 00062 template <typename ST> 00063 T SGSparseVector<T>::dense_dot(SGVector<ST> vec) 00064 { 00065 ASSERT(vec) 00066 T result(0.0); 00067 00068 if (features) 00069 { 00070 for (int32_t i=0; i<num_feat_entries; i++) 00071 { 00072 if (features[i].feat_index<vec.vlen) 00073 result+=static_cast<T>(vec[features[i].feat_index]) 00074 *features[i].entry; 00075 } 00076 } 00077 00078 return result; 00079 } 00080 00081 template complex128_t SGSparseVector<complex128_t>::dense_dot<float64_t>(SGVector<float64_t>); 00082 template complex128_t SGSparseVector<complex128_t>::dense_dot<int32_t>(SGVector<int32_t> vec); 00083 template float64_t SGSparseVector<float64_t>::dense_dot<int32_t>(SGVector<int32_t> vec); 00084 00085 template <class T> 00086 T SGSparseVector<T>::sparse_dot(const SGSparseVector<T>& v) 00087 { 00088 return sparse_dot(*this, v); 00089 } 00090 00091 template <class T> 00092 T SGSparseVector<T>::sparse_dot(const SGSparseVector<T>& a, const SGSparseVector<T>& b) 00093 { 00094 if (a.num_feat_entries == 0 || b.num_feat_entries == 0) 00095 return 0; 00096 00097 int32_t cmp = cmp_dot_prod_symmetry_fast(a.num_feat_entries, b.num_feat_entries); 00098 00099 if (cmp == 0) // symmetric 00100 { 00101 return dot_prod_symmetric(a, b); 00102 } 00103 else if (cmp > 0) // b has more element 00104 { 00105 return dot_prod_asymmetric(a, b); 00106 } 00107 else // a has more element 00108 { 00109 return dot_prod_asymmetric(b, a); 00110 } 00111 } 00112 00113 template<class T> 00114 int32_t SGSparseVector<T>::get_num_dimensions() 00115 { 00116 if (!features) 00117 return 0; 00118 00119 int32_t dimensions = -1; 00120 for (index_t i=0; i<num_feat_entries; i++) 00121 { 00122 if (features[i].feat_index > dimensions) 00123 { 00124 dimensions = features[i].feat_index; 00125 } 00126 } 00127 00128 return dimensions+1; 00129 } 00130 00131 template<class T> 00132 void SGSparseVector<T>::sort_features(bool stable_pointer) 00133 { 00134 if (!num_feat_entries) 00135 return; 00136 00137 // remember old pointer to enforce quarantee 00138 const SGSparseVectorEntry<T>* old_features_ptr = features; 00139 00140 int32_t* feat_idx=SG_MALLOC(int32_t, num_feat_entries); 00141 for (index_t j=0; j<num_feat_entries; j++) 00142 { 00143 feat_idx[j]=features[j].feat_index; 00144 } 00145 00146 CMath::qsort_index(feat_idx, features, num_feat_entries); 00147 SG_FREE(feat_idx); 00148 00149 for (index_t j=1; j<num_feat_entries; j++) 00150 { 00151 REQUIRE(features[j-1].feat_index <= features[j].feat_index, 00152 "sort_features(): failed sanity check %d <= %d after sorting (comparing indices features[%d] <= features[%d], features=%d)\n", 00153 features[j-1].feat_index, features[j].feat_index, j-1, j, num_feat_entries); 00154 } 00155 00156 // compression: removing zero-entries and merging features with same index 00157 int32_t last_index = 0; 00158 for (index_t j=1; j<num_feat_entries; j++) 00159 { 00160 // always true, but kept for future changes 00161 REQUIRE(last_index < j, 00162 "sort_features(): target index %d must not exceed source index j=%d", 00163 last_index, j); 00164 REQUIRE(features[last_index].feat_index <= features[j].feat_index, 00165 "sort_features(): failed sanity check %d = features[%d].feat_index <= features[%d].feat_index = %d\n", 00166 features[last_index].feat_index, last_index, j, features[j].feat_index); 00167 00168 // merging of features with same index 00169 if (features[last_index].feat_index == features[j].feat_index) 00170 { 00171 features[last_index].entry += features[j].entry; 00172 continue; 00173 } 00174 00175 // only skip to next element if current one is not zero 00176 if (features[last_index].entry != 0.0) 00177 { 00178 last_index++; 00179 } 00180 00181 features[last_index] = features[j]; 00182 } 00183 00184 // remove single first element if zero (not caught by loop) 00185 if (features[last_index].entry == 0.0) 00186 { 00187 last_index--; 00188 } 00189 00190 int32_t new_feat_count = last_index+1; 00191 ASSERT(new_feat_count <= num_feat_entries); 00192 00193 // shrinking vector 00194 if (!stable_pointer) 00195 { 00196 SG_SINFO("shrinking vector from %d to %d\n", num_feat_entries, new_feat_count); 00197 features = SG_REALLOC(SGSparseVectorEntry<T>, features, num_feat_entries, new_feat_count); 00198 } 00199 num_feat_entries = new_feat_count; 00200 00201 for (index_t j=1; j<num_feat_entries; j++) 00202 { 00203 REQUIRE(features[j-1].feat_index < features[j].feat_index, 00204 "sort_features(): failed sanity check %d < %d after sorting (comparing indices features[%d] < features[%d], features=%d)\n", 00205 features[j-1].feat_index, features[j].feat_index, j-1, j, num_feat_entries); 00206 } 00207 00208 // compare with old pointer to enforce quarantee 00209 if (stable_pointer) { 00210 ASSERT(old_features_ptr == features); 00211 } 00212 return; 00213 } 00214 00215 template<class T> 00216 T SGSparseVector<T>::get_feature(int32_t index) 00217 { 00218 T ret = 0; 00219 if (features) 00220 { 00221 for (index_t i=0; i<num_feat_entries; i++) 00222 if (features[i].feat_index==index) 00223 ret+=features[i].entry ; 00224 } 00225 00226 return ret ; 00227 } 00228 00229 template<class T> 00230 SGVector<T> SGSparseVector<T>::get_dense() 00231 { 00232 SGVector<T> dense; 00233 00234 if (features) 00235 { 00236 dense.resize_vector(get_num_dimensions()); 00237 dense.zero(); 00238 00239 for (index_t i=0; i<num_feat_entries; i++) 00240 { 00241 dense.vector[features[i].feat_index] += features[i].entry; 00242 } 00243 } 00244 00245 return dense; 00246 } 00247 00248 template<class T> 00249 SGVector<T> SGSparseVector<T>::get_dense(int32_t dimension) 00250 { 00251 SGVector<T> dense(dimension); 00252 dense.zero(); 00253 00254 if (features) 00255 { 00256 REQUIRE(get_num_dimensions() <= dimension, "get_dense(dimension=%d): sparse dimension %d exceeds requested dimension\n", 00257 dimension, get_num_dimensions()); 00258 00259 for (index_t i=0; i<num_feat_entries; i++) 00260 { 00261 dense.vector[features[i].feat_index] += features[i].entry; 00262 } 00263 } 00264 00265 return dense; 00266 } 00267 00268 template<class T> 00269 SGSparseVector<T> SGSparseVector<T>::clone() const 00270 { 00271 SGSparseVectorEntry <T> * copy = SG_MALLOC(SGSparseVectorEntry <T>, num_feat_entries); 00272 memcpy(copy, features, num_feat_entries * sizeof(SGSparseVectorEntry <T>)); 00273 return SGSparseVector<T>(copy, num_feat_entries); 00274 } 00275 00276 template<class T> void SGSparseVector<T>::load(CFile* loader) 00277 { 00278 ASSERT(loader) 00279 unref(); 00280 00281 SG_SET_LOCALE_C; 00282 loader->get_sparse_vector(features, num_feat_entries); 00283 SG_RESET_LOCALE; 00284 } 00285 00286 template<class T> void SGSparseVector<T>::save(CFile* saver) 00287 { 00288 ASSERT(saver) 00289 00290 SG_SET_LOCALE_C; 00291 saver->set_sparse_vector(features, num_feat_entries); 00292 SG_RESET_LOCALE; 00293 } 00294 00295 template <> 00296 void SGSparseVector<complex128_t>::load(CFile* loader) 00297 { 00298 SG_SERROR("SGSparseVector::load():: Not supported for complex128_t\n"); 00299 } 00300 00301 template <> 00302 void SGSparseVector<complex128_t>::save(CFile* saver) 00303 { 00304 SG_SERROR("SGSparseVector::save():: Not supported for complex128_t\n"); 00305 } 00306 00307 template <class T> 00308 void SGSparseVector<T>::copy_data(const SGReferencedData& orig) 00309 { 00310 num_feat_entries = ((SGSparseVector*)(&orig))->num_feat_entries; 00311 features = ((SGSparseVector*)(&orig))->features; 00312 } 00313 00314 template <class T> 00315 void SGSparseVector<T>::init_data() 00316 { 00317 num_feat_entries = 0; 00318 features = NULL; 00319 } 00320 00321 template <class T> 00322 void SGSparseVector<T>::free_data() 00323 { 00324 num_feat_entries = 0; 00325 SG_FREE(features); 00326 } 00327 00328 template <class T> 00329 int32_t SGSparseVector<T>::cmp_dot_prod_symmetry_fast(index_t alen, index_t blen) 00330 { 00331 if (alen > blen) // no need for floats here 00332 { 00333 return (blen * CMath::floor_log(alen) < alen) ? -1 : 0; 00334 } 00335 else // alen <= blen 00336 { 00337 return (alen * CMath::floor_log(blen) < blen) ? 1 : 0; 00338 } 00339 } 00340 00341 template <class T> 00342 T SGSparseVector<T>::dot_prod_asymmetric(const SGSparseVector<T>& a, const SGSparseVector<T>& b) 00343 { 00344 T dot_prod = 0; 00345 for(index_t b_idx = 0; b_idx < b.num_feat_entries; ++b_idx) 00346 { 00347 const T tmp = b.features[b_idx].entry; 00348 if (a.features[a.num_feat_entries-1].feat_index < b.features[b_idx].feat_index) 00349 break; 00350 for (index_t a_idx = 0; a_idx < a.num_feat_entries; ++a_idx) 00351 { 00352 if (a.features[a_idx].feat_index == b.features[b_idx].feat_index) 00353 dot_prod += tmp * a.features[a_idx].entry; 00354 } 00355 } 00356 return dot_prod; 00357 } 00358 00359 template <class T> 00360 T SGSparseVector<T>::dot_prod_symmetric(const SGSparseVector<T>& a, const SGSparseVector<T>& b) 00361 { 00362 ASSERT(a.num_feat_entries > 0 && b.num_feat_entries > 0) 00363 T dot_prod = 0; 00364 index_t a_idx = 0, b_idx = 0; 00365 while (true) 00366 { 00367 if (a.features[a_idx].feat_index == b.features[b_idx].feat_index) 00368 { 00369 dot_prod += a.features[a_idx].entry * b.features[b_idx].entry; 00370 00371 a_idx++; 00372 if (a.num_feat_entries == a_idx) 00373 break; 00374 b_idx++; 00375 if (b.num_feat_entries == b_idx) 00376 break; 00377 } 00378 else if (a.features[a_idx].feat_index < b.features[b_idx].feat_index) 00379 { 00380 a_idx++; 00381 if (a.num_feat_entries == a_idx) 00382 break; 00383 } 00384 else 00385 { 00386 b_idx++; 00387 if (b.num_feat_entries == b_idx) 00388 break; 00389 } 00390 } 00391 return dot_prod; 00392 } 00393 00394 template <> 00395 void SGSparseVector<bool>::display_vector(const char* name, const char* prefix) 00396 { 00397 SG_SPRINT("%s%s=[", prefix, name); 00398 for (int32_t i=0; i<num_feat_entries; i++) 00399 SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry ? 1 : 0); 00400 SG_SPRINT("%s]\n", prefix); 00401 } 00402 00403 template <> 00404 void SGSparseVector<char>::display_vector(const char* name, const char* prefix) 00405 { 00406 SG_SPRINT("%s%s=[", prefix, name); 00407 for (int32_t i=0; i<num_feat_entries; i++) 00408 SG_SPRINT("%s%s%d:%c", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00409 SG_SPRINT("%s]\n", prefix); 00410 } 00411 00412 template <> 00413 void SGSparseVector<int8_t>::display_vector(const char* name, const char* prefix) 00414 { 00415 SG_SPRINT("%s%s=[", prefix, name); 00416 for (int32_t i=0; i<num_feat_entries; i++) 00417 SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00418 SG_SPRINT("%s]\n", prefix); 00419 } 00420 00421 template <> 00422 void SGSparseVector<uint8_t>::display_vector(const char* name, const char* prefix) 00423 { 00424 SG_SPRINT("%s%s=[", prefix, name); 00425 for (int32_t i=0; i<num_feat_entries; i++) 00426 SG_SPRINT("%s%s%d:%u", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00427 SG_SPRINT("%s]\n", prefix); 00428 } 00429 00430 template <> 00431 void SGSparseVector<int16_t>::display_vector(const char* name, const char* prefix) 00432 { 00433 SG_SPRINT("%s%s=[", prefix, name); 00434 for (int32_t i=0; i<num_feat_entries; i++) 00435 SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00436 SG_SPRINT("%s]\n", prefix); 00437 } 00438 00439 template <> 00440 void SGSparseVector<uint16_t>::display_vector(const char* name, const char* prefix) 00441 { 00442 SG_SPRINT("%s%s=[", prefix, name); 00443 for (int32_t i=0; i<num_feat_entries; i++) 00444 SG_SPRINT("%s%s%d:%u", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00445 SG_SPRINT("%s]\n", prefix); 00446 } 00447 00448 template <> 00449 void SGSparseVector<int32_t>::display_vector(const char* name, const char* prefix) 00450 { 00451 SG_SPRINT("%s%s=[", prefix, name); 00452 for (int32_t i=0; i<num_feat_entries; i++) 00453 SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00454 SG_SPRINT("%s]\n", prefix); 00455 } 00456 00457 template <> 00458 void SGSparseVector<uint32_t>::display_vector(const char* name, const char* prefix) 00459 { 00460 SG_SPRINT("%s%s=[", prefix, name); 00461 for (int32_t i=0; i<num_feat_entries; i++) 00462 SG_SPRINT("%s%s%d:%u", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00463 SG_SPRINT("%s]\n", prefix); 00464 } 00465 00466 template <> 00467 void SGSparseVector<int64_t>::display_vector(const char* name, const char* prefix) 00468 { 00469 SG_SPRINT("%s%s=[", prefix, name); 00470 for (int32_t i=0; i<num_feat_entries; i++) 00471 SG_SPRINT("%s%s%d:%lld", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00472 SG_SPRINT("%s]\n", prefix); 00473 } 00474 00475 template <> 00476 void SGSparseVector<uint64_t>::display_vector(const char* name, const char* prefix) 00477 { 00478 SG_SPRINT("%s%s=[", prefix, name); 00479 for (int32_t i=0; i<num_feat_entries; i++) 00480 SG_SPRINT("%s%s%d:%llu ", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00481 SG_SPRINT("%s]\n", prefix); 00482 } 00483 00484 template <> 00485 void SGSparseVector<float32_t>::display_vector(const char* name, const char* prefix) 00486 { 00487 SG_SPRINT("%s%s=[", prefix, name); 00488 for (int32_t i=0; i<num_feat_entries; i++) 00489 SG_SPRINT("%s%s%d:%g", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00490 SG_SPRINT("%s]\n", prefix); 00491 } 00492 00493 template <> 00494 void SGSparseVector<float64_t>::display_vector(const char* name, const char* prefix) 00495 { 00496 SG_SPRINT("%s%s=[", prefix, name); 00497 for (int32_t i=0; i<num_feat_entries; i++) 00498 SG_SPRINT("%s%s%d:%.18g", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00499 SG_SPRINT("%s]\n", prefix); 00500 } 00501 00502 template <> 00503 void SGSparseVector<floatmax_t>::display_vector(const char* name, const char* prefix) 00504 { 00505 SG_SPRINT("%s%s=[", prefix, name); 00506 for (int32_t i=0; i<num_feat_entries; i++) 00507 SG_SPRINT("%s%s%d:%.36Lg", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry); 00508 SG_SPRINT("%s]\n", prefix); 00509 } 00510 00511 template <> 00512 void SGSparseVector<complex128_t>::display_vector(const char* name, const char* prefix) 00513 { 00514 SG_SPRINT("%s%s=[", prefix, name); 00515 for (int32_t i=0; i<num_feat_entries; i++) 00516 SG_SPRINT("%s%s%d:(%.18lg+i%.18lg)", prefix, i==0 ? "" : " ", features[i].feat_index, 00517 features[i].entry.real(), features[i].entry.imag()); 00518 SG_SPRINT("%s]\n", prefix); 00519 } 00520 00521 template class SGSparseVector<bool>; 00522 template class SGSparseVector<char>; 00523 template class SGSparseVector<int8_t>; 00524 template class SGSparseVector<uint8_t>; 00525 template class SGSparseVector<int16_t>; 00526 template class SGSparseVector<uint16_t>; 00527 template class SGSparseVector<int32_t>; 00528 template class SGSparseVector<uint32_t>; 00529 template class SGSparseVector<int64_t>; 00530 template class SGSparseVector<uint64_t>; 00531 template class SGSparseVector<float32_t>; 00532 template class SGSparseVector<float64_t>; 00533 template class SGSparseVector<floatmax_t>; 00534 template class SGSparseVector<complex128_t>; 00535 } 00536