SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
SGSparseVector.cpp
Go to the documentation of this file.
00001 #include <shogun/lib/SGSparseVector.h>
00002 #include <shogun/lib/SGVector.h>
00003 #include <shogun/mathematics/Math.h>
00004 #include <shogun/io/File.h>
00005 
00006 namespace shogun {
00007 
00008 template <class T>
00009 SGSparseVector<T>::SGSparseVector() : SGReferencedData()
00010 {
00011     init_data();
00012 }
00013 
00014 template <class T>
00015 SGSparseVector<T>::SGSparseVector(SGSparseVectorEntry<T>* feats, index_t num_entries,
00016         bool ref_counting) :
00017         SGReferencedData(ref_counting),
00018         num_feat_entries(num_entries), features(feats)
00019 {
00020 }
00021 
00022 template <class T>
00023 SGSparseVector<T>::SGSparseVector(index_t num_entries, bool ref_counting) :
00024     SGReferencedData(ref_counting),
00025     num_feat_entries(num_entries)
00026 {
00027     features = SG_MALLOC(SGSparseVectorEntry<T>, num_feat_entries);
00028 }
00029 
00030 template <class T>
00031 SGSparseVector<T>::SGSparseVector(const SGSparseVector& orig) :
00032     SGReferencedData(orig)
00033 {
00034     copy_data(orig);
00035 }
00036 
00037 template <class T>
00038 SGSparseVector<T>::~SGSparseVector()
00039 {
00040     unref();
00041 }
00042 
00043 template <class T>
00044 T SGSparseVector<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
00045 {
00046     ASSERT(vec)
00047     T result=b;
00048 
00049     if (features)
00050     {
00051         for (int32_t i=0; i<num_feat_entries; i++)
00052         {
00053             if (features[i].feat_index<dim)
00054                 result+=alpha*vec[features[i].feat_index]*features[i].entry;
00055         }
00056     }
00057 
00058     return result;
00059 }
00060 
00061 template <class T>
00062 template <typename ST>
00063 T SGSparseVector<T>::dense_dot(SGVector<ST> vec)
00064 {
00065     ASSERT(vec)
00066     T result(0.0);
00067 
00068     if (features)
00069     {
00070         for (int32_t i=0; i<num_feat_entries; i++)
00071         {
00072             if (features[i].feat_index<vec.vlen)
00073                 result+=static_cast<T>(vec[features[i].feat_index])
00074                     *features[i].entry;
00075         }
00076     }
00077 
00078     return result;
00079 }
00080 
00081 template complex128_t SGSparseVector<complex128_t>::dense_dot<float64_t>(SGVector<float64_t>);
00082 template complex128_t SGSparseVector<complex128_t>::dense_dot<int32_t>(SGVector<int32_t> vec);
00083 template float64_t SGSparseVector<float64_t>::dense_dot<int32_t>(SGVector<int32_t> vec);
00084 
00085 template <class T>
00086 T SGSparseVector<T>::sparse_dot(const SGSparseVector<T>& v)
00087 {
00088     return sparse_dot(*this, v);
00089 }
00090 
00091 template <class T>
00092 T SGSparseVector<T>::sparse_dot(const SGSparseVector<T>& a, const SGSparseVector<T>& b)
00093 {
00094     if (a.num_feat_entries == 0 || b.num_feat_entries == 0)
00095         return 0;
00096 
00097     int32_t cmp = cmp_dot_prod_symmetry_fast(a.num_feat_entries, b.num_feat_entries);
00098 
00099     if (cmp == 0) // symmetric
00100     {
00101         return dot_prod_symmetric(a, b);
00102     }
00103     else if (cmp > 0) // b has more element
00104     {
00105         return dot_prod_asymmetric(a, b);
00106     }
00107     else // a has more element
00108     {
00109         return dot_prod_asymmetric(b, a);
00110     }
00111 }
00112 
00113 template<class T>
00114 int32_t SGSparseVector<T>::get_num_dimensions()
00115 {
00116     if (!features)
00117         return 0;
00118 
00119     int32_t dimensions = -1;
00120     for (index_t i=0; i<num_feat_entries; i++)
00121     {
00122         if (features[i].feat_index > dimensions)
00123         {
00124             dimensions = features[i].feat_index;
00125         }
00126     }
00127 
00128     return dimensions+1;
00129 }
00130 
00131 template<class T>
00132 void SGSparseVector<T>::sort_features(bool stable_pointer)
00133 {
00134     if (!num_feat_entries)
00135         return;
00136 
00137     // remember old pointer to enforce quarantee
00138     const SGSparseVectorEntry<T>* old_features_ptr = features;
00139 
00140     int32_t* feat_idx=SG_MALLOC(int32_t, num_feat_entries);
00141     for (index_t j=0; j<num_feat_entries; j++)
00142     {
00143         feat_idx[j]=features[j].feat_index;
00144     }
00145 
00146     CMath::qsort_index(feat_idx, features, num_feat_entries);
00147     SG_FREE(feat_idx);
00148 
00149     for (index_t j=1; j<num_feat_entries; j++)
00150     {
00151         REQUIRE(features[j-1].feat_index <= features[j].feat_index,
00152                 "sort_features(): failed sanity check %d <= %d after sorting (comparing indices features[%d] <= features[%d], features=%d)\n",
00153                 features[j-1].feat_index, features[j].feat_index, j-1, j, num_feat_entries);
00154     }
00155 
00156     // compression: removing zero-entries and merging features with same index
00157     int32_t last_index = 0;
00158     for (index_t j=1; j<num_feat_entries; j++)
00159     {
00160         // always true, but kept for future changes
00161         REQUIRE(last_index < j,
00162             "sort_features(): target index %d must not exceed source index j=%d",
00163             last_index, j);
00164         REQUIRE(features[last_index].feat_index <= features[j].feat_index,
00165             "sort_features(): failed sanity check %d = features[%d].feat_index <= features[%d].feat_index = %d\n",
00166             features[last_index].feat_index, last_index, j, features[j].feat_index);
00167 
00168         // merging of features with same index
00169         if (features[last_index].feat_index == features[j].feat_index)
00170         {
00171             features[last_index].entry += features[j].entry;
00172             continue;
00173         }
00174 
00175         // only skip to next element if current one is not zero
00176         if (features[last_index].entry != 0.0)
00177         {
00178             last_index++;
00179         }
00180 
00181         features[last_index] = features[j];
00182     }
00183 
00184     // remove single first element if zero (not caught by loop)
00185     if (features[last_index].entry == 0.0)
00186     {
00187         last_index--;
00188     }
00189 
00190     int32_t new_feat_count = last_index+1;
00191     ASSERT(new_feat_count <= num_feat_entries);
00192 
00193     // shrinking vector
00194     if (!stable_pointer)
00195     {
00196         SG_SINFO("shrinking vector from %d to %d\n", num_feat_entries, new_feat_count);
00197         features = SG_REALLOC(SGSparseVectorEntry<T>, features, num_feat_entries, new_feat_count);
00198     }
00199     num_feat_entries = new_feat_count;
00200 
00201     for (index_t j=1; j<num_feat_entries; j++)
00202     {
00203         REQUIRE(features[j-1].feat_index < features[j].feat_index,
00204                 "sort_features(): failed sanity check %d < %d after sorting (comparing indices features[%d] < features[%d], features=%d)\n",
00205                 features[j-1].feat_index, features[j].feat_index, j-1, j, num_feat_entries);
00206     }
00207 
00208     // compare with old pointer to enforce quarantee
00209     if (stable_pointer) {
00210         ASSERT(old_features_ptr == features);
00211     }
00212     return;
00213 }
00214 
00215 template<class T>
00216 T SGSparseVector<T>::get_feature(int32_t index)
00217 {
00218     T ret = 0;
00219     if (features)
00220     {
00221         for (index_t i=0; i<num_feat_entries; i++)
00222             if (features[i].feat_index==index)
00223                 ret+=features[i].entry ;
00224     }
00225 
00226     return ret ;
00227 }
00228 
00229 template<class T>
00230 SGVector<T> SGSparseVector<T>::get_dense()
00231 {
00232     SGVector<T> dense;
00233 
00234     if (features)
00235     {
00236         dense.resize_vector(get_num_dimensions());
00237         dense.zero();
00238 
00239         for (index_t i=0; i<num_feat_entries; i++)
00240         {
00241             dense.vector[features[i].feat_index] += features[i].entry;
00242         }
00243     }
00244 
00245     return dense;
00246 }
00247 
00248 template<class T>
00249 SGVector<T> SGSparseVector<T>::get_dense(int32_t dimension)
00250 {
00251     SGVector<T> dense(dimension);
00252     dense.zero();
00253 
00254     if (features)
00255     {
00256         REQUIRE(get_num_dimensions() <= dimension, "get_dense(dimension=%d): sparse dimension %d exceeds requested dimension\n",
00257                 dimension, get_num_dimensions());
00258 
00259         for (index_t i=0; i<num_feat_entries; i++)
00260         {
00261             dense.vector[features[i].feat_index] += features[i].entry;
00262         }
00263     }
00264 
00265     return dense;
00266 }
00267 
00268 template<class T>
00269 SGSparseVector<T> SGSparseVector<T>::clone() const
00270 {
00271     SGSparseVectorEntry <T> * copy = SG_MALLOC(SGSparseVectorEntry <T>, num_feat_entries);
00272     memcpy(copy, features, num_feat_entries * sizeof(SGSparseVectorEntry <T>));
00273     return SGSparseVector<T>(copy, num_feat_entries);
00274 }
00275 
00276 template<class T> void SGSparseVector<T>::load(CFile* loader)
00277 {
00278     ASSERT(loader)
00279     unref();
00280 
00281     SG_SET_LOCALE_C;
00282     loader->get_sparse_vector(features, num_feat_entries);
00283     SG_RESET_LOCALE;
00284 }
00285 
00286 template<class T> void SGSparseVector<T>::save(CFile* saver)
00287 {
00288     ASSERT(saver)
00289 
00290     SG_SET_LOCALE_C;
00291     saver->set_sparse_vector(features, num_feat_entries);
00292     SG_RESET_LOCALE;
00293 }
00294 
00295 template <>
00296 void SGSparseVector<complex128_t>::load(CFile* loader)
00297 {
00298     SG_SERROR("SGSparseVector::load():: Not supported for complex128_t\n");
00299 }
00300 
00301 template <>
00302 void SGSparseVector<complex128_t>::save(CFile* saver)
00303 {
00304     SG_SERROR("SGSparseVector::save():: Not supported for complex128_t\n");
00305 }
00306 
00307 template <class T>
00308 void SGSparseVector<T>::copy_data(const SGReferencedData& orig)
00309 {
00310     num_feat_entries = ((SGSparseVector*)(&orig))->num_feat_entries;
00311     features = ((SGSparseVector*)(&orig))->features;
00312 }
00313 
00314 template <class T>
00315 void SGSparseVector<T>::init_data()
00316 {
00317     num_feat_entries = 0;
00318     features = NULL;
00319 }
00320 
00321 template <class T>
00322 void SGSparseVector<T>::free_data()
00323 {
00324     num_feat_entries = 0;
00325     SG_FREE(features);
00326 }
00327 
00328 template <class T>
00329 int32_t SGSparseVector<T>::cmp_dot_prod_symmetry_fast(index_t alen, index_t blen)
00330 {
00331     if (alen > blen) // no need for floats here
00332     {
00333         return (blen * CMath::floor_log(alen) < alen) ? -1 : 0;
00334     }
00335     else // alen <= blen
00336     {
00337         return (alen * CMath::floor_log(blen) < blen) ? 1 : 0;
00338     }
00339 }
00340 
00341 template <class T>
00342 T SGSparseVector<T>::dot_prod_asymmetric(const SGSparseVector<T>& a, const SGSparseVector<T>& b)
00343 {
00344     T dot_prod = 0;
00345     for(index_t b_idx = 0; b_idx < b.num_feat_entries; ++b_idx)
00346     {
00347         const T tmp = b.features[b_idx].entry;
00348         if (a.features[a.num_feat_entries-1].feat_index < b.features[b_idx].feat_index)
00349             break;
00350         for (index_t a_idx = 0; a_idx < a.num_feat_entries; ++a_idx)
00351         {
00352             if (a.features[a_idx].feat_index == b.features[b_idx].feat_index)
00353                 dot_prod += tmp * a.features[a_idx].entry;
00354         }
00355     }
00356     return dot_prod;
00357 }
00358 
00359 template <class T>
00360 T SGSparseVector<T>::dot_prod_symmetric(const SGSparseVector<T>& a, const SGSparseVector<T>& b)
00361 {
00362     ASSERT(a.num_feat_entries > 0 && b.num_feat_entries > 0)
00363     T dot_prod = 0;
00364     index_t a_idx = 0, b_idx = 0;
00365     while (true)
00366     {
00367         if (a.features[a_idx].feat_index == b.features[b_idx].feat_index)
00368         {
00369             dot_prod += a.features[a_idx].entry * b.features[b_idx].entry;
00370 
00371             a_idx++;
00372             if (a.num_feat_entries == a_idx)
00373                 break;
00374             b_idx++;
00375             if (b.num_feat_entries == b_idx)
00376                 break;
00377         }
00378         else if (a.features[a_idx].feat_index < b.features[b_idx].feat_index)
00379         {
00380             a_idx++;
00381             if (a.num_feat_entries == a_idx)
00382                 break;
00383         }
00384         else
00385         {
00386             b_idx++;
00387             if (b.num_feat_entries == b_idx)
00388                 break;
00389         }
00390     }
00391     return dot_prod;
00392 }
00393 
00394 template <>
00395 void SGSparseVector<bool>::display_vector(const char* name, const char* prefix)
00396 {
00397     SG_SPRINT("%s%s=[", prefix, name);
00398     for (int32_t i=0; i<num_feat_entries; i++)
00399         SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry ? 1 : 0);
00400     SG_SPRINT("%s]\n", prefix);
00401 }
00402 
00403 template <>
00404 void SGSparseVector<char>::display_vector(const char* name, const char* prefix)
00405 {
00406     SG_SPRINT("%s%s=[", prefix, name);
00407     for (int32_t i=0; i<num_feat_entries; i++)
00408         SG_SPRINT("%s%s%d:%c", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00409     SG_SPRINT("%s]\n", prefix);
00410 }
00411 
00412 template <>
00413 void SGSparseVector<int8_t>::display_vector(const char* name, const char* prefix)
00414 {
00415     SG_SPRINT("%s%s=[", prefix, name);
00416     for (int32_t i=0; i<num_feat_entries; i++)
00417         SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00418     SG_SPRINT("%s]\n", prefix);
00419 }
00420 
00421 template <>
00422 void SGSparseVector<uint8_t>::display_vector(const char* name, const char* prefix)
00423 {
00424     SG_SPRINT("%s%s=[", prefix, name);
00425     for (int32_t i=0; i<num_feat_entries; i++)
00426         SG_SPRINT("%s%s%d:%u", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00427     SG_SPRINT("%s]\n", prefix);
00428 }
00429 
00430 template <>
00431 void SGSparseVector<int16_t>::display_vector(const char* name, const char* prefix)
00432 {
00433     SG_SPRINT("%s%s=[", prefix, name);
00434     for (int32_t i=0; i<num_feat_entries; i++)
00435         SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00436     SG_SPRINT("%s]\n", prefix);
00437 }
00438 
00439 template <>
00440 void SGSparseVector<uint16_t>::display_vector(const char* name, const char* prefix)
00441 {
00442     SG_SPRINT("%s%s=[", prefix, name);
00443     for (int32_t i=0; i<num_feat_entries; i++)
00444         SG_SPRINT("%s%s%d:%u", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00445     SG_SPRINT("%s]\n", prefix);
00446 }
00447 
00448 template <>
00449 void SGSparseVector<int32_t>::display_vector(const char* name, const char* prefix)
00450 {
00451     SG_SPRINT("%s%s=[", prefix, name);
00452     for (int32_t i=0; i<num_feat_entries; i++)
00453         SG_SPRINT("%s%s%d:%d", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00454     SG_SPRINT("%s]\n", prefix);
00455 }
00456 
00457 template <>
00458 void SGSparseVector<uint32_t>::display_vector(const char* name, const char* prefix)
00459 {
00460     SG_SPRINT("%s%s=[", prefix, name);
00461     for (int32_t i=0; i<num_feat_entries; i++)
00462         SG_SPRINT("%s%s%d:%u", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00463     SG_SPRINT("%s]\n", prefix);
00464 }
00465 
00466 template <>
00467 void SGSparseVector<int64_t>::display_vector(const char* name, const char* prefix)
00468 {
00469     SG_SPRINT("%s%s=[", prefix, name);
00470     for (int32_t i=0; i<num_feat_entries; i++)
00471         SG_SPRINT("%s%s%d:%lld", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00472     SG_SPRINT("%s]\n", prefix);
00473 }
00474 
00475 template <>
00476 void SGSparseVector<uint64_t>::display_vector(const char* name, const char* prefix)
00477 {
00478     SG_SPRINT("%s%s=[", prefix, name);
00479     for (int32_t i=0; i<num_feat_entries; i++)
00480         SG_SPRINT("%s%s%d:%llu ", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00481     SG_SPRINT("%s]\n", prefix);
00482 }
00483 
00484 template <>
00485 void SGSparseVector<float32_t>::display_vector(const char* name, const char* prefix)
00486 {
00487     SG_SPRINT("%s%s=[", prefix, name);
00488     for (int32_t i=0; i<num_feat_entries; i++)
00489         SG_SPRINT("%s%s%d:%g", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00490     SG_SPRINT("%s]\n", prefix);
00491 }
00492 
00493 template <>
00494 void SGSparseVector<float64_t>::display_vector(const char* name, const char* prefix)
00495 {
00496     SG_SPRINT("%s%s=[", prefix, name);
00497     for (int32_t i=0; i<num_feat_entries; i++)
00498         SG_SPRINT("%s%s%d:%.18g", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00499     SG_SPRINT("%s]\n", prefix);
00500 }
00501 
00502 template <>
00503 void SGSparseVector<floatmax_t>::display_vector(const char* name, const char* prefix)
00504 {
00505     SG_SPRINT("%s%s=[", prefix, name);
00506     for (int32_t i=0; i<num_feat_entries; i++)
00507         SG_SPRINT("%s%s%d:%.36Lg", prefix, i==0 ? "" : " ", features[i].feat_index, features[i].entry);
00508     SG_SPRINT("%s]\n", prefix);
00509 }
00510 
00511 template <>
00512 void SGSparseVector<complex128_t>::display_vector(const char* name, const char* prefix)
00513 {
00514     SG_SPRINT("%s%s=[", prefix, name);
00515     for (int32_t i=0; i<num_feat_entries; i++)
00516         SG_SPRINT("%s%s%d:(%.18lg+i%.18lg)", prefix, i==0 ? "" : " ", features[i].feat_index,
00517                 features[i].entry.real(), features[i].entry.imag());
00518     SG_SPRINT("%s]\n", prefix);
00519 }
00520 
00521 template class SGSparseVector<bool>;
00522 template class SGSparseVector<char>;
00523 template class SGSparseVector<int8_t>;
00524 template class SGSparseVector<uint8_t>;
00525 template class SGSparseVector<int16_t>;
00526 template class SGSparseVector<uint16_t>;
00527 template class SGSparseVector<int32_t>;
00528 template class SGSparseVector<uint32_t>;
00529 template class SGSparseVector<int64_t>;
00530 template class SGSparseVector<uint64_t>;
00531 template class SGSparseVector<float32_t>;
00532 template class SGSparseVector<float64_t>;
00533 template class SGSparseVector<floatmax_t>;
00534 template class SGSparseVector<complex128_t>;
00535 }
00536 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation