SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
HashedDocDotFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
#include <shogun/features/HashedDocDotFeatures.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/Hash.h>
#include <shogun/mathematics/Math.h>

#include <string.h>
00015 
00016 namespace shogun
00017 {
00018 CHashedDocDotFeatures::CHashedDocDotFeatures(int32_t hash_bits, CStringFeatures<char>* docs,
00019     CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips, int32_t size) : CDotFeatures(size)
00020 {
00021     if (n_grams < 1)
00022         n_grams = 1;
00023 
00024     if ( (n_grams==1 && skips!=0) || (skips<0))
00025         skips = 0;
00026 
00027     init(hash_bits, docs, tzer, normalize, n_grams, skips);
00028 }
00029 
00030 CHashedDocDotFeatures::CHashedDocDotFeatures(const CHashedDocDotFeatures& orig)
00031 : CDotFeatures(orig)
00032 {
00033     init(orig.num_bits, orig.doc_collection, orig.tokenizer, orig.should_normalize,
00034             orig.ngrams, orig.tokens_to_skip);
00035 }
00036 
CHashedDocDotFeatures::CHashedDocDotFeatures(CFile* loader)
{
    /* Loading from file is not supported; SG_NOTIMPLEMENTED reports the error.
     * NOTE(review): members are not initialized here — confirm SG_NOTIMPLEMENTED
     * aborts/throws so the half-constructed object is never used. */
    SG_NOTIMPLEMENTED;
}
00041 
/* Shared initializer used by all constructors: stores the configuration,
 * falls back to a whitespace tokenizer when none is given, registers the
 * members as serializable parameters, and takes ownership references. */
void CHashedDocDotFeatures::init(int32_t hash_bits, CStringFeatures<char>* docs,
    CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips)
{
    num_bits = hash_bits;
    ngrams = n_grams;
    tokens_to_skip = skips;
    doc_collection = docs;
    tokenizer = tzer;
    should_normalize = normalize;

    /* Default tokenizer: split on whitespace. */
    if (!tokenizer)
    {
        tokenizer = new CDelimiterTokenizer();
        ((CDelimiterTokenizer* )tokenizer)->init_for_whitespace();
    }

    /* Register parameters for serialization / model selection framework. */
    SG_ADD(&num_bits, "num_bits", "Number of bits of hash", MS_NOT_AVAILABLE);
    SG_ADD(&ngrams, "ngrams", "Number of tokens to combine for quadratic feature support",
            MS_NOT_AVAILABLE);
    SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip when combining features",
            MS_NOT_AVAILABLE);
    SG_ADD((CSGObject**) &doc_collection, "doc_collection", "Document collection",
            MS_NOT_AVAILABLE);
    SG_ADD((CSGObject**) &tokenizer, "tokenizer", "Document tokenizer",
            MS_NOT_AVAILABLE);
    SG_ADD(&should_normalize, "should_normalize", "Normalize or not the dot products",
            MS_NOT_AVAILABLE);

    /* Hold references; released in the destructor. */
    SG_REF(doc_collection);
    SG_REF(tokenizer);
}
00073 
CHashedDocDotFeatures::~CHashedDocDotFeatures()
{
    /* Release the references taken in init(). */
    SG_UNREF(doc_collection);
    SG_UNREF(tokenizer);
}
00079 
00080 int32_t CHashedDocDotFeatures::get_dim_feature_space() const
00081 {
00082     return CMath::pow(2, num_bits);
00083 }
00084 
00085 float64_t CHashedDocDotFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
00086 {
00087     ASSERT(df)
00088     ASSERT(df->get_name() == get_name())
00089 
00090     CHashedDocDotFeatures* hddf = (CHashedDocDotFeatures*) df;
00091 
00092     SGVector<char> sv1 = doc_collection->get_feature_vector(vec_idx1);
00093     SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);
00094 
00095     CHashedDocConverter* converter = new CHashedDocConverter(tokenizer, num_bits,
00096             should_normalize, ngrams, tokens_to_skip);
00097     SGSparseVector<float64_t> cv1 = converter->apply(sv1);
00098     SGSparseVector<float64_t> cv2 = converter->apply(sv2);
00099     float64_t result = SGSparseVector<float64_t>::sparse_dot(cv1,cv2);
00100 
00101     doc_collection->free_feature_vector(sv1, vec_idx1);
00102     hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
00103     SG_UNREF(converter);
00104 
00105     return result;
00106 }
00107 
00108 float64_t CHashedDocDotFeatures::dense_dot_sgvec(int32_t vec_idx1, const SGVector<float64_t> vec2)
00109 {
00110     return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
00111 }
00112 
00113 float64_t CHashedDocDotFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len)
00114 {
00115     ASSERT(vec2_len == CMath::pow(2,num_bits))
00116 
00117     SGVector<char> sv = doc_collection->get_feature_vector(vec_idx1);
00118 
00121     SGVector<uint32_t> hashes(ngrams+tokens_to_skip);
00122     index_t hashes_start = 0;
00123     index_t hashes_end = 0;
00124     int32_t len = hashes.vlen - 1;
00125 
00128     SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);
00129 
00130     float64_t result = 0;
00131     CTokenizer* local_tzer = tokenizer->get_copy();
00132 
00134     const int32_t seed = 0xdeadbeaf;
00135     local_tzer->set_text(sv);
00136     index_t start = 0;
00137     while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
00138     {
00139         index_t end = local_tzer->next_token_idx(start);
00140         uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
00141         hashes[hashes_end++] = token_hash;
00142     }
00143 
00145     while (local_tzer->has_next())
00146     {
00147         index_t end = local_tzer->next_token_idx(start);
00148         uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
00149         hashes[hashes_end] = token_hash;
00150 
00151         CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
00152                 num_bits, ngrams, tokens_to_skip);
00153 
00154         for (index_t i=0; i<hashed_indices.vlen; i++)
00155             result += vec2[hashed_indices[i]];
00156 
00157         hashes_start++;
00158         hashes_end++;
00159         if (hashes_end==hashes.vlen)
00160             hashes_end = 0;
00161         if (hashes_start==hashes.vlen)
00162             hashes_start = 0;
00163     }
00164 
00165     if (ngrams>1)
00166     {
00167         while (hashes_start!=hashes_end)
00168         {
00169             len--;
00170             index_t max_idx = CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start,
00171                     len, hashed_indices, num_bits, ngrams, tokens_to_skip);
00172 
00173             for (index_t i=0; i<max_idx; i++)
00174                 result += vec2[hashed_indices[i]];
00175 
00176             hashes_start++;
00177             if (hashes_start==hashes.vlen)
00178                 hashes_start = 0;
00179         }
00180     }
00181     doc_collection->free_feature_vector(sv, vec_idx1);
00182     SG_UNREF(local_tzer);
00183     return should_normalize ? result / CMath::sqrt((float64_t) sv.size()) : result;
00184 }
00185 
/* Adds alpha * (hashed representation of document vec_idx1) into the dense
 * vector vec2 of dimension 2^num_bits. Mirrors dense_dot: tokens are hashed
 * on the fly and combined into (skip-)n-gram feature indices via a ring
 * buffer, each emitted index receiving `value` (alpha, optionally normalized
 * by sqrt(document length)). */
void CHashedDocDotFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1,
    float64_t* vec2, int32_t vec2_len, bool abs_val)
{
    ASSERT(vec2_len == CMath::pow(2,num_bits))

    if (abs_val)
        alpha = CMath::abs(alpha);

    SGVector<char> sv = doc_collection->get_feature_vector(vec_idx1);
    const float64_t value = should_normalize ? alpha / CMath::sqrt((float64_t) sv.size()) : alpha;

    /* Ring buffer holding the hashes of the last ngrams+tokens_to_skip tokens. */
    SGVector<uint32_t> hashes(ngrams+tokens_to_skip);
    index_t hashes_start = 0;
    index_t hashes_end = 0;
    index_t len = hashes.vlen - 1;

    /* Scratch buffer for the feature indices generated from each token window. */
    SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);

    CTokenizer* local_tzer = tokenizer->get_copy();

    const int32_t seed = 0xdeadbeaf;
    local_tzer->set_text(sv);
    index_t start = 0;
    /* Pre-fill the ring buffer with the first ngrams-1+tokens_to_skip tokens. */
    while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
    {
        index_t end = local_tzer->next_token_idx(start);
        uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
        hashes[hashes_end++] = token_hash;
    }

    /* Main pass: for each further token, emit all n-gram indices of the
     * current window and add `value` to the corresponding vec2 entries. */
    while (local_tzer->has_next())
    {
        index_t end = local_tzer->next_token_idx(start);
        uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
        hashes[hashes_end] = token_hash;

        CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
                num_bits, ngrams, tokens_to_skip);

        for (index_t i=0; i<hashed_indices.vlen; i++)
            vec2[hashed_indices[i]] += value;

        /* Advance the ring buffer (wrap indices at capacity). */
        hashes_start++;
        hashes_end++;
        if (hashes_end==hashes.vlen)
            hashes_end = 0;
        if (hashes_start==hashes.vlen)
            hashes_start = 0;
    }

    /* Drain: emit the shrinking windows at the end of the document. */
    if (ngrams>1)
    {
        while (hashes_start!=hashes_end)
        {
            len--;
            index_t max_idx = CHashedDocConverter::generate_ngram_hashes(hashes,
                    hashes_start, len, hashed_indices, num_bits, ngrams, tokens_to_skip);

            for (index_t i=0; i<max_idx; i++)
                vec2[hashed_indices[i]] += value;

            hashes_start++;
            if (hashes_start==hashes.vlen)
                hashes_start = 0;
        }
    }

    doc_collection->free_feature_vector(sv, vec_idx1);
    SG_UNREF(local_tzer);
}
00261 
00262 uint32_t CHashedDocDotFeatures::calculate_token_hash(char* token,
00263         int32_t length, int32_t num_bits, uint32_t seed)
00264 {
00265     int32_t hash = CHash::MurmurHash3((uint8_t* ) token, length, seed);
00266     return hash & ((1 << num_bits) - 1);
00267 }
00268 
00269 void CHashedDocDotFeatures::set_doc_collection(CStringFeatures<char>* docs)
00270 {
00271     SG_UNREF(doc_collection);
00272     doc_collection = docs;
00273 }
00274 
00275 int32_t CHashedDocDotFeatures::get_nnz_features_for_vector(int32_t num)
00276 {
00277     SGVector<char> sv = doc_collection->get_feature_vector(num);
00278     int32_t num_nnz_features = sv.size();
00279     doc_collection->free_feature_vector(sv, num);
00280     return num_nnz_features;
00281 }
00282 
/* Feature iteration is not supported for hashed document features. */
void* CHashedDocDotFeatures::get_feature_iterator(int32_t vector_index)
{
    SG_NOTIMPLEMENTED;
    return NULL;
}
00288 
/* Feature iteration is not supported for hashed document features. */
bool CHashedDocDotFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
{
    SG_NOTIMPLEMENTED;
    return false;
}
00294 
/* Feature iteration is not supported for hashed document features. */
void CHashedDocDotFeatures::free_feature_iterator(void* iterator)
{
    SG_NOTIMPLEMENTED;
}
00299 
/* Type name; dot() relies on this string to verify its argument's type. */
const char* CHashedDocDotFeatures::get_name() const
{
    return "HashedDocDotFeatures";
}
00304 
/* Deep-copies the configuration via the copy constructor; the underlying
 * document collection and tokenizer are shared (re-referenced), not cloned. */
CFeatures* CHashedDocDotFeatures::duplicate() const
{
    return new CHashedDocDotFeatures(*this);
}
00309 
/* Hashed features are reported as unsigned-integer typed. */
EFeatureType CHashedDocDotFeatures::get_feature_type() const
{
    return F_UINT;
}
00314 
/* The hashed space is sparse: only buckets hit by a document are non-zero. */
EFeatureClass CHashedDocDotFeatures::get_feature_class() const
{
    return C_SPARSE;
}
00319 
/* One hashed feature vector per document in the underlying collection. */
int32_t CHashedDocDotFeatures::get_num_vectors() const
{
    return doc_collection->get_num_vectors();
}
00324 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation