SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
HashedDocConverter.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
00011 #include <shogun/converter/HashedDocConverter.h>
00012 #include <shogun/lib/DelimiterTokenizer.h>
00013 #include <shogun/lib/Hash.h>
00014 #include <shogun/lib/DynamicArray.h>
00015 #include <shogun/features/StringFeatures.h>
00016 #include <shogun/features/HashedDocDotFeatures.h>
00017 #include <shogun/mathematics/Math.h>
00018 
00019 using namespace shogun;
00020 
00021 namespace shogun
00022 {
00023 CHashedDocConverter::CHashedDocConverter() : CConverter()
00024 {
00025     init(NULL, 16, false, 1, 0);
00026 }
00027 
00028 CHashedDocConverter::CHashedDocConverter(int32_t hash_bits, bool normalize,
00029     int32_t n_grams, int32_t skips) : CConverter()
00030 {
00031     init(NULL, hash_bits, normalize, n_grams, skips);
00032 }
00033 
00034 CHashedDocConverter::CHashedDocConverter(CTokenizer* tzer,
00035     int32_t hash_bits, bool normalize, int32_t n_grams, int32_t skips) : CConverter()
00036 {
00037     init(tzer, hash_bits, normalize, n_grams, skips);
00038 }
00039 
/** Destructor. Releases the reference taken on the tokenizer in init(). */
CHashedDocConverter::~CHashedDocConverter()
{
    SG_UNREF(tokenizer);
}
00044 
/** Shared initializer called by every constructor.
 *
 * Stores the hashing/n-gram parameters, falls back to a whitespace
 * (space + tab) CDelimiterTokenizer when no tokenizer is given, takes a
 * reference on the tokenizer, and registers all members as serializable
 * parameters.
 *
 * @param tzer tokenizer to use, or NULL for the default whitespace one
 * @param hash_bits number of bits of the hash (target dim = 2^hash_bits)
 * @param normalize whether apply() should normalize document vectors
 * @param n_grams n-gram order to generate
 * @param skips max number of tokens that may be skipped inside an n-gram
 */
void CHashedDocConverter::init(CTokenizer* tzer, int32_t hash_bits, bool normalize,
    int32_t n_grams, int32_t skips)
{
    num_bits = hash_bits;
    should_normalize = normalize;
    ngrams = n_grams;
    tokens_to_skip = skips;

    if (tzer==NULL)
    {
        /* Default tokenizer splits on spaces and tabs. */
        CDelimiterTokenizer* tk = new CDelimiterTokenizer();
        tk->delimiters[(uint8_t) ' '] = 1;
        tk->delimiters[(uint8_t) '\t'] = 1;
        tokenizer = tk;
    }
    else
        tokenizer = tzer;

    /* Hold a reference whether the tokenizer was created here or supplied
     * by the caller; matched by SG_UNREF in the destructor. */
    SG_REF(tokenizer);
    SG_ADD(&num_bits, "num_bits", "Number of bits of the hash",
        MS_NOT_AVAILABLE);
    SG_ADD(&ngrams, "ngrams", "Number of consecutive tokens",
        MS_NOT_AVAILABLE);
    SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip",
        MS_NOT_AVAILABLE);
    SG_ADD(&should_normalize, "should_normalize", "Whether to normalize vectors or not",
        MS_NOT_AVAILABLE);
    m_parameters->add((CSGObject**) &tokenizer, "tokenizer",
        "Tokenizer");
}
00075 
00076 const char* CHashedDocConverter::get_name() const
00077 {
00078     return "HashedDocConverter";
00079 }
00080 
00081 CFeatures* CHashedDocConverter::apply(CFeatures* features)
00082 {
00083     ASSERT(features);
00084     if (strcmp(features->get_name(), "StringFeatures")!=0)
00085         SG_ERROR("CHashedConverter::apply() : CFeatures object passed is not of type CStringFeatures.");
00086 
00087     CStringFeatures<char>* s_features = (CStringFeatures<char>*) features;
00088 
00089     int32_t dim = CMath::pow(2, num_bits);
00090     SGSparseMatrix<float64_t> matrix(dim,features->get_num_vectors());
00091     for (index_t vec_idx=0; vec_idx<s_features->get_num_vectors(); vec_idx++)
00092     {
00093         SGVector<char> doc = s_features->get_feature_vector(vec_idx);
00094         matrix[vec_idx] = apply(doc);
00095         s_features->free_feature_vector(doc, vec_idx);
00096     }
00097 
00098     return (CFeatures*) new CSparseFeatures<float64_t>(matrix);
00099 }
00100 
/** Hashes a single document into a sparse vector of n-gram counts.
 *
 * Tokenizes the document, MurmurHash3-hashes each token, and slides a ring
 * buffer of the last (ngrams + tokens_to_skip) token hashes over the
 * stream, emitting k-skip-n-gram hash indices at every position. The
 * collected indices are then consolidated into a sparse count vector.
 *
 * @param document the document characters (must be non-empty)
 * @return sparse vector of dimension 2^num_bits with per-index counts,
 *         optionally normalized by sqrt(document length)
 */
SGSparseVector<float64_t> CHashedDocConverter::apply(SGVector<char> document)
{
    ASSERT(document.size()>0)
    /* Initial capacity for the collected hash indices; grows if needed. */
    const int32_t array_size = 1024*1024;
    CDynamicArray<uint32_t> hashed_indices(array_size);

    /* Ring buffer of the most recent token hashes; hashes_start/hashes_end
     * are its read/write cursors, wrapping at cached_hashes.vlen. */
    SGVector<uint32_t> cached_hashes(ngrams+tokens_to_skip);
    index_t hashes_start = 0;
    index_t hashes_end = 0;
    int32_t len = cached_hashes.vlen - 1;

    /* Scratch buffer for the n-gram indices produced per window position;
     * sized for the maximum number of (n,skip) combinations. */
    SGVector<index_t> ngram_indices((ngrams-1)*(tokens_to_skip+1) + 1);

    /* Fixed seed keeps hashing deterministic across calls. */
    const int32_t seed = 0xdeadbeaf;
    tokenizer->set_text(document);
    index_t token_start = 0;
    /* Warm-up: fill the ring with the first ngrams-1+tokens_to_skip token
     * hashes so the first full window can be formed below. */
    while (hashes_end<ngrams-1+tokens_to_skip && tokenizer->has_next())
    {
        index_t end = tokenizer->next_token_idx(token_start);
        uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &document.vector[token_start],
                end-token_start, seed);
        cached_hashes[hashes_end++] = token_hash;
    }

    /* Main pass: for every remaining token, append its hash to the ring,
     * emit all n-gram indices for the window starting at hashes_start,
     * then advance both cursors (wrapping around). */
    while (tokenizer->has_next())
    {
        index_t end = tokenizer->next_token_idx(token_start);
        uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &document.vector[token_start],
                end-token_start, seed);
        cached_hashes[hashes_end] = token_hash;

        CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start, len,
                ngram_indices, num_bits, ngrams, tokens_to_skip);

        for (index_t i=0; i<ngram_indices.vlen; i++)
            hashed_indices.append_element(ngram_indices[i]);

        hashes_start++;
        hashes_end++;
        if (hashes_end==cached_hashes.vlen)
            hashes_end = 0;
        if (hashes_start==cached_hashes.vlen)
            hashes_start = 0;
    }

    /* Drain: emit the n-grams for the trailing windows that never got a
     * new token appended, shrinking the usable window length each step.
     * NOTE(review): if the document has fewer tokens than
     * ngrams-1+tokens_to_skip, `len` here still reflects a full ring and
     * unwritten cached_hashes entries may be read — verify short-document
     * behavior against callers/tests. */
    if (ngrams>1)
    {
        while (hashes_start!=hashes_end)
        {
            len--;
            index_t max_idx = CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start,
                    len, ngram_indices, num_bits, ngrams, tokens_to_skip);

            for (index_t i=0; i<max_idx; i++)
                hashed_indices.append_element(ngram_indices[i]);

            hashes_start++;
            if (hashes_start==cached_hashes.vlen)
                hashes_start = 0;
        }
    }

    /* Consolidate duplicate indices into (index, count) sparse entries. */
    SGSparseVector<float64_t> sparse_doc_rep = create_hashed_representation(hashed_indices);

    /* Optionally normalize by sqrt of the document's character length. */
    if (should_normalize)
    {
        float64_t norm_const = CMath::sqrt((float64_t) document.size());
        for (index_t i=0; i<sparse_doc_rep.num_feat_entries; i++)
            sparse_doc_rep.features[i].entry /= norm_const;
    }

    return sparse_doc_rep;
}
00183 
00184 SGSparseVector<float64_t> CHashedDocConverter::create_hashed_representation(CDynamicArray<uint32_t>& hashed_indices)
00185 {
00186     int32_t num_nnz_features = count_distinct_indices(hashed_indices);
00187 
00188     SGSparseVector<float64_t> sparse_doc_rep(num_nnz_features);
00189     index_t sparse_idx = 0;
00190     for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
00191     {
00192         sparse_doc_rep.features[sparse_idx].feat_index = hashed_indices[i];
00193         sparse_doc_rep.features[sparse_idx].entry = 1;
00194         while ( (i+1<hashed_indices.get_num_elements()) &&
00195                 (hashed_indices[i+1]==hashed_indices[i]) )
00196         {
00197             sparse_doc_rep.features[sparse_idx].entry++;
00198             i++;
00199         }
00200         sparse_idx++;
00201     }
00202     return sparse_doc_rep;
00203 }
00204 
/** Generates the hashed indices of all k-skip-n-grams anchored at the
 * window position hashes_start of the ring buffer.
 *
 * Emits the unigram first, then for every order n in [1, ngrams) and every
 * skip s in [0, tokens_to_skip], XOR-combines the n+1 token hashes (with s
 * tokens skipped after the anchor) and masks the result down to num_bits.
 *
 * @param hashes ring buffer of cached token hashes
 * @param hashes_start index of the anchor token in the ring
 * @param len number of tokens available after the anchor in the window
 * @param ngram_hashes output buffer for the generated indices
 * @param num_bits hash width; indices are masked to [0, 2^num_bits)
 * @param ngrams n-gram order
 * @param tokens_to_skip maximum number of skippable tokens
 * @return number of indices written into ngram_hashes
 */
index_t CHashedDocConverter::generate_ngram_hashes(SGVector<uint32_t>& hashes, index_t hashes_start,
    index_t len, SGVector<index_t>& ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
{
    index_t h_idx = 0;
    /* The anchor token alone: mask its hash to num_bits. */
    ngram_hashes[h_idx++] = hashes[hashes_start] & ((1 << num_bits) -1);

    for (index_t n=1; n<ngrams; n++)
    {
        for (index_t s=0; s<=tokens_to_skip; s++)
        {
            /* Not enough tokens left in the window for this (n, s) combo. */
            if ( n+s > len)
                break;

            /* XOR-fold the anchor with the n following hashes, skipping s
             * tokens right after the anchor; % vlen wraps around the ring. */
            uint32_t ngram_hash = hashes[hashes_start];
            for (index_t i=hashes_start+1+s; i<=hashes_start+n+s; i++)
                ngram_hash = ngram_hash ^ hashes[i % hashes.vlen];
            ngram_hash = ngram_hash & ((1 << num_bits) - 1);
            ngram_hashes[h_idx++] = ngram_hash;
        }
    }
    return h_idx;
}
00227 
00228 int32_t CHashedDocConverter::count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices)
00229 {
00230     CMath::qsort(hashed_indices.get_array(), hashed_indices.get_num_elements());
00231 
00233     int32_t num_nnz_features = 0;
00234     for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
00235     {
00236         num_nnz_features++;
00237         while ( (i+1<hashed_indices.get_num_elements()) &&
00238                 (hashed_indices[i+1]==hashed_indices[i]) )
00239         {
00240             i++;
00241         }
00242     }
00243     return num_nnz_features;
00244 }
00245 
00246 void CHashedDocConverter::set_normalization(bool normalize)
00247 {
00248     should_normalize = normalize;
00249 }
00250 
00251 void CHashedDocConverter::set_k_skip_n_grams(int32_t k, int32_t n)
00252 {
00253     tokens_to_skip = k;
00254     ngrams = n;
00255 }
00256 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation