SHOGUN
v3.2.0
HashedDocConverter.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2013 Evangelos Anagnostopoulos
 * Copyright (C) 2013 Evangelos Anagnostopoulos
 */

#include <shogun/converter/HashedDocConverter.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/Hash.h>
#include <shogun/lib/DynamicArray.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/features/HashedDocDotFeatures.h>
#include <shogun/mathematics/Math.h>

using namespace shogun;

namespace shogun
{
CHashedDocConverter::CHashedDocConverter() : CConverter()
{
    init(NULL, 16, false, 1, 0);
}

CHashedDocConverter::CHashedDocConverter(int32_t hash_bits, bool normalize,
    int32_t n_grams, int32_t skips) : CConverter()
{
    init(NULL, hash_bits, normalize, n_grams, skips);
}

CHashedDocConverter::CHashedDocConverter(CTokenizer* tzer,
    int32_t hash_bits, bool normalize, int32_t n_grams, int32_t skips) : CConverter()
{
    init(tzer, hash_bits, normalize, n_grams, skips);
}

CHashedDocConverter::~CHashedDocConverter()
{
    SG_UNREF(tokenizer);
}

void CHashedDocConverter::init(CTokenizer* tzer, int32_t hash_bits, bool normalize,
    int32_t n_grams, int32_t skips)
{
    num_bits = hash_bits;
    should_normalize = normalize;
    ngrams = n_grams;
    tokens_to_skip = skips;

    /* If no tokenizer was given, default to splitting on spaces and tabs */
    if (tzer==NULL)
    {
        CDelimiterTokenizer* tk = new CDelimiterTokenizer();
        tk->delimiters[(uint8_t) ' '] = 1;
        tk->delimiters[(uint8_t) '\t'] = 1;
        tokenizer = tk;
    }
    else
        tokenizer = tzer;

    SG_REF(tokenizer);
    SG_ADD(&num_bits, "num_bits", "Number of bits of the hash",
        MS_NOT_AVAILABLE);
    SG_ADD(&ngrams, "ngrams", "Number of consecutive tokens",
        MS_NOT_AVAILABLE);
    SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip",
        MS_NOT_AVAILABLE);
    SG_ADD(&should_normalize, "should_normalize", "Whether to normalize vectors or not",
        MS_NOT_AVAILABLE);
    m_parameters->add((CSGObject**) &tokenizer, "tokenizer",
        "Tokenizer");
}

const char* CHashedDocConverter::get_name() const
{
    return "HashedDocConverter";
}

CFeatures* CHashedDocConverter::apply(CFeatures* features)
{
    ASSERT(features);
    if (strcmp(features->get_name(), "StringFeatures")!=0)
        SG_ERROR("CHashedDocConverter::apply() : CFeatures object passed is not of type CStringFeatures.");

    CStringFeatures<char>* s_features = (CStringFeatures<char>*) features;

    /* The hashed space has 2^num_bits dimensions */
    int32_t dim = CMath::pow(2, num_bits);
    SGSparseMatrix<float64_t> matrix(dim, features->get_num_vectors());
    for (index_t vec_idx=0; vec_idx<s_features->get_num_vectors(); vec_idx++)
    {
        SGVector<char> doc = s_features->get_feature_vector(vec_idx);
        matrix[vec_idx] = apply(doc);
        s_features->free_feature_vector(doc, vec_idx);
    }

    return (CFeatures*) new CSparseFeatures<float64_t>(matrix);
}
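/*
 * Explanatory note (not part of the original sources): the per-document
 * conversion below works in three stages. Each token is hashed with
 * MurmurHash3; a ring buffer caches the hashes of the ngrams+tokens_to_skip
 * most recent tokens; at every window position all k-skip-n-gram indices are
 * generated by XOR-ing cached token hashes and masking the result to the
 * lowest num_bits bits; finally the collected indices are aggregated into a
 * sparse count vector.
 */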
SGSparseVector<float64_t> CHashedDocConverter::apply(SGVector<char> document)
{
    ASSERT(document.size()>0);
    /* Array which will collect all hashed indices generated from the document */
    const int32_t array_size = 1024*1024;
    CDynamicArray<uint32_t> hashed_indices(array_size);

    /* Ring buffer caching the hashes of the ngrams+tokens_to_skip most recent tokens */
    SGVector<uint32_t> cached_hashes(ngrams+tokens_to_skip);
    index_t hashes_start = 0;
    index_t hashes_end = 0;
    int32_t len = cached_hashes.vlen - 1;

    /* Buffer for the indices generated from one window position;
     * (ngrams-1)*(tokens_to_skip+1) + 1 is the maximum number per head token */
    SGVector<index_t> ngram_indices((ngrams-1)*(tokens_to_skip+1) + 1);

    /* Fixed seed so that MurmurHash3 produces reproducible token hashes */
    const int32_t seed = 0xdeadbeaf;
    tokenizer->set_text(document);
    index_t token_start = 0;

    /* Fill the ring buffer with the first ngrams-1+tokens_to_skip token hashes */
    while (hashes_end<ngrams-1+tokens_to_skip && tokenizer->has_next())
    {
        index_t end = tokenizer->next_token_idx(token_start);
        uint32_t token_hash = CHash::MurmurHash3((uint8_t*) &document.vector[token_start],
            end-token_start, seed);
        cached_hashes[hashes_end++] = token_hash;
    }

    /* For every further token, emit all n-grams headed by the oldest cached token */
    while (tokenizer->has_next())
    {
        index_t end = tokenizer->next_token_idx(token_start);
        uint32_t token_hash = CHash::MurmurHash3((uint8_t*) &document.vector[token_start],
            end-token_start, seed);
        cached_hashes[hashes_end] = token_hash;

        CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start, len,
            ngram_indices, num_bits, ngrams, tokens_to_skip);

        for (index_t i=0; i<ngram_indices.vlen; i++)
            hashed_indices.append_element(ngram_indices[i]);

        hashes_start++;
        hashes_end++;
        if (hashes_end==cached_hashes.vlen)
            hashes_end = 0;
        if (hashes_start==cached_hashes.vlen)
            hashes_start = 0;
    }

    /* Drain the ring buffer: emit the n-grams of the trailing tokens */
    if (ngrams>1)
    {
        while (hashes_start!=hashes_end)
        {
            len--;
            index_t max_idx = CHashedDocConverter::generate_ngram_hashes(cached_hashes, hashes_start,
                len, ngram_indices, num_bits, ngrams, tokens_to_skip);

            for (index_t i=0; i<max_idx; i++)
                hashed_indices.append_element(ngram_indices[i]);

            hashes_start++;
            if (hashes_start==cached_hashes.vlen)
                hashes_start = 0;
        }
    }

    SGSparseVector<float64_t> sparse_doc_rep = create_hashed_representation(hashed_indices);

    /* Normalize the vector by the square root of the document length */
    if (should_normalize)
    {
        float64_t norm_const = CMath::sqrt((float64_t) document.size());
        for (index_t i=0; i<sparse_doc_rep.num_feat_entries; i++)
            sparse_doc_rep.features[i].entry /= norm_const;
    }

    return sparse_doc_rep;
}

SGSparseVector<float64_t> CHashedDocConverter::create_hashed_representation(CDynamicArray<uint32_t>& hashed_indices)
{
    /* count_distinct_indices() sorts hashed_indices, so equal indices become adjacent */
    int32_t num_nnz_features = count_distinct_indices(hashed_indices);

    SGSparseVector<float64_t> sparse_doc_rep(num_nnz_features);
    index_t sparse_idx = 0;
    for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
    {
        sparse_doc_rep.features[sparse_idx].feat_index = hashed_indices[i];
        sparse_doc_rep.features[sparse_idx].entry = 1;
        while ( (i+1<hashed_indices.get_num_elements()) &&
                (hashed_indices[i+1]==hashed_indices[i]) )
        {
            sparse_doc_rep.features[sparse_idx].entry++;
            i++;
        }
        sparse_idx++;
    }
    return sparse_doc_rep;
}
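/*
 * Illustrative note (not in the original sources), using hypothetical token
 * hashes h0, h1, h2: with ngrams=2 and tokens_to_skip=1, the window headed
 * by h0 yields the indices
 *     h0,  h0^h1,  h0^h2
 * i.e. the unigram, the bigram, and the 1-skip bigram, each masked with
 * ((1 << num_bits) - 1) so that it falls inside the 2^num_bits-dimensional
 * hashed space. While the trailing window is drained, len shrinks and the
 * n+s > len check below cuts off n-grams that would run past the document.
 */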
index_t CHashedDocConverter::generate_ngram_hashes(SGVector<uint32_t>& hashes, index_t hashes_start,
    index_t len, SGVector<index_t>& ngram_hashes, int32_t num_bits, int32_t ngrams, int32_t tokens_to_skip)
{
    index_t h_idx = 0;
    ngram_hashes[h_idx++] = hashes[hashes_start] & ((1 << num_bits) - 1);

    for (index_t n=1; n<ngrams; n++)
    {
        for (index_t s=0; s<=tokens_to_skip; s++)
        {
            if (n+s > len)
                break;

            uint32_t ngram_hash = hashes[hashes_start];
            for (index_t i=hashes_start+1+s; i<=hashes_start+n+s; i++)
                ngram_hash = ngram_hash ^ hashes[i % hashes.vlen];
            ngram_hash = ngram_hash & ((1 << num_bits) - 1);
            ngram_hashes[h_idx++] = ngram_hash;
        }
    }
    return h_idx;
}

int32_t CHashedDocConverter::count_distinct_indices(CDynamicArray<uint32_t>& hashed_indices)
{
    CMath::qsort(hashed_indices.get_array(), hashed_indices.get_num_elements());

    /* After sorting, equal indices are adjacent, so count the groups */
    int32_t num_nnz_features = 0;
    for (index_t i=0; i<hashed_indices.get_num_elements(); i++)
    {
        num_nnz_features++;
        while ( (i+1<hashed_indices.get_num_elements()) &&
                (hashed_indices[i+1]==hashed_indices[i]) )
        {
            i++;
        }
    }
    return num_nnz_features;
}

void CHashedDocConverter::set_normalization(bool normalize)
{
    should_normalize = normalize;
}

void CHashedDocConverter::set_k_skip_n_grams(int32_t k, int32_t n)
{
    tokens_to_skip = k;
    ngrams = n;
}
}
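A minimal usage sketch, not part of the file above, assuming the Shogun 3.x C++ API as it appears in this source (init_shogun_with_defaults, SGStringList/SGString, CStringFeatures); the two corpus strings are made up for illustration. It converts the documents into a 256-dimensional hashed sparse representation with normalized bigrams and no skips:

#include <shogun/base/init.h>
#include <shogun/converter/HashedDocConverter.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/features/SparseFeatures.h>
#include <cstring>

using namespace shogun;

int main()
{
    init_shogun_with_defaults();

    /* Hypothetical toy corpus of two documents */
    const char* doc_1 = "the quick brown fox";
    const char* doc_2 = "jumps over the lazy dog";

    SGStringList<char> list(2, 32);
    list.strings[0] = SGString<char>(strlen(doc_1));
    memcpy(list.strings[0].string, doc_1, strlen(doc_1));
    list.strings[1] = SGString<char>(strlen(doc_2));
    memcpy(list.strings[1].string, doc_2, strlen(doc_2));

    CStringFeatures<char>* docs = new CStringFeatures<char>(list, RAWBYTE);

    /* 8 hash bits -> 2^8 = 256 dimensions; normalize, bigrams, no skips */
    CHashedDocConverter* converter = new CHashedDocConverter(8, true, 2, 0);
    CSparseFeatures<float64_t>* hashed =
        (CSparseFeatures<float64_t>*) converter->apply(docs);

    SG_UNREF(hashed);
    SG_UNREF(converter);
    SG_UNREF(docs);
    exit_shogun();
    return 0;
}

The default tokenizer created in init() splits on spaces and tabs, so each word above becomes one token before hashing.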