SHOGUN
v3.2.0
|
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2013 Evangelos Anagnostopoulos
 * Copyright (C) 2013 Evangelos Anagnostopoulos
 */

#include <shogun/features/HashedDocDotFeatures.h>
#include <shogun/lib/DelimiterTokenizer.h>
#include <shogun/lib/Hash.h>
#include <shogun/mathematics/Math.h>

namespace shogun
{
/** Construct hashed document features over a string document collection.
 *
 * @param hash_bits number of bits of the hash; the feature space has
 *        dimension 2^hash_bits (see get_dim_feature_space())
 * @param docs document collection (raw char strings), ref-counted in init()
 * @param tzer tokenizer splitting each document into tokens; if NULL a
 *        whitespace CDelimiterTokenizer is created in init()
 * @param normalize whether dot products are normalized by sqrt(doc length)
 * @param n_grams number of consecutive tokens combined per feature
 * @param skips number of tokens that may be skipped when forming n-grams
 * @param size cache size forwarded to CDotFeatures
 */
CHashedDocDotFeatures::CHashedDocDotFeatures(int32_t hash_bits, CStringFeatures<char>* docs,
	CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips, int32_t size) : CDotFeatures(size)
{
	/* clamp nonsensical arguments: at least unigrams */
	if (n_grams < 1)
		n_grams = 1;

	/* skipping tokens is meaningless for unigrams; negative skips are invalid */
	if ( (n_grams==1 && skips!=0) || (skips<0))
		skips = 0;

	init(hash_bits, docs, tzer, normalize, n_grams, skips);
}

/** Copy constructor: shares (ref-counts, via init()) the original's document
 * collection and tokenizer rather than deep-copying them. */
CHashedDocDotFeatures::CHashedDocDotFeatures(const CHashedDocDotFeatures& orig)
	: CDotFeatures(orig)
{
	init(orig.num_bits, orig.doc_collection, orig.tokenizer, orig.should_normalize,
		orig.ngrams, orig.tokens_to_skip);
}

/** File-loading constructor — not implemented.
 * NOTE(review): init() is never called here, so members (doc_collection,
 * tokenizer, ...) stay uninitialized; the object is unusable after
 * SG_NOTIMPLEMENTED reports. Confirm SG_NOTIMPLEMENTED aborts/throws. */
CHashedDocDotFeatures::CHashedDocDotFeatures(CFile* loader)
{
	SG_NOTIMPLEMENTED;
}

/** Shared initialization: stores all settings, creates a default whitespace
 * tokenizer when none was supplied, registers parameters for serialization
 * and takes references on the collection and tokenizer.
 *
 * Parameter registration (SG_ADD) order and the final SG_REF calls follow
 * the Shogun framework's conventions — do not reorder casually. */
void CHashedDocDotFeatures::init(int32_t hash_bits, CStringFeatures<char>* docs,
	CTokenizer* tzer, bool normalize, int32_t n_grams, int32_t skips)
{
	num_bits = hash_bits;
	ngrams = n_grams;
	tokens_to_skip = skips;
	doc_collection = docs;
	tokenizer = tzer;
	should_normalize = normalize;

	/* fall back to splitting on whitespace when no tokenizer was given */
	if (!tokenizer)
	{
		tokenizer = new CDelimiterTokenizer();
		((CDelimiterTokenizer* )tokenizer)->init_for_whitespace();
	}

	SG_ADD(&num_bits, "num_bits", "Number of bits of hash", MS_NOT_AVAILABLE);
	SG_ADD(&ngrams, "ngrams", "Number of tokens to combine for quadratic feature support",
		MS_NOT_AVAILABLE);
	SG_ADD(&tokens_to_skip, "tokens_to_skip", "Number of tokens to skip when combining features",
		MS_NOT_AVAILABLE);
	SG_ADD((CSGObject**) &doc_collection, "doc_collection", "Document collection",
		MS_NOT_AVAILABLE);
	SG_ADD((CSGObject**) &tokenizer, "tokenizer", "Document tokenizer",
		MS_NOT_AVAILABLE);
	SG_ADD(&should_normalize, "should_normalize", "Normalize or not the dot products",
		MS_NOT_AVAILABLE);

	SG_REF(doc_collection);
	SG_REF(tokenizer);
}

/** Destructor: releases the references taken in init(). */
CHashedDocDotFeatures::~CHashedDocDotFeatures()
{
	SG_UNREF(doc_collection);
	SG_UNREF(tokenizer);
}

/** @return dimension of the hashed feature space, 2^num_bits */
int32_t CHashedDocDotFeatures::get_dim_feature_space() const
{
	return CMath::pow(2, num_bits);
}

/** Dot product between document vec_idx1 of this collection and document
 * vec_idx2 of another CHashedDocDotFeatures object.
 *
 * Both documents are converted to sparse hashed vectors with a temporary
 * CHashedDocConverter (configured identically to this object) and the
 * sparse dot product is returned.
 *
 * NOTE(review): the name check compares const char* POINTERS, not contents;
 * it relies on both objects returning the same string literal. strcmp would
 * be the robust comparison — confirm against the project's ASSERT idiom. */
float64_t CHashedDocDotFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
{
	ASSERT(df)
	ASSERT(df->get_name() == get_name())

	CHashedDocDotFeatures* hddf = (CHashedDocDotFeatures*) df;

	SGVector<char> sv1 = doc_collection->get_feature_vector(vec_idx1);
	SGVector<char> sv2 = hddf->doc_collection->get_feature_vector(vec_idx2);

	/* converter reproduces this object's hashing configuration */
	CHashedDocConverter* converter = new CHashedDocConverter(tokenizer, num_bits,
		should_normalize, ngrams, tokens_to_skip);
	SGSparseVector<float64_t> cv1 = converter->apply(sv1);
	SGSparseVector<float64_t> cv2 = converter->apply(sv2);
	float64_t result = SGSparseVector<float64_t>::sparse_dot(cv1,cv2);

	doc_collection->free_feature_vector(sv1, vec_idx1);
	hddf->doc_collection->free_feature_vector(sv2, vec_idx2);
	SG_UNREF(converter);

	return result;
}

/** Convenience overload of dense_dot() taking an SGVector. */
float64_t CHashedDocDotFeatures::dense_dot_sgvec(int32_t vec_idx1, const SGVector<float64_t> vec2)
{
	return dense_dot(vec_idx1, vec2.vector, vec2.vlen);
}
/** Dot product between the hashed representation of document vec_idx1 and a
 * dense vector vec2 of dimension 2^num_bits.
 *
 * Tokens are hashed on the fly (MurmurHash3) into a small ring buffer of
 * size ngrams+tokens_to_skip; for every new token, all n-gram combinations
 * ending at that token are expanded into hashed feature indices via
 * CHashedDocConverter::generate_ngram_hashes() and the corresponding vec2
 * entries are accumulated. A drain loop then emits the shorter n-grams at
 * the end of the document.
 *
 * @param vec_idx1 index of the document in the collection
 * @param vec2 dense vector of length 2^num_bits
 * @param vec2_len length of vec2 (asserted to be 2^num_bits)
 * @return the (optionally length-normalized) dot product
 */
float64_t CHashedDocDotFeatures::dense_dot(int32_t vec_idx1, float64_t* vec2, int32_t vec2_len)
{
	ASSERT(vec2_len == CMath::pow(2,num_bits))

	SGVector<char> sv = doc_collection->get_feature_vector(vec_idx1);

	/* ring buffer of the most recent token hashes; hashes_start/hashes_end
	 * wrap around, len is the window size used when forming n-grams */
	SGVector<uint32_t> hashes(ngrams+tokens_to_skip);
	index_t hashes_start = 0;
	index_t hashes_end = 0;
	int32_t len = hashes.vlen - 1;

	/* scratch space for all n-gram feature indices produced per token */
	SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);

	float64_t result = 0;
	/* copy the tokenizer so concurrent calls don't share scan state */
	CTokenizer* local_tzer = tokenizer->get_copy();

	/* fixed MurmurHash3 seed so hashing is deterministic across calls */
	const int32_t seed = 0xdeadbeaf;
	local_tzer->set_text(sv);
	index_t start = 0;
	/* prime the ring buffer with the first ngrams-1+tokens_to_skip tokens */
	while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
	{
		index_t end = local_tzer->next_token_idx(start);
		uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
		hashes[hashes_end++] = token_hash;
	}

	/* main loop: each remaining token completes a full window */
	while (local_tzer->has_next())
	{
		index_t end = local_tzer->next_token_idx(start);
		uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
		hashes[hashes_end] = token_hash;

		CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
			num_bits, ngrams, tokens_to_skip);

		for (index_t i=0; i<hashed_indices.vlen; i++)
			result += vec2[hashed_indices[i]];

		/* advance the ring buffer, wrapping both cursors */
		hashes_start++;
		hashes_end++;
		if (hashes_end==hashes.vlen)
			hashes_end = 0;
		if (hashes_start==hashes.vlen)
			hashes_start = 0;
	}

	/* drain: emit the progressively shorter n-grams at the document tail */
	if (ngrams>1)
	{
		while (hashes_start!=hashes_end)
		{
			len--;
			index_t max_idx = CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start,
				len, hashed_indices, num_bits, ngrams, tokens_to_skip);

			for (index_t i=0; i<max_idx; i++)
				result += vec2[hashed_indices[i]];

			hashes_start++;
			if (hashes_start==hashes.vlen)
				hashes_start = 0;
		}
	}
	doc_collection->free_feature_vector(sv, vec_idx1);
	SG_UNREF(local_tzer);
	/* NOTE(review): sv.size() is read AFTER free_feature_vector(); sv is a
	 * local struct copy so vlen is presumably still valid, but confirm the
	 * SGVector lifetime semantics — reading before the free would be safer. */
	return should_normalize ? result / CMath::sqrt((float64_t) sv.size()) : result;
}

/** Adds alpha times the hashed representation of document vec_idx1 to the
 * dense vector vec2 (dimension 2^num_bits). Mirrors dense_dot(): same ring
 * buffer / n-gram expansion, but scatters `value` into vec2 instead of
 * accumulating a dot product.
 *
 * @param alpha scale factor (made non-negative when abs_val is true)
 * @param vec_idx1 index of the document in the collection
 * @param vec2 dense accumulator of length 2^num_bits, modified in place
 * @param vec2_len length of vec2 (asserted to be 2^num_bits)
 * @param abs_val if true, use |alpha|
 */
void CHashedDocDotFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1,
	float64_t* vec2, int32_t vec2_len, bool abs_val)
{
	ASSERT(vec2_len == CMath::pow(2,num_bits))

	if (abs_val)
		alpha = CMath::abs(alpha);

	SGVector<char> sv = doc_collection->get_feature_vector(vec_idx1);
	/* normalization folds into the added value so the loops stay cheap */
	const float64_t value = should_normalize ? alpha / CMath::sqrt((float64_t) sv.size()) : alpha;

	/* ring buffer of recent token hashes (see dense_dot for the scheme) */
	SGVector<uint32_t> hashes(ngrams+tokens_to_skip);
	index_t hashes_start = 0;
	index_t hashes_end = 0;
	index_t len = hashes.vlen - 1;

	/* scratch space for the n-gram feature indices produced per token */
	SGVector<index_t> hashed_indices((ngrams-1)*(tokens_to_skip+1) + 1);

	CTokenizer* local_tzer = tokenizer->get_copy();

	/* fixed MurmurHash3 seed, identical to dense_dot for consistency */
	const int32_t seed = 0xdeadbeaf;
	local_tzer->set_text(sv);
	index_t start = 0;
	/* prime the ring buffer */
	while (hashes_end<ngrams-1+tokens_to_skip && local_tzer->has_next())
	{
		index_t end = local_tzer->next_token_idx(start);
		uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
		hashes[hashes_end++] = token_hash;
	}

	/* main loop: scatter `value` at every generated feature index */
	while (local_tzer->has_next())
	{
		index_t end = local_tzer->next_token_idx(start);
		uint32_t token_hash = CHash::MurmurHash3((uint8_t* ) &sv.vector[start], end-start, seed);
		hashes[hashes_end] = token_hash;

		CHashedDocConverter::generate_ngram_hashes(hashes, hashes_start, len, hashed_indices,
			num_bits, ngrams, tokens_to_skip);

		for (index_t i=0; i<hashed_indices.vlen; i++)
			vec2[hashed_indices[i]] += value;

		hashes_start++;
		hashes_end++;
		if (hashes_end==hashes.vlen)
			hashes_end = 0;
		if (hashes_start==hashes.vlen)
			hashes_start = 0;
	}

	/* drain the tail n-grams, as in dense_dot */
	if (ngrams>1)
	{
		while (hashes_start!=hashes_end)
		{
			len--;
			index_t max_idx = CHashedDocConverter::generate_ngram_hashes(hashes,
				hashes_start, len, hashed_indices, num_bits, ngrams, tokens_to_skip);

			for (index_t i=0; i<max_idx; i++)
				vec2[hashed_indices[i]] += value;

			hashes_start++;
			if (hashes_start==hashes.vlen)
				hashes_start = 0;
		}
	}

	doc_collection->free_feature_vector(sv, vec_idx1);
	SG_UNREF(local_tzer);
}

/** Hash a single token with MurmurHash3 and fold it into the num_bits-wide
 * feature space by masking the low bits.
 *
 * @param token pointer to the token's characters
 * @param length token length in bytes
 * @param num_bits number of bits kept from the hash
 * @param seed MurmurHash3 seed
 * @return hash value in [0, 2^num_bits)
 */
uint32_t CHashedDocDotFeatures::calculate_token_hash(char* token,
	int32_t length, int32_t num_bits, uint32_t seed)
{
	int32_t hash = CHash::MurmurHash3((uint8_t* ) token, length, seed);
	return hash & ((1 << num_bits) - 1);
}

/** Replace the document collection.
 * NOTE(review): the old collection is SG_UNREF'd but the new one is NOT
 * SG_REF'd, which is asymmetric with init()/the destructor — confirm the
 * caller is expected to transfer its reference. */
void CHashedDocDotFeatures::set_doc_collection(CStringFeatures<char>* docs)
{
	SG_UNREF(doc_collection);
	doc_collection = docs;
}

/** @return number of non-zero features for document num; approximated by the
 * document's character length (an upper bound on distinct hashed features). */
int32_t CHashedDocDotFeatures::get_nnz_features_for_vector(int32_t num)
{
	SGVector<char> sv = doc_collection->get_feature_vector(num);
	int32_t num_nnz_features = sv.size();
	doc_collection->free_feature_vector(sv, num);
	return num_nnz_features;
}

/** Feature iteration is not supported for hashed document features. */
void* CHashedDocDotFeatures::get_feature_iterator(int32_t vector_index)
{
	SG_NOTIMPLEMENTED;
	return NULL;
}

/** Feature iteration is not supported for hashed document features. */
bool CHashedDocDotFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator)
{
	SG_NOTIMPLEMENTED;
	return false;
}

/** Feature iteration is not supported for hashed document features. */
void CHashedDocDotFeatures::free_feature_iterator(void* iterator)
{
	SG_NOTIMPLEMENTED;
}

/** @return the object name "HashedDocDotFeatures" */
const char* CHashedDocDotFeatures::get_name() const
{
	return "HashedDocDotFeatures";
}

/** @return a shallow duplicate (shares collection/tokenizer via refcounts) */
CFeatures* CHashedDocDotFeatures::duplicate() const
{
	return new CHashedDocDotFeatures(*this);
}

/** @return feature type, F_UINT (hashed indices are unsigned integers) */
EFeatureType CHashedDocDotFeatures::get_feature_type() const
{
	return F_UINT;
}

/** @return feature class, C_SPARSE (hashed documents are sparse vectors) */
EFeatureClass CHashedDocDotFeatures::get_feature_class() const
{
	return C_SPARSE;
}

/** @return number of documents in the underlying collection */
int32_t CHashedDocDotFeatures::get_num_vectors() const
{
	return doc_collection->get_num_vectors();
}
}