SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2013 Evangelos Anagnostopoulos 00008 * Copyright (C) 2013 Evangelos Anagnostopoulos 00009 */ 00010 00011 #include <shogun/features/streaming/StreamingHashedDocDotFeatures.h> 00012 #include <shogun/features/HashedDocDotFeatures.h> 00013 #include <shogun/mathematics/Math.h> 00014 00015 using namespace shogun; 00016 00017 CStreamingHashedDocDotFeatures::CStreamingHashedDocDotFeatures(CStreamingFile* file, 00018 bool is_labelled, int32_t size, CTokenizer* tzer, int32_t bits) 00019 : CStreamingDotFeatures() 00020 { 00021 init(file, is_labelled, size, tzer, bits, true, 1, 0); 00022 } 00023 00024 CStreamingHashedDocDotFeatures::CStreamingHashedDocDotFeatures() : CStreamingDotFeatures() 00025 { 00026 init(NULL, false, 0, NULL, 0, false, 1, 0); 00027 } 00028 00029 CStreamingHashedDocDotFeatures::CStreamingHashedDocDotFeatures( 00030 CStringFeatures<char>* dot_features, CTokenizer* tzer, int32_t bits, float64_t* lab) 00031 : CStreamingDotFeatures() 00032 { 00033 CStreamingFileFromStringFeatures<char>* file = 00034 new CStreamingFileFromStringFeatures<char>(dot_features, lab); 00035 bool is_labelled = (lab != NULL); 00036 int32_t size=1024; 00037 00038 init(file, is_labelled, size, tzer, bits, true, 1, 0); 00039 00040 parser.set_free_vectors_on_destruct(false); 00041 seekable= true; 00042 } 00043 void CStreamingHashedDocDotFeatures::init(CStreamingFile* file, bool is_labelled, 00044 int32_t size, CTokenizer* tzer, int32_t bits, bool normalize, int32_t n_grams, int32_t skips) 00045 { 00046 num_bits = bits; 00047 tokenizer = tzer; 00048 if (tokenizer) 00049 { 00050 SG_REF(tokenizer); 00051 converter = new CHashedDocConverter(tzer, bits, normalize, n_grams, skips); 00052 } 00053 else 00054 converter=NULL; 00055 00056 SG_ADD(&num_bits, "num_bits", "Number of bits for hash", MS_NOT_AVAILABLE); 00057 SG_ADD((CSGObject** ) &tokenizer, "tokenizer", "The tokenizer used on the documents", 00058 MS_NOT_AVAILABLE); 00059 SG_ADD((CSGObject** ) &converter, "converter", "Converter", MS_NOT_AVAILABLE); 00060 00061 has_labels = is_labelled; 00062 if (file) 00063 { 00064 working_file = file; 00065 SG_REF(working_file); 00066 parser.init(file, is_labelled, size); 00067 seekable = false; 00068 } 00069 else 00070 working_file = NULL; 00071 00072 set_read_functions(); 00073 parser.set_free_vector_after_release(false); 00074 } 00075 00076 CStreamingHashedDocDotFeatures::~CStreamingHashedDocDotFeatures() 00077 { 00078 if (parser.is_running()) 00079 parser.end_parser(); 00080 SG_UNREF(working_file); 00081 SG_UNREF(tokenizer); 00082 SG_UNREF(converter); 00083 } 00084 00085 float32_t CStreamingHashedDocDotFeatures::dot(CStreamingDotFeatures* df) 00086 { 00087 ASSERT(df) 00088 ASSERT(df->get_name() == get_name()) 00089 00090 CStreamingHashedDocDotFeatures* cdf = (CStreamingHashedDocDotFeatures* ) df; 00091 float32_t result = current_vector.sparse_dot(cdf->current_vector); 00092 return result; 00093 } 00094 00095 float32_t CStreamingHashedDocDotFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len) 00096 { 00097 ASSERT(vec2_len == CMath::pow(2, num_bits)) 00098 00099 float32_t result = 0; 00100 for (index_t i=0; i<current_vector.num_feat_entries; i++) 00101 { 00102 result += vec2[current_vector.features[i].feat_index] * 00103 current_vector.features[i].entry; 00104 } 00105 return result; 00106 } 00107 00108 void CStreamingHashedDocDotFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2, 00109 int32_t vec2_len, bool abs_val) 00110 { 00111 float32_t value = abs_val ? CMath::abs(alpha) : alpha; 00112 00113 for (index_t i=0; i<current_vector.num_feat_entries; i++) 00114 vec2[current_vector.features[i].feat_index] += value * current_vector.features[i].entry; 00115 } 00116 00117 int32_t CStreamingHashedDocDotFeatures::get_dim_feature_space() const 00118 { 00119 return CMath::pow(2, num_bits); 00120 } 00121 00122 const char* CStreamingHashedDocDotFeatures::get_name() const 00123 { 00124 return "StreamingHashedDocDotFeatures"; 00125 } 00126 00127 CFeatures* CStreamingHashedDocDotFeatures::duplicate() const 00128 { 00129 return new CStreamingHashedDocDotFeatures(*this); 00130 } 00131 00132 EFeatureType CStreamingHashedDocDotFeatures::get_feature_type() const 00133 { 00134 return F_UINT; 00135 } 00136 00137 EFeatureClass CStreamingHashedDocDotFeatures::get_feature_class() const 00138 { 00139 return C_STREAMING_SPARSE; 00140 } 00141 00142 void CStreamingHashedDocDotFeatures::start_parser() 00143 { 00144 if (!parser.is_running()) 00145 parser.start_parser(); 00146 } 00147 00148 void CStreamingHashedDocDotFeatures::end_parser() 00149 { 00150 parser.end_parser(); 00151 } 00152 00153 bool CStreamingHashedDocDotFeatures::get_next_example() 00154 { 00155 SGVector<char> tmp; 00156 if (parser.get_next_example(tmp.vector, 00157 tmp.vlen, current_label)) 00158 { 00159 ASSERT(tmp.vector) 00160 ASSERT(tmp.vlen > 0) 00161 current_vector = converter->apply(tmp); 00162 return true; 00163 } 00164 return false; 00165 } 00166 00167 void CStreamingHashedDocDotFeatures::release_example() 00168 { 00169 parser.finalize_example(); 00170 } 00171 00172 int32_t CStreamingHashedDocDotFeatures::get_num_features() 00173 { 00174 return (int32_t) CMath::pow(2, num_bits); 00175 } 00176 00177 float64_t CStreamingHashedDocDotFeatures::get_label() 00178 { 00179 return current_label; 00180 } 00181 00182 int32_t CStreamingHashedDocDotFeatures::get_num_vectors() const 00183 { 00184 return 1; 00185 } 00186 00187 void CStreamingHashedDocDotFeatures::set_vector_reader() 00188 { 00189 parser.set_read_vector(&CStreamingFile::get_string); 00190 } 00191 00192 void CStreamingHashedDocDotFeatures::set_vector_and_label_reader() 00193 { 00194 parser.set_read_vector_and_label(&CStreamingFile::get_string_and_label); 00195 } 00196 00197 SGSparseVector<float64_t> CStreamingHashedDocDotFeatures::get_vector() 00198 { 00199 return current_vector; 00200 } 00201 00202 void CStreamingHashedDocDotFeatures::set_normalization(bool normalize) 00203 { 00204 converter->set_normalization(normalize); 00205 } 00206 00207 void CStreamingHashedDocDotFeatures::set_k_skip_n_grams(int32_t k, int32_t n) 00208 { 00209 converter->set_k_skip_n_grams(k, n); 00210 }