SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StreamingHashedDocDotFeatures.cpp
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2013 Evangelos Anagnostopoulos
00008  * Copyright (C) 2013 Evangelos Anagnostopoulos
00009  */
00010 
#include <shogun/features/streaming/StreamingHashedDocDotFeatures.h>

#include <cstring>

#include <shogun/features/HashedDocDotFeatures.h>
#include <shogun/mathematics/Math.h>
00014 
00015 using namespace shogun;
00016 
00017 CStreamingHashedDocDotFeatures::CStreamingHashedDocDotFeatures(CStreamingFile* file,
00018     bool is_labelled, int32_t size, CTokenizer* tzer, int32_t bits)
00019 : CStreamingDotFeatures()
00020 {
00021     init(file, is_labelled, size, tzer, bits, true, 1, 0);
00022 }
00023 
00024 CStreamingHashedDocDotFeatures::CStreamingHashedDocDotFeatures() : CStreamingDotFeatures()
00025 {
00026     init(NULL, false, 0, NULL, 0, false, 1, 0);
00027 }
00028 
00029 CStreamingHashedDocDotFeatures::CStreamingHashedDocDotFeatures(
00030     CStringFeatures<char>* dot_features, CTokenizer* tzer, int32_t bits, float64_t* lab)
00031 : CStreamingDotFeatures()
00032 {
00033     CStreamingFileFromStringFeatures<char>* file =
00034         new CStreamingFileFromStringFeatures<char>(dot_features, lab);
00035     bool is_labelled = (lab != NULL);
00036     int32_t size=1024;
00037 
00038     init(file, is_labelled, size, tzer, bits, true, 1, 0);
00039 
00040     parser.set_free_vectors_on_destruct(false);
00041     seekable= true;
00042 }
00043 void CStreamingHashedDocDotFeatures::init(CStreamingFile* file, bool is_labelled,
00044     int32_t size, CTokenizer* tzer, int32_t bits, bool normalize, int32_t n_grams, int32_t skips)
00045 {
00046     num_bits = bits;
00047     tokenizer = tzer;
00048     if (tokenizer)
00049     {
00050         SG_REF(tokenizer);
00051         converter = new CHashedDocConverter(tzer, bits, normalize, n_grams, skips);
00052     }
00053     else
00054         converter=NULL;
00055 
00056     SG_ADD(&num_bits, "num_bits", "Number of bits for hash", MS_NOT_AVAILABLE);
00057     SG_ADD((CSGObject** ) &tokenizer, "tokenizer", "The tokenizer used on the documents",
00058         MS_NOT_AVAILABLE);
00059     SG_ADD((CSGObject** ) &converter, "converter", "Converter", MS_NOT_AVAILABLE);
00060 
00061     has_labels = is_labelled;
00062     if (file)
00063     {
00064         working_file = file;
00065         SG_REF(working_file);
00066         parser.init(file, is_labelled, size);
00067         seekable = false;
00068     }
00069     else
00070         working_file = NULL;
00071 
00072     set_read_functions();
00073     parser.set_free_vector_after_release(false);
00074 }
00075 
00076 CStreamingHashedDocDotFeatures::~CStreamingHashedDocDotFeatures()
00077 {
00078     if (parser.is_running())
00079         parser.end_parser();
00080     SG_UNREF(working_file);
00081     SG_UNREF(tokenizer);
00082     SG_UNREF(converter);
00083 }
00084 
00085 float32_t CStreamingHashedDocDotFeatures::dot(CStreamingDotFeatures* df)
00086 {
00087     ASSERT(df)
00088     ASSERT(df->get_name() == get_name())
00089 
00090     CStreamingHashedDocDotFeatures* cdf = (CStreamingHashedDocDotFeatures* ) df;
00091     float32_t result = current_vector.sparse_dot(cdf->current_vector);
00092     return result;
00093 }
00094 
00095 float32_t CStreamingHashedDocDotFeatures::dense_dot(const float32_t* vec2, int32_t vec2_len)
00096 {
00097     ASSERT(vec2_len == CMath::pow(2, num_bits))
00098 
00099     float32_t result = 0;
00100     for (index_t i=0; i<current_vector.num_feat_entries; i++)
00101     {
00102         result += vec2[current_vector.features[i].feat_index] *
00103                     current_vector.features[i].entry;
00104     }
00105     return result;
00106 }
00107 
00108 void CStreamingHashedDocDotFeatures::add_to_dense_vec(float32_t alpha, float32_t* vec2,
00109             int32_t vec2_len, bool abs_val)
00110 {
00111     float32_t value = abs_val ? CMath::abs(alpha) : alpha;
00112 
00113     for (index_t i=0; i<current_vector.num_feat_entries; i++)
00114         vec2[current_vector.features[i].feat_index] += value * current_vector.features[i].entry;
00115 }
00116 
00117 int32_t CStreamingHashedDocDotFeatures::get_dim_feature_space() const
00118 {
00119     return CMath::pow(2, num_bits);
00120 }
00121 
00122 const char* CStreamingHashedDocDotFeatures::get_name() const
00123 {
00124     return "StreamingHashedDocDotFeatures";
00125 }
00126 
00127 CFeatures* CStreamingHashedDocDotFeatures::duplicate() const
00128 {
00129     return new CStreamingHashedDocDotFeatures(*this);
00130 }
00131 
00132 EFeatureType CStreamingHashedDocDotFeatures::get_feature_type() const
00133 {
00134     return F_UINT;
00135 }
00136 
00137 EFeatureClass CStreamingHashedDocDotFeatures::get_feature_class() const
00138 {
00139     return C_STREAMING_SPARSE;
00140 }
00141 
00142 void CStreamingHashedDocDotFeatures::start_parser()
00143 {
00144     if (!parser.is_running())
00145         parser.start_parser();
00146 }
00147 
00148 void CStreamingHashedDocDotFeatures::end_parser()
00149 {
00150     parser.end_parser();
00151 }
00152 
00153 bool CStreamingHashedDocDotFeatures::get_next_example()
00154 {
00155     SGVector<char> tmp;
00156     if (parser.get_next_example(tmp.vector,
00157         tmp.vlen, current_label))
00158     {
00159         ASSERT(tmp.vector)
00160         ASSERT(tmp.vlen > 0)
00161         current_vector = converter->apply(tmp);
00162         return true;
00163     }
00164     return false;
00165 }
00166 
00167 void CStreamingHashedDocDotFeatures::release_example()
00168 {
00169     parser.finalize_example();
00170 }
00171 
00172 int32_t CStreamingHashedDocDotFeatures::get_num_features()
00173 {
00174     return (int32_t) CMath::pow(2, num_bits);
00175 }
00176 
00177 float64_t CStreamingHashedDocDotFeatures::get_label()
00178 {
00179     return current_label;
00180 }
00181 
00182 int32_t CStreamingHashedDocDotFeatures::get_num_vectors() const
00183 {
00184     return 1;
00185 }
00186 
00187 void CStreamingHashedDocDotFeatures::set_vector_reader()
00188 {
00189     parser.set_read_vector(&CStreamingFile::get_string);
00190 }
00191 
00192 void CStreamingHashedDocDotFeatures::set_vector_and_label_reader()
00193 {
00194     parser.set_read_vector_and_label(&CStreamingFile::get_string_and_label);
00195 }
00196 
00197 SGSparseVector<float64_t> CStreamingHashedDocDotFeatures::get_vector()
00198 {
00199     return current_vector;
00200 }
00201 
00202 void CStreamingHashedDocDotFeatures::set_normalization(bool normalize)
00203 {
00204     converter->set_normalization(normalize);
00205 }
00206 
00207 void CStreamingHashedDocDotFeatures::set_k_skip_n_grams(int32_t k, int32_t n)
00208 {
00209     converter->set_k_skip_n_grams(k, n);
00210 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation