SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StringFeatures.h
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Written (W) 2011-2012 Heiko Strathmann
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef _CSTRINGFEATURES__H__
00014 #define _CSTRINGFEATURES__H__
00015 
00016 #include <shogun/lib/common.h>
00017 #include <shogun/lib/Cache.h>
00018 #include <shogun/lib/DynamicArray.h>
00019 #include <shogun/lib/Compressor.h>
00020 #include <shogun/io/File.h>
00021 
00022 #include <shogun/features/Features.h>
00023 #include <shogun/features/Alphabet.h>
00024 
00025 namespace shogun
00026 {
// Forward declarations of the shogun types named in the declarations below.
// NOTE(review): several of these appear to be covered by the headers included
// above (Alphabet.h, DynamicArray.h, File.h) — confirm before removing any.
00027 class CAlphabet;
00028 template <class T> class CDynamicArray;
00029 class CFile;
00030 template <class T> class SGString;
00031 template <class T> class SGStringList;
00032 
00033 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Plain aggregate of three int fields, excluded from the generated docs by the
// enclosing DOXYGEN_SHOULD_SKIP_THIS guard. The meaning of the fields is not
// visible in this header — presumably two feature values plus a group id;
// TODO confirm against the code that uses this struct.
00034 struct SSKDoubleFeature
00035 {
00036     int feature1;  // first feature value
00037     int feature2;  // second feature value
00038     int group;     // group identifier (semantics not shown here)
00039 };
00040 
// Plain aggregate of four int fields: the three-feature counterpart of
// SSKDoubleFeature, likewise hidden from the generated docs by the enclosing
// DOXYGEN_SHOULD_SKIP_THIS guard. Field meanings are not visible in this
// header — presumably three feature values plus a group id; TODO confirm.
00041 struct SSKTripleFeature
00042 {
00043     int feature1;  // first feature value
00044     int feature2;  // second feature value
00045     int feature3;  // third feature value
00046     int group;     // group identifier (semantics not shown here)
00047 };
00048 #endif
00049 
// String-valued feature container: a class template derived from CFeatures.
// ST is the per-position element type; strings are held as SGString<ST>
// entries (member `features`) together with a CAlphabet (member `alphabet`).
// Only declarations are visible in this listing, so the notes below describe
// the signatures; anything about runtime behaviour is marked as an assumption.
00073 template <class ST> class CStringFeatures : public CFeatures
00074 {
00075     public:
              // ---- construction / destruction ----

              // Default constructor.
00077         CStringFeatures();
00078 
              // Construct with an alphabet selected by enum value.
00083         CStringFeatures(EAlphabet alpha);
00084 
              // Construct from a list of strings plus an alphabet enum value.
00090         CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
00091 
              // Construct from a list of strings plus an alphabet object.
00097         CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
00098 
              // Construct with an alphabet object only.
00103         CStringFeatures(CAlphabet* alpha);
00104 
              // Copy constructor.
00109         CStringFeatures(const CStringFeatures& orig);
00110 
              // Construct from a file loader; the alphabet defaults to DNA.
00116         CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
00117 
              // Virtual destructor.
00119         virtual ~CStringFeatures();
00120 
              // ---- cleanup ----
              // Presumably release all held data / a single vector / a range of
              // vectors respectively — implementation not shown; TODO confirm
              // whether `stop` is inclusive.
00126         virtual void cleanup();
00127 
00134         virtual void cleanup_feature_vector(int32_t num);
00135 
00143         virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
00144 
              // ---- type information ----
              // Const virtual queries returning the feature class / feature type enums.
00149         virtual EFeatureClass get_feature_class() const;
00150 
00155         virtual EFeatureType get_feature_type() const;
00156 
              // Returns the alphabet pointer. NOTE(review): ownership / reference
              // counting of the returned object is not visible here — confirm.
00161         CAlphabet* get_alphabet();
00162 
              // Returns a CFeatures* copy of this object (const, virtual).
00167         virtual CFeatures* duplicate() const;
00168 
              // ---- per-vector access ----

              // Returns vector number `num` as an SGVector<ST>.
00176         SGVector<ST> get_feature_vector(int32_t num);
00177 
              // Stores `vector` at index `num`. Note the argument order
              // (vector, num) differs from the raw-pointer overload further
              // down, which takes (num, string, len).
00185         void set_feature_vector(SGVector<ST> vector, int32_t num);
00186 
              // Presumably toggle the `preprocess_on_get` member — TODO confirm.
00188         void enable_on_the_fly_preprocessing();
00189 
00193         void disable_on_the_fly_preprocessing();
00194 
              // Raw-pointer variant: returns vector `num`; `len` and `dofree`
              // are output parameters (passed by reference). Presumably `dofree`
              // must be handed back to free_feature_vector() — TODO confirm.
00205         ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
00206 
              // Returns the transposed data as a new CStringFeatures<ST>.
00213         CStringFeatures<ST>* get_transposed();
00214 
              // Returns the transposed data as a raw SGString<ST> array;
              // `num_feat` and `num_vec` are output parameters.
00228         SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
00229 
              // Counterparts to the two get_feature_vector() overloads above.
00238         void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
00239 
00247         void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
00248 
              // Returns the single element at position `feat_num` of vector `vec_num`.
00257         virtual ST get_feature(int32_t vec_num, int32_t feat_num);
00258 
              // ---- sizes ----
00266         virtual int32_t get_vector_length(int32_t vec_num);
00267 
00274         virtual int32_t get_max_vector_length();
00275 
00277         virtual int32_t get_num_vectors() const;
00278 
              // Symbol counts are reported as floatmax_t rather than an integer type.
00285         floatmax_t get_num_symbols();
00286 
00294         floatmax_t get_max_num_symbols();
00295 
00296         // these functions are necessary to find out about a former conversion process
00297 
00302         floatmax_t get_original_num_symbols();
00303 
00308         int32_t get_order();
00309 
              // ---- symbol helpers ----
              // Presumably backed by `symbol_mask_table` — TODO confirm.
00317         ST get_masked_symbols(ST symbol, uint8_t mask);
00318 
00325         ST shift_offset(ST offset, int32_t amount);
00326 
00333         ST shift_symbol(ST symbol, int32_t amount);
00334 
              // ---- loading ----
00339         virtual void load(CFile* loader);
00340 
              // Defaults: remap_to_bin=true, ascii_alphabet=DNA, binary_alphabet=RAWDNA.
              // NOTE(review): takes non-const `char*`, while the FASTA/FASTQ
              // loaders below take `const char*`.
00351         void load_ascii_file(char* fname, bool remap_to_bin=true,
00352                 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
00353 
              // The bool results presumably signal success — TODO confirm.
00362         bool load_fasta_file(const char* fname, bool ignore_invalid=false);
00363 
00373         bool load_fastq_file(const char* fname,
00374                 bool ignore_invalid=false, bool bitremap_in_single_string=false);
00375 
              // NOTE(review): non-const `char*` parameter, as in load_ascii_file().
00383         bool load_from_directory(char* dirname);
00384 
              // ---- bulk set / append / get ----

              // Set all strings from an SGStringList<ST>.
00390         void set_features(SGStringList<ST> feats);
00391 
              // Set all strings from a raw SGString<ST> array with explicit
              // vector count and maximum string length.
00401         bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
00402                 int32_t p_max_string_length);
00403 
              // Append the strings of another CStringFeatures<ST>.
00412         bool append_features(CStringFeatures<ST>* sf);
00413 
              // Append from a raw SGString<ST> array.
00426         bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
00427                 int32_t p_max_string_length);
00428 
              // Three accessors for the string data: as an SGStringList<ST>, as a
              // raw array with output-by-reference sizes, and (further down) via
              // pointer output parameters.
00432         SGStringList<ST> get_features();
00433 
00442         virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
00443 
              // Same signature as get_features() above; presumably returns a copy
              // the caller owns — TODO confirm.
00452         virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
00453 
00461         virtual void get_features(SGString<ST>** dst, int32_t* num_str);
00462 
              // ---- saving / compression ----
00469         virtual void save(CFile* writer);
00470 
00479         virtual bool load_compressed(char* src, bool decompress);
00480 
00490         virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
00491 
              // ---- preprocessing ----
              // `force_preprocessing` defaults to false.
00497         virtual bool apply_preprocessor(bool force_preprocessing=false);
00498 
              // ---- deriving features from existing data ----
              // Both obtain_by_* functions return an int32_t (presumably the
              // number of resulting vectors — TODO confirm); `skip` defaults to 0.
00511         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
00512 
00523         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
00524                 int32_t skip=0);
00525 
              // Build this object from char-typed string features.
00539         bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
00540                 int32_t p_order, int32_t gap, bool rev);
00541 
              // Generic form of obtain_from_char() for an arbitrary source
              // element type CT.
00553         template <class CT>
00554             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
00555                     int32_t p_order, int32_t gap, bool rev);
00556 
              // `len` defaults to -1 (presumably "no specific length required" —
              // TODO confirm).
00566         bool have_same_length(int32_t len=-1);
00567 
              // ---- embedding helpers ----
00573         void embed_features(int32_t p_order);
00574 
00581         void compute_symbol_mask_table(int64_t max_val);
00582 
00589         void unembed_word(ST word, uint8_t* seq, int32_t len);
00590 
00596         ST embed_word(ST* seq, int32_t len);
00597 
              // Presumably recomputes `max_string_length` — TODO confirm.
00602         void determine_maximum_string_length();
00603 
              // Static helper returning an ST* built from `str`. Who frees the
              // result is not visible here — TODO confirm.
00611         static ST* get_zero_terminated_string_copy(SGString<ST> str);
00612 
              // Raw-pointer overload; argument order is (num, string, len), unlike
              // the SGVector overload above which takes (vector, num).
00621         virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
00622 
              // Outputs are returned through the `hist`, `rows` and `cols`
              // pointers; `normalize` defaults to true.
00627         virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
00628                 bool normalize=true);
00629 
              // Takes a histogram buffer with its dimensions and a vector count.
00634         virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
00635                 int32_t num_vec);
00636 
              // Returns a CFeatures* built from the given index vector.
00645         virtual CFeatures* copy_subset(SGVector<index_t> indices);
00646 
              // Returns the fixed name string "StringFeatures".
00648         virtual const char* get_name() const { return "StringFeatures"; }
00649 
00651         virtual void subset_changed_post();
00652 
00653     protected:
              // Virtual hook returning an ST* for vector `num`; `len` is an output
              // parameter.
00664         virtual ST* compute_feature_vector(int32_t num, int32_t& len);
00665 
00666     private:
              // Private initialisation helper (presumably shared by the
              // constructors — TODO confirm).
00667         void init();
00668 
00669     protected:
              // Alphabet associated with the strings.
00671         CAlphabet* alphabet;
00672 
              // Number of strings held.
00674         int32_t num_vectors;
00675 
              // Array of strings.
00677         SGString<ST>* features;
00678 
              // Single backing buffer and its length; presumably used when the
              // strings are windows into one long string (cf. obtain_by_*) —
              // TODO confirm.
00680         ST* single_string;
00681 
00683         int32_t length_of_single_string;
00684 
              // Length of the longest string.
00686         int32_t max_string_length;
00687 
              // Current and original symbol counts (see get_num_symbols() /
              // get_original_num_symbols()).
00689         floatmax_t num_symbols;
00690 
00692         floatmax_t original_num_symbols;
00693 
              // Order value (see get_order()).
00695         int32_t order;
00696 
              // Symbol mask lookup table and its length (see
              // compute_symbol_mask_table()).
00698         ST* symbol_mask_table;
00699 
00701         int32_t symbol_mask_table_len;
00702 
              // On-the-fly preprocessing flag.
00704         bool preprocess_on_get;
00705 
              // Cache of feature vectors.
00707         CCache<ST>* feature_cache;
00708 };
00709 }
00710 #endif // _CSTRINGFEATURES__H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation