// SHOGUN v3.2.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Written (W) 2011-2012 Heiko Strathmann 00010 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00011 */ 00012 00013 #ifndef _CSTRINGFEATURES__H__ 00014 #define _CSTRINGFEATURES__H__ 00015 00016 #include <shogun/lib/common.h> 00017 #include <shogun/lib/Cache.h> 00018 #include <shogun/lib/DynamicArray.h> 00019 #include <shogun/lib/Compressor.h> 00020 #include <shogun/io/File.h> 00021 00022 #include <shogun/features/Features.h> 00023 #include <shogun/features/Alphabet.h> 00024 00025 namespace shogun 00026 { 00027 class CAlphabet; 00028 template <class T> class CDynamicArray; 00029 class CFile; 00030 template <class T> class SGString; 00031 template <class T> class SGStringList; 00032 00033 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00034 struct SSKDoubleFeature 00035 { 00036 int feature1; 00037 int feature2; 00038 int group; 00039 }; 00040 00041 struct SSKTripleFeature 00042 { 00043 int feature1; 00044 int feature2; 00045 int feature3; 00046 int group; 00047 }; 00048 #endif 00049 00073 template <class ST> class CStringFeatures : public CFeatures 00074 { 00075 public: 00077 CStringFeatures(); 00078 00083 CStringFeatures(EAlphabet alpha); 00084 00090 CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha); 00091 00097 CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha); 00098 00103 CStringFeatures(CAlphabet* alpha); 00104 00109 CStringFeatures(const CStringFeatures& orig); 00110 00116 CStringFeatures(CFile* loader, EAlphabet alpha=DNA); 00117 00119 virtual ~CStringFeatures(); 00120 00126 virtual void cleanup(); 00127 00134 virtual void 
cleanup_feature_vector(int32_t num); 00135 00143 virtual void cleanup_feature_vectors(int32_t start, int32_t stop); 00144 00149 virtual EFeatureClass get_feature_class() const; 00150 00155 virtual EFeatureType get_feature_type() const; 00156 00161 CAlphabet* get_alphabet(); 00162 00167 virtual CFeatures* duplicate() const; 00168 00176 SGVector<ST> get_feature_vector(int32_t num); 00177 00185 void set_feature_vector(SGVector<ST> vector, int32_t num); 00186 00188 void enable_on_the_fly_preprocessing(); 00189 00193 void disable_on_the_fly_preprocessing(); 00194 00205 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree); 00206 00213 CStringFeatures<ST>* get_transposed(); 00214 00228 SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec); 00229 00238 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree); 00239 00247 void free_feature_vector(SGVector<ST> feat_vec, int32_t num); 00248 00257 virtual ST get_feature(int32_t vec_num, int32_t feat_num); 00258 00266 virtual int32_t get_vector_length(int32_t vec_num); 00267 00274 virtual int32_t get_max_vector_length(); 00275 00277 virtual int32_t get_num_vectors() const; 00278 00285 floatmax_t get_num_symbols(); 00286 00294 floatmax_t get_max_num_symbols(); 00295 00296 // these functions are necessary to find out about a former conversion process 00297 00302 floatmax_t get_original_num_symbols(); 00303 00308 int32_t get_order(); 00309 00317 ST get_masked_symbols(ST symbol, uint8_t mask); 00318 00325 ST shift_offset(ST offset, int32_t amount); 00326 00333 ST shift_symbol(ST symbol, int32_t amount); 00334 00339 virtual void load(CFile* loader); 00340 00351 void load_ascii_file(char* fname, bool remap_to_bin=true, 00352 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA); 00353 00362 bool load_fasta_file(const char* fname, bool ignore_invalid=false); 00363 00373 bool load_fastq_file(const char* fname, 00374 bool ignore_invalid=false, bool bitremap_in_single_string=false); 00375 00383 
bool load_from_directory(char* dirname); 00384 00390 void set_features(SGStringList<ST> feats); 00391 00401 bool set_features(SGString<ST>* p_features, int32_t p_num_vectors, 00402 int32_t p_max_string_length); 00403 00412 bool append_features(CStringFeatures<ST>* sf); 00413 00426 bool append_features(SGString<ST>* p_features, int32_t p_num_vectors, 00427 int32_t p_max_string_length); 00428 00432 SGStringList<ST> get_features(); 00433 00442 virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len); 00443 00452 virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len); 00453 00461 virtual void get_features(SGString<ST>** dst, int32_t* num_str); 00462 00469 virtual void save(CFile* writer); 00470 00479 virtual bool load_compressed(char* src, bool decompress); 00480 00490 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level); 00491 00497 virtual bool apply_preprocessor(bool force_preprocessing=false); 00498 00511 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0); 00512 00523 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, 00524 int32_t skip=0); 00525 00539 bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, 00540 int32_t p_order, int32_t gap, bool rev); 00541 00553 template <class CT> 00554 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, 00555 int32_t p_order, int32_t gap, bool rev); 00556 00566 bool have_same_length(int32_t len=-1); 00567 00573 void embed_features(int32_t p_order); 00574 00581 void compute_symbol_mask_table(int64_t max_val); 00582 00589 void unembed_word(ST word, uint8_t* seq, int32_t len); 00590 00596 ST embed_word(ST* seq, int32_t len); 00597 00602 void determine_maximum_string_length(); 00603 00611 static ST* get_zero_terminated_string_copy(SGString<ST> str); 00612 00621 virtual void set_feature_vector(int32_t num, ST* string, int32_t len); 00622 00627 virtual void 
get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, 00628 bool normalize=true); 00629 00634 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, 00635 int32_t num_vec); 00636 00645 virtual CFeatures* copy_subset(SGVector<index_t> indices); 00646 00648 virtual const char* get_name() const { return "StringFeatures"; } 00649 00651 virtual void subset_changed_post(); 00652 00653 protected: 00664 virtual ST* compute_feature_vector(int32_t num, int32_t& len); 00665 00666 private: 00667 void init(); 00668 00669 protected: 00671 CAlphabet* alphabet; 00672 00674 int32_t num_vectors; 00675 00677 SGString<ST>* features; 00678 00680 ST* single_string; 00681 00683 int32_t length_of_single_string; 00684 00686 int32_t max_string_length; 00687 00689 floatmax_t num_symbols; 00690 00692 floatmax_t original_num_symbols; 00693 00695 int32_t order; 00696 00698 ST* symbol_mask_table; 00699 00701 int32_t symbol_mask_table_len; 00702 00704 bool preprocess_on_get; 00705 00707 CCache<ST>* feature_cache; 00708 }; 00709 } 00710 #endif // _CSTRINGFEATURES__H__