SHOGUN  v3.2.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StringFeatures.cpp
Go to the documentation of this file.
00001 #include <shogun/features/StringFeatures.h>
00002 #include <shogun/preprocessor/Preprocessor.h>
00003 #include <shogun/preprocessor/StringPreprocessor.h>
00004 #include <shogun/io/MemoryMappedFile.h>
00005 #include <shogun/io/SGIO.h>
00006 #include <shogun/mathematics/Math.h>
00007 #include <shogun/base/Parameter.h>
00008 #include <shogun/lib/SGStringList.h>
00009 
00010 #include <sys/types.h>
00011 #include <sys/stat.h>
00012 #include <dirent.h>
00013 #include <stdio.h>
00014 #include <stdlib.h>
00015 #include <unistd.h>
00016 
00017 
00018 namespace shogun
00019 {
00020 
00021 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0)
00022 {
00023     init();
00024     alphabet=new CAlphabet();
00025 }
00026 
00027 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0)
00028 {
00029     init();
00030 
00031     alphabet=new CAlphabet(alpha);
00032     SG_REF(alphabet);
00033     num_symbols=alphabet->get_num_symbols();
00034     original_num_symbols=num_symbols;
00035 }
00036 
00037 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
00038 : CFeatures(0)
00039 {
00040     init();
00041 
00042     alphabet=new CAlphabet(alpha);
00043     SG_REF(alphabet);
00044     num_symbols=alphabet->get_num_symbols();
00045     original_num_symbols=num_symbols;
00046     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00047 }
00048 
00049 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
00050 : CFeatures(0)
00051 {
00052     init();
00053 
00054     alphabet=new CAlphabet(alpha);
00055     SG_REF(alphabet);
00056     num_symbols=alphabet->get_num_symbols();
00057     original_num_symbols=num_symbols;
00058     set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
00059 }
00060 
00061 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha)
00062 : CFeatures(0)
00063 {
00064     init();
00065 
00066     ASSERT(alpha)
00067     SG_REF(alpha);
00068     alphabet=alpha;
00069     num_symbols=alphabet->get_num_symbols();
00070     original_num_symbols=num_symbols;
00071 }
00072 
00073 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
00074 : CFeatures(orig), num_vectors(orig.num_vectors),
00075     single_string(orig.single_string),
00076     length_of_single_string(orig.length_of_single_string),
00077     max_string_length(orig.max_string_length),
00078     num_symbols(orig.num_symbols),
00079     original_num_symbols(orig.original_num_symbols),
00080     order(orig.order), preprocess_on_get(false),
00081     feature_cache(NULL)
00082 {
00083     init();
00084 
00085     ASSERT(orig.single_string == NULL) //not implemented
00086 
00087     alphabet=orig.alphabet;
00088     SG_REF(alphabet);
00089 
00090     if (orig.features)
00091     {
00092         features=SG_MALLOC(SGString<ST>, orig.num_vectors);
00093 
00094         for (int32_t i=0; i<num_vectors; i++)
00095         {
00096             features[i].string=SG_MALLOC(ST, orig.features[i].slen);
00097             features[i].slen=orig.features[i].slen;
00098             memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
00099         }
00100     }
00101 
00102     if (orig.symbol_mask_table)
00103     {
00104         symbol_mask_table=SG_MALLOC(ST, 256);
00105         symbol_mask_table_len=256;
00106 
00107         for (int32_t i=0; i<256; i++)
00108             symbol_mask_table[i]=orig.symbol_mask_table[i];
00109     }
00110 
00111     m_subset_stack=orig.m_subset_stack;
00112     SG_REF(m_subset_stack);
00113 }
00114 
00115 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
00116 : CFeatures(), num_vectors(0),
00117   features(NULL), single_string(NULL), length_of_single_string(0),
00118   max_string_length(0), order(0),
00119   preprocess_on_get(false), feature_cache(NULL)
00120 {
00121     init();
00122 
00123     alphabet=new CAlphabet(alpha);
00124     SG_REF(alphabet);
00125     num_symbols=alphabet->get_num_symbols();
00126     original_num_symbols=num_symbols;
00127     load(loader);
00128 }
00129 
00130 template<class ST> CStringFeatures<ST>::~CStringFeatures()
00131 {
00132     cleanup();
00133 
00134     SG_UNREF(alphabet);
00135 }
00136 
00137 template<class ST> void CStringFeatures<ST>::cleanup()
00138 {
00139     remove_all_subsets();
00140 
00141     if (single_string)
00142     {
00143         SG_FREE(single_string);
00144         single_string=NULL;
00145     }
00146     else
00147         cleanup_feature_vectors(0, num_vectors-1);
00148 
00149     /*
00150     if (single_string)
00151     {
00152         SG_FREE(single_string);
00153         single_string=NULL;
00154     }
00155     else
00156         cleanup_feature_vectors(0, num_vectors-1);
00157     */
00158 
00159     num_vectors=0;
00160     SG_FREE(features);
00161     SG_FREE(symbol_mask_table);
00162     features=NULL;
00163     symbol_mask_table=NULL;
00164 
00165     /* start with a fresh alphabet, but instead of emptying the histogram
00166      * create a new object (to leave the alphabet object alone if it is used
00167      * by others)
00168      */
00169     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00170     SG_UNREF(alphabet);
00171     alphabet=alpha;
00172     SG_REF(alphabet);
00173 }
00174 
00175 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
00176 {
00177     ASSERT(num<get_num_vectors())
00178 
00179     if (features)
00180     {
00181         int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00182         SG_FREE(features[real_num].string);
00183         features[real_num].string=NULL;
00184         features[real_num].slen=0;
00185 
00186         determine_maximum_string_length();
00187     }
00188 }
00189 
00190 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
00191 {
00192     if (features && get_num_vectors())
00193     {
00194         ASSERT(start<get_num_vectors())
00195         ASSERT(stop<get_num_vectors())
00196 
00197         for (int32_t i=start; i<=stop; i++)
00198         {
00199             int32_t real_num=m_subset_stack->subset_idx_conversion(i);
00200             SG_FREE(features[real_num].string);
00201             features[real_num].string=NULL;
00202             features[real_num].slen=0;
00203         }
00204         determine_maximum_string_length();
00205     }
00206 }
00207 
00208 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; }
00209 
00210 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; }
00211 
00212 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet()
00213 {
00214     SG_REF(alphabet);
00215     return alphabet;
00216 }
00217 
00218 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
00219 {
00220     return new CStringFeatures<ST>(*this);
00221 }
00222 
00223 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num)
00224 {
00225     ASSERT(features)
00226     if (num>=get_num_vectors())
00227     {
00228         SG_ERROR("Index out of bounds (number of strings %d, you "
00229                 "requested %d)\n", get_num_vectors(), num);
00230     }
00231 
00232     int32_t l;
00233     bool free_vec;
00234     ST* vec=get_feature_vector(num, l, free_vec);
00235     ST* dst=SG_MALLOC(ST, l);
00236     memcpy(dst, vec, l*sizeof(ST));
00237     free_feature_vector(vec, num, free_vec);
00238     return SGVector<ST>(dst, l, true);
00239 }
00240 
00241 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
00242 {
00243     ASSERT(features)
00244 
00245     if (m_subset_stack->has_subsets())
00246         SG_ERROR("A subset is set, cannot set feature vector\n")
00247 
00248     if (num>=num_vectors)
00249     {
00250         SG_ERROR("Index out of bounds (number of strings %d, you "
00251                 "requested %d)\n", num_vectors, num);
00252     }
00253 
00254     if (vector.vlen<=0)
00255         SG_ERROR("String has zero or negative length\n")
00256 
00257     cleanup_feature_vector(num);
00258     features[num].slen=vector.vlen;
00259     features[num].string=SG_MALLOC(ST, vector.vlen);
00260     memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
00261 
00262     determine_maximum_string_length();
00263 }
00264 
00265 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing()
00266 {
00267     preprocess_on_get=true;
00268 }
00269 
00270 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing()
00271 {
00272     preprocess_on_get=false;
00273 }
00274 
00275 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00276 {
00277     ASSERT(features)
00278     if (num>=get_num_vectors())
00279         SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors())
00280 
00281     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00282 
00283     if (!preprocess_on_get)
00284     {
00285         dofree=false;
00286         len=features[real_num].slen;
00287         return features[real_num].string;
00288     }
00289     else
00290     {
00291         SG_DEBUG("computing feature vector!\n")
00292         ST* feat=compute_feature_vector(num, len);
00293         dofree=true;
00294 
00295         if (get_num_preprocessors())
00296         {
00297             ST* tmp_feat_before=feat;
00298 
00299             for (int32_t i=0; i<get_num_preprocessors(); i++)
00300             {
00301                 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
00302                 feat=p->apply_to_string(tmp_feat_before, len);
00303                 SG_UNREF(p);
00304                 SG_FREE(tmp_feat_before);
00305                 tmp_feat_before=feat;
00306             }
00307         }
00308         // TODO: implement caching
00309         return feat;
00310     }
00311 }
00312 
00313 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed()
00314 {
00315     int32_t num_feat;
00316     int32_t num_vec;
00317     SGString<ST>* s=get_transposed(num_feat, num_vec);
00318     SGStringList<ST> string_list;
00319     string_list.strings = s;
00320     string_list.num_strings = num_vec;
00321     string_list.max_string_length = num_feat;
00322 
00323     return new CStringFeatures<ST>(string_list, alphabet);
00324 }
00325 
00326 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
00327 {
00328     num_feat=get_num_vectors();
00329     num_vec=get_max_vector_length();
00330     ASSERT(have_same_length())
00331 
00332     SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
00333             int64_t(num_feat)*num_vec);
00334 
00335     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
00336 
00337     for (int32_t i=0; i<num_vec; i++)
00338     {
00339         sf[i].string=SG_MALLOC(ST, num_feat);
00340         sf[i].slen=num_feat;
00341     }
00342 
00343     for (int32_t i=0; i<num_feat; i++)
00344     {
00345         int32_t len=0;
00346         bool free_vec=false;
00347         ST* vec=get_feature_vector(i, len, free_vec);
00348 
00349         for (int32_t j=0; j<num_vec; j++)
00350             sf[j].string[i]=vec[j];
00351 
00352         free_feature_vector(vec, i, free_vec);
00353     }
00354     return sf;
00355 }
00356 
00357 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00358 {
00359     if (num>=get_num_vectors())
00360     {
00361         SG_ERROR(
00362             "Trying to access string[%d] but num_str=%d\n", num,
00363             get_num_vectors());
00364     }
00365 
00366     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00367 
00368     if (feature_cache)
00369         feature_cache->unlock_entry(real_num);
00370 
00371     if (dofree)
00372         SG_FREE(feat_vec);
00373 }
00374 
00375 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
00376 {
00377     if (num>=get_num_vectors())
00378     {
00379         SG_ERROR(
00380             "Trying to access string[%d] but num_str=%d\n", num,
00381             get_num_vectors());
00382     }
00383 
00384     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
00385 
00386     if (feature_cache)
00387         feature_cache->unlock_entry(real_num);
00388 }
00389 
00390 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
00391 {
00392     ASSERT(vec_num<get_num_vectors())
00393 
00394     int32_t len;
00395     bool free_vec;
00396     ST* vec=get_feature_vector(vec_num, len, free_vec);
00397     ASSERT(feat_num<len)
00398     ST result=vec[feat_num];
00399     free_feature_vector(vec, vec_num, free_vec);
00400 
00401     return result;
00402 }
00403 
00404 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
00405 {
00406     ASSERT(vec_num<get_num_vectors())
00407 
00408     int32_t len;
00409     bool free_vec;
00410     ST* vec=get_feature_vector(vec_num, len, free_vec);
00411     free_feature_vector(vec, vec_num, free_vec);
00412     return len;
00413 }
00414 
00415 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
00416 {
00417     return max_string_length;
00418 }
00419 
00420 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
00421 {
00422     return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
00423 }
00424 
00425 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
00426 
00427 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00428 
00429 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
00430 
00431 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
00432 
00433 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
00434 {
00435     ASSERT(symbol_mask_table)
00436     return symbol_mask_table[mask] & symbol;
00437 }
00438 
00439 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
00440 {
00441     ASSERT(alphabet)
00442     return (offset << (amount*alphabet->get_num_bits()));
00443 }
00444 
00445 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
00446 {
00447     ASSERT(alphabet)
00448     return (symbol >> (amount*alphabet->get_num_bits()));
00449 }
00450 
00451 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
00452         EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
00453 {
00454     remove_all_subsets();
00455 
00456     size_t blocksize=1024*1024;
00457     size_t required_blocksize=0;
00458     uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
00459     uint8_t* overflow=NULL;
00460     int32_t overflow_len=0;
00461 
00462     cleanup();
00463 
00464     CAlphabet* alpha=new CAlphabet(ascii_alphabet);
00465     CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
00466 
00467     FILE* f=fopen(fname, "ro");
00468 
00469     if (f)
00470     {
00471         num_vectors=0;
00472         max_string_length=0;
00473 
00474         SG_INFO("counting line numbers in file %s\n", fname)
00475         size_t block_offs=0;
00476         size_t old_block_offs=0;
00477         fseek(f, 0, SEEK_END);
00478         size_t fsize=ftell(f);
00479         rewind(f);
00480 
00481         if (blocksize>fsize)
00482             blocksize=fsize;
00483 
00484         SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize)
00485 
00486         size_t sz=blocksize;
00487         while (sz == blocksize)
00488         {
00489             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00490             for (size_t i=0; i<sz; i++)
00491             {
00492                 block_offs++;
00493                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00494                 {
00495                     num_vectors++;
00496                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00497                     old_block_offs=block_offs;
00498                 }
00499             }
00500             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t")
00501         }
00502 
00503         SG_INFO("found %d strings\n", num_vectors)
00504         SG_FREE(dummy);
00505         blocksize=required_blocksize;
00506         dummy=SG_MALLOC(uint8_t, blocksize);
00507         overflow=SG_MALLOC(uint8_t, blocksize);
00508         features=SG_MALLOC(SGString<ST>, num_vectors);
00509 
00510         rewind(f);
00511         sz=blocksize;
00512         int32_t lines=0;
00513         while (sz == blocksize)
00514         {
00515             sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00516 
00517             size_t old_sz=0;
00518             for (size_t i=0; i<sz; i++)
00519             {
00520                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00521                 {
00522                     int32_t len=i-old_sz;
00523                     //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz)
00524                     max_string_length=CMath::max(max_string_length, len+overflow_len);
00525 
00526                     features[lines].slen=len;
00527                     features[lines].string=SG_MALLOC(ST, len);
00528 
00529                     if (remap_to_bin)
00530                     {
00531                         for (int32_t j=0; j<overflow_len; j++)
00532                             features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00533                         for (int32_t j=0; j<len; j++)
00534                             features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00535                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00536                         alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
00537                     }
00538                     else
00539                     {
00540                         for (int32_t j=0; j<overflow_len; j++)
00541                             features[lines].string[j]=overflow[j];
00542                         for (int32_t j=0; j<len; j++)
00543                             features[lines].string[j+overflow_len]=dummy[old_sz+j];
00544                         alpha->add_string_to_histogram(&dummy[old_sz], len);
00545                         alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
00546                     }
00547 
00548                     // clear overflow
00549                     overflow_len=0;
00550 
00551                     //CMath::display_vector(features[lines].string, len);
00552                     old_sz=i+1;
00553                     lines++;
00554                     SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t")
00555                 }
00556             }
00557             for (size_t i=old_sz; i<sz; i++)
00558                 overflow[i-old_sz]=dummy[i];
00559 
00560             overflow_len=sz-old_sz;
00561         }
00562 
00563         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00564         {
00565             SG_INFO("file successfully read\n")
00566             SG_INFO("max_string_length=%d\n", max_string_length)
00567             SG_INFO("num_strings=%d\n", num_vectors)
00568         }
00569         fclose(f);
00570     }
00571 
00572     SG_FREE(dummy);
00573 
00574     SG_UNREF(alphabet);
00575 
00576     if (remap_to_bin)
00577         alphabet=alpha_bin;
00578     else
00579         alphabet=alpha;
00580     SG_REF(alphabet);
00581     num_symbols=alphabet->get_num_symbols();
00582 }
00583 
00584 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
00585 {
00586     remove_all_subsets();
00587 
00588     int32_t i=0;
00589     uint64_t len=0;
00590     uint64_t offs=0;
00591     int32_t num=0;
00592     int32_t max_len=0;
00593 
00594     CMemoryMappedFile<char> f(fname);
00595 
00596     while (true)
00597     {
00598         char* s=f.get_line(len, offs);
00599         if (!s)
00600             break;
00601 
00602         if (len>0 && s[0]=='>')
00603             num++;
00604     }
00605 
00606     if (num==0)
00607         SG_ERROR("No fasta hunks (lines starting with '>') found\n")
00608 
00609     cleanup();
00610     SG_UNREF(alphabet);
00611     alphabet=new CAlphabet(DNA);
00612     num_symbols=alphabet->get_num_symbols();
00613 
00614     SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
00615     offs=0;
00616 
00617     for (i=0;i<num; i++)
00618     {
00619         uint64_t id_len=0;
00620         char* id=f.get_line(id_len, offs);
00621 
00622         char* fasta=f.get_line(len, offs);
00623         char* s=fasta;
00624         int32_t fasta_len=0;
00625         int32_t spanned_lines=0;
00626 
00627         while (true)
00628         {
00629             if (!s || len==0)
00630                 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len)
00631 
00632             if (s[0]=='>' || offs==f.get_size())
00633             {
00634                 offs-=len+1; // seek to beginning
00635                 if (offs==f.get_size())
00636                 {
00637                     SG_DEBUG("at EOF\n")
00638                     fasta_len+=len;
00639                 }
00640 
00641                 len=fasta_len-spanned_lines;
00642                 strings[i].string=SG_MALLOC(ST, len);
00643                 strings[i].slen=len;
00644 
00645                 ST* str=strings[i].string;
00646                 int32_t idx=0;
00647                 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines)
00648 
00649                 for (int32_t j=0; j<fasta_len; j++)
00650                 {
00651                     if (fasta[j]=='\n')
00652                         continue;
00653 
00654                     ST c=(ST) fasta[j];
00655 
00656                     if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00657                         c=(ST) 'A';
00658 
00659                     if (uint64_t(idx)>=len)
00660                         SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
00661                     str[idx++]=c;
00662                 }
00663                 max_len=CMath::max(max_len, strings[i].slen);
00664 
00665 
00666                 break;
00667             }
00668 
00669             spanned_lines++;
00670             fasta_len+=len+1; // including '\n'
00671             s=f.get_line(len, offs);
00672         }
00673     }
00674     return set_features(strings, num, max_len);
00675 }
00676 
00677 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
00678         bool ignore_invalid, bool bitremap_in_single_string)
00679 {
00680     remove_all_subsets();
00681 
00682     CMemoryMappedFile<char> f(fname);
00683 
00684     int32_t i=0;
00685     uint64_t len=0;
00686     uint64_t offs=0;
00687 
00688     int32_t num=f.get_num_lines();
00689     int32_t max_len=0;
00690 
00691     if (num%4)
00692         SG_ERROR("Number of lines must be divisible by 4 in fastq files\n")
00693     num/=4;
00694 
00695     cleanup();
00696     SG_UNREF(alphabet);
00697     alphabet=new CAlphabet(DNA);
00698 
00699     SGString<ST>* strings;
00700 
00701     ST* str=NULL;
00702     if (bitremap_in_single_string)
00703     {
00704         strings=SG_MALLOC(SGString<ST>, 1);
00705         strings[0].string=SG_MALLOC(ST, num);
00706         strings[0].slen=num;
00707         f.get_line(len, offs);
00708         f.get_line(len, offs);
00709         order=len;
00710         max_len=num;
00711         offs=0;
00712         original_num_symbols=alphabet->get_num_symbols();
00713         str=SG_MALLOC(ST, len);
00714     }
00715     else
00716         strings=SG_MALLOC(SGString<ST>, num);
00717 
00718     for (i=0;i<num; i++)
00719     {
00720         if (!f.get_line(len, offs))
00721             SG_ERROR("Error reading 'read' identifier in line %d", 4*i)
00722 
00723         char* s=f.get_line(len, offs);
00724         if (!s || len==0)
00725             SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len)
00726 
00727         if (bitremap_in_single_string)
00728         {
00729             if (len!=(uint64_t) order)
00730                 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
00731             for (int32_t j=0; j<order; j++)
00732                 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00733 
00734             strings[0].string[i]=embed_word(str, order);
00735         }
00736         else
00737         {
00738             strings[i].string=SG_MALLOC(ST, len);
00739             strings[i].slen=len;
00740             str=strings[i].string;
00741 
00742             if (ignore_invalid)
00743             {
00744                 for (uint64_t j=0; j<len; j++)
00745                 {
00746                     if (alphabet->is_valid((uint8_t) s[j]))
00747                         str[j]= (ST) s[j];
00748                     else
00749                         str[j]= (ST) 'A';
00750                 }
00751             }
00752             else
00753             {
00754                 for (uint64_t j=0; j<len; j++)
00755                     str[j]= (ST) s[j];
00756             }
00757             max_len=CMath::max(max_len, (int32_t) len);
00758         }
00759 
00760 
00761         if (!f.get_line(len, offs))
00762             SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2)
00763 
00764         if (!f.get_line(len, offs))
00765             SG_ERROR("Error reading 'read' quality in line %d", 4*i+3)
00766     }
00767 
00768     if (bitremap_in_single_string)
00769         num=1;
00770 
00771     num_vectors=num;
00772     max_string_length=max_len;
00773     features=strings;
00774 
00775     return true;
00776 }
00777 
00778 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
00779 {
00780     remove_all_subsets();
00781 
00782     struct dirent **namelist;
00783     int32_t n;
00784 
00785     SGIO::set_dirname(dirname);
00786 
00787     SG_DEBUG("dirname '%s'\n", dirname)
00788 
00789     n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
00790     if (n <= 0)
00791     {
00792         SG_ERROR("error calling scandir - no files found\n")
00793         return false;
00794     }
00795     else
00796     {
00797         SGString<ST>* strings=NULL;
00798 
00799         int32_t num=0;
00800         int32_t max_len=-1;
00801 
00802         //usually n==num_vec, but it might not in race conditions
00803         //(file perms modified, file erased)
00804         strings=SG_MALLOC(SGString<ST>, n);
00805 
00806         for (int32_t i=0; i<n; i++)
00807         {
00808             char* fname=SGIO::concat_filename(namelist[i]->d_name);
00809 
00810             struct stat s;
00811             off_t filesize=0;
00812 
00813             if (!stat(fname, &s) && s.st_size>0)
00814             {
00815                 filesize=s.st_size/sizeof(ST);
00816 
00817                 FILE* f=fopen(fname, "ro");
00818                 if (f)
00819                 {
00820                     ST* str=SG_MALLOC(ST, filesize);
00821                     SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize)
00822                     if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
00823                         SG_ERROR("failed to read file\n")
00824                     strings[num].string=str;
00825                     strings[num].slen=filesize;
00826                     max_len=CMath::max(max_len, strings[num].slen);
00827 
00828                     num++;
00829                     fclose(f);
00830                 }
00831             }
00832             else
00833                 SG_ERROR("empty or non readable file \'%s\'\n", fname)
00834 
00835             SG_FREE(namelist[i]);
00836         }
00837         SG_FREE(namelist);
00838 
00839         if (num>0 && strings)
00840         {
00841             set_features(strings, num, max_len);
00842             return true;
00843         }
00844     }
00845     return false;
00846 }
00847 
00848 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats)
00849 {
00850     set_features(feats.strings, feats.num_strings, feats.max_string_length);
00851 }
00852 
00853 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00854 {
00855     if (m_subset_stack->has_subsets())
00856         SG_ERROR("Cannot call set_features() with subset.\n")
00857 
00858     if (p_features)
00859     {
00860         CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00861 
00862         //compute histogram for char/byte
00863         for (int32_t i=0; i<p_num_vectors; i++)
00864             alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00865 
00866         SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
00867         SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
00868 
00869         if (alpha->check_alphabet_size() && alpha->check_alphabet())
00870         {
00871             cleanup();
00872             SG_UNREF(alphabet);
00873 
00874             alphabet=alpha;
00875             SG_REF(alphabet);
00876 
00877             // TODO remove copying
00878             features = SG_MALLOC(SGString<ST>,p_num_vectors);
00879             memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors);
00880             num_vectors = p_num_vectors;
00881             max_string_length = p_max_string_length;
00882 
00883             return true;
00884         }
00885         else
00886             SG_UNREF(alpha);
00887     }
00888 
00889     return false;
00890 }
00891 
00892 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf)
00893 {
00894     ASSERT(sf)
00895 
00896     if (m_subset_stack->has_subsets())
00897         SG_ERROR("Cannot call set_features() with subset.\n")
00898 
00899     SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
00900 
00901     index_t sf_num_str=sf->get_num_vectors();
00902     for (int32_t i=0; i<sf_num_str; i++)
00903     {
00904         int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i);
00905         int32_t length=sf->features[real_i].slen;
00906         new_features[i].string=SG_MALLOC(ST, length);
00907         memcpy(new_features[i].string, sf->features[real_i].string, length);
00908         new_features[i].slen=length;
00909     }
00910     return append_features(new_features, sf_num_str,
00911             sf->max_string_length);
00912 }
00913 
00914 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00915 {
00916     if (m_subset_stack->has_subsets())
00917         SG_ERROR("Cannot call set_features() with subset.\n")
00918 
00919     if (!features)
00920         return set_features(p_features, p_num_vectors, p_max_string_length);
00921 
00922     CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00923 
00924     //compute histogram for char/byte
00925     for (int32_t i=0; i<p_num_vectors; i++)
00926         alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00927 
00928     SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram())
00929     SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram())
00930 
00931     if (alpha->check_alphabet_size() && alpha->check_alphabet())
00932     {
00933         SG_UNREF(alpha);
00934         for (int32_t i=0; i<p_num_vectors; i++)
00935             alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
00936 
00937         int32_t old_num_vectors=num_vectors;
00938         num_vectors=old_num_vectors+p_num_vectors;
00939         SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
00940 
00941         for (int32_t i=0; i<num_vectors; i++)
00942         {
00943             if (i<old_num_vectors)
00944             {
00945                 new_features[i].string=features[i].string;
00946                 new_features[i].slen=features[i].slen;
00947             }
00948             else
00949             {
00950                 new_features[i].string=p_features[i-old_num_vectors].string;
00951                 new_features[i].slen=p_features[i-old_num_vectors].slen;
00952             }
00953         }
00954         SG_FREE(features);
00955         SG_FREE(p_features); // free now obsolete features
00956 
00957         this->features=new_features;
00958         max_string_length=CMath::max(max_string_length, p_max_string_length);
00959 
00960         return true;
00961     }
00962     SG_UNREF(alpha);
00963 
00964     return false;
00965 }
00966 
00967 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features()
00968 {
00969     SGStringList<ST> sl(NULL,0,0,false);
00970 
00971     sl.strings=get_features(sl.num_strings, sl.max_string_length);
00972     return sl;
00973 }
00974 
00975 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
00976 {
00977     if (m_subset_stack->has_subsets())
00978         SG_ERROR("get features() is not possible on subset")
00979 
00980     num_str=num_vectors;
00981     max_str_len=max_string_length;
00982     return features;
00983 }
00984 
00985 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
00986 {
00987     ASSERT(num_vectors>0)
00988 
00989     num_str=get_num_vectors();
00990     max_str_len=max_string_length;
00991     SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
00992 
00993     for (int32_t i=0; i<num_str; i++)
00994     {
00995         int32_t len;
00996         bool free_vec;
00997         ST* vec=get_feature_vector(i, len, free_vec);
00998         new_feat[i].string=SG_MALLOC(ST, len);
00999         new_feat[i].slen=len;
01000         memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01001         free_feature_vector(vec, i, free_vec);
01002     }
01003 
01004     return new_feat;
01005 }
01006 
01007 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
01008 {
01009     int32_t num_vec;
01010     int32_t max_str_len;
01011     *dst=copy_features(num_vec, max_str_len);
01012     *num_str=num_vec;
01013 }
01014 
01015 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
01016 {
01017     remove_all_subsets();
01018 
01019     FILE* file=NULL;
01020 
01021     if (!(file=fopen(src, "r")))
01022         return false;
01023     cleanup();
01024 
01025     // header shogun v0
01026     char id[4];
01027     if (fread(&id[0], sizeof(char), 1, file)!=1)
01028         SG_ERROR("failed to read header")
01029     ASSERT(id[0]=='S')
01030     if (fread(&id[1], sizeof(char), 1, file)!=1)
01031         SG_ERROR("failed to read header")
01032     ASSERT(id[1]=='G')
01033     if (fread(&id[2], sizeof(char), 1, file)!=1)
01034         SG_ERROR("failed to read header")
01035     ASSERT(id[2]=='V')
01036     if (fread(&id[3], sizeof(char), 1, file)!=1)
01037         SG_ERROR("failed to read header")
01038     ASSERT(id[3]=='0')
01039 
01040     //compression type
01041     uint8_t c;
01042     if (fread(&c, sizeof(uint8_t), 1, file)!=1)
01043         SG_ERROR("failed to read compression type")
01044     CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01045     //alphabet
01046     uint8_t a;
01047     delete alphabet;
01048     if (fread(&a, sizeof(uint8_t), 1, file)!=1)
01049         SG_ERROR("failed to read compression alphabet")
01050     alphabet=new CAlphabet((EAlphabet) a);
01051     // number of vectors
01052     if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
01053         SG_ERROR("failed to read compression number of vectors")
01054     ASSERT(num_vectors>0)
01055     // maximum string length
01056     if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
01057         SG_ERROR("failed to read maximum string length")
01058     ASSERT(max_string_length>0)
01059 
01060     features=SG_MALLOC(SGString<ST>, num_vectors);
01061 
01062     // vectors
01063     for (int32_t i=0; i<num_vectors; i++)
01064     {
01065         // vector len compressed
01066         int32_t len_compressed;
01067         if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
01068             SG_ERROR("failed to read vector length compressed")
01069         // vector len uncompressed
01070         int32_t len_uncompressed;
01071         if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
01072             SG_ERROR("failed to read vector length uncompressed")
01073 
01074         // vector raw data
01075         if (decompress)
01076         {
01077             features[i].string=SG_MALLOC(ST, len_uncompressed);
01078             features[i].slen=len_uncompressed;
01079             uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
01080             if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
01081                 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed)
01082             uint64_t uncompressed_size=len_uncompressed;
01083             uncompressed_size*=sizeof(ST);
01084             compressor->decompress(compressed, len_compressed,
01085                     (uint8_t*) features[i].string, uncompressed_size);
01086             SG_FREE(compressed);
01087             ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST))
01088         }
01089         else
01090         {
01091             int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01092             features[i].string=SG_MALLOC(ST, len_compressed+offs);
01093             features[i].slen=len_compressed+offs;
01094             int32_t* feat32ptr=((int32_t*) (features[i].string));
01095             memset(features[i].string, 0, offs*sizeof(ST));
01096             feat32ptr[0]=(int32_t) len_compressed;
01097             feat32ptr[1]=(int32_t) len_uncompressed;
01098             uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01099             if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
01100                 SG_ERROR("failed to read uncompressed data")
01101         }
01102     }
01103 
01104     delete compressor;
01105     fclose(file);
01106 
01107     return false;
01108 }
01109 
01110 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01111 {
01112     if (m_subset_stack->has_subsets())
01113         SG_ERROR("save_compressed() is not possible on subset")
01114 
01115     FILE* file=NULL;
01116 
01117     if (!(file=fopen(dest, "wb")))
01118         return false;
01119 
01120     CCompressor* compressor= new CCompressor(compression);
01121 
01122     // header shogun v0
01123     const char* id="SGV0";
01124     fwrite(&id[0], sizeof(char), 1, file);
01125     fwrite(&id[1], sizeof(char), 1, file);
01126     fwrite(&id[2], sizeof(char), 1, file);
01127     fwrite(&id[3], sizeof(char), 1, file);
01128 
01129     //compression type
01130     uint8_t c=(uint8_t) compression;
01131     fwrite(&c, sizeof(uint8_t), 1, file);
01132     //alphabet
01133     uint8_t a=(uint8_t) alphabet->get_alphabet();
01134     fwrite(&a, sizeof(uint8_t), 1, file);
01135     // number of vectors
01136     fwrite(&num_vectors, sizeof(int32_t), 1, file);
01137     // maximum string length
01138     fwrite(&max_string_length, sizeof(int32_t), 1, file);
01139 
01140     // vectors
01141     for (int32_t i=0; i<num_vectors; i++)
01142     {
01143         int32_t len=-1;
01144         bool vfree;
01145         ST* vec=get_feature_vector(i, len, vfree);
01146 
01147         uint8_t* compressed=NULL;
01148         uint64_t compressed_size=0;
01149 
01150         compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01151                 compressed, compressed_size, level);
01152 
01153         int32_t len_compressed=(int32_t) compressed_size;
01154         // vector len compressed in bytes
01155         fwrite(&len_compressed, sizeof(int32_t), 1, file);
01156         // vector len uncompressed in number of elements of type ST
01157         fwrite(&len, sizeof(int32_t), 1, file);
01158         // vector raw data
01159         fwrite(compressed, compressed_size, 1, file);
01160         SG_FREE(compressed);
01161 
01162         free_feature_vector(vec, i, vfree);
01163     }
01164 
01165     delete compressor;
01166     fclose(file);
01167     return true;
01168 }
01169 
01170 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
01171 {
01172     SG_DEBUG("force: %d\n", force_preprocessing)
01173 
01174     for (int32_t i=0; i<get_num_preprocessors(); i++)
01175     {
01176         if ( (!is_preprocessed(i) || force_preprocessing) )
01177         {
01178             set_preprocessed(i);
01179             CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
01180             SG_INFO("preprocessing using preproc %s\n", p->get_name())
01181 
01182             if (!p->apply_to_string_features(this))
01183             {
01184                 SG_UNREF(p);
01185                 return false;
01186             }
01187             else
01188                 SG_UNREF(p);
01189         }
01190     }
01191     return true;
01192 }
01193 
01194 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
01195 {
01196     if (m_subset_stack->has_subsets())
01197         SG_NOTIMPLEMENTED
01198 
01199     ASSERT(step_size>0)
01200     ASSERT(window_size>0)
01201     ASSERT(num_vectors==1 || single_string)
01202     ASSERT(max_string_length>=window_size ||
01203             (single_string && length_of_single_string>=window_size));
01204 
01205     //in case we are dealing with a single remapped string
01206     //allow remapping
01207     if (single_string)
01208         num_vectors= (length_of_single_string-window_size)/step_size + 1;
01209     else if (num_vectors==1)
01210     {
01211         num_vectors= (max_string_length-window_size)/step_size + 1;
01212         length_of_single_string=max_string_length;
01213     }
01214 
01215     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01216     int32_t offs=0;
01217     for (int32_t i=0; i<num_vectors; i++)
01218     {
01219         f[i].string=&features[0].string[offs+skip];
01220         f[i].slen=window_size-skip;
01221         offs+=step_size;
01222     }
01223     single_string=features[0].string;
01224     SG_FREE(features);
01225     features=f;
01226     max_string_length=window_size-skip;
01227 
01228     return num_vectors;
01229 }
01230 
01231 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
01232         int32_t skip)
01233 {
01234     if (m_subset_stack->has_subsets())
01235         SG_NOTIMPLEMENTED
01236 
01237     ASSERT(positions)
01238     ASSERT(window_size>0)
01239     ASSERT(num_vectors==1 || single_string)
01240     ASSERT(max_string_length>=window_size ||
01241             (single_string && length_of_single_string>=window_size));
01242 
01243     num_vectors= positions->get_num_elements();
01244     ASSERT(num_vectors>0)
01245 
01246     int32_t len;
01247 
01248     //in case we are dealing with a single remapped string
01249     //allow remapping
01250     if (single_string)
01251         len=length_of_single_string;
01252     else
01253     {
01254         single_string=features[0].string;
01255         len=max_string_length;
01256         length_of_single_string=max_string_length;
01257     }
01258 
01259     SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
01260     for (int32_t i=0; i<num_vectors; i++)
01261     {
01262         int32_t p=positions->get_element(i);
01263 
01264         if (p>=0 && p<=len-window_size)
01265         {
01266             f[i].string=&features[0].string[p+skip];
01267             f[i].slen=window_size-skip;
01268         }
01269         else
01270         {
01271             num_vectors=1;
01272             max_string_length=len;
01273             features[0].slen=len;
01274             single_string=NULL;
01275             SG_FREE(f);
01276             SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01277                     window_size, i, p, len);
01278             return -1;
01279         }
01280     }
01281 
01282     SG_FREE(features);
01283     features=f;
01284     max_string_length=window_size-skip;
01285 
01286     return num_vectors;
01287 }
01288 
01289 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01290 {
01291     return obtain_from_char_features(sf, start, p_order, gap, rev);
01292 }
01293 
01294 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
01295 {
01296     if (len!=-1)
01297     {
01298         if (len!=max_string_length)
01299             return false;
01300     }
01301     len=max_string_length;
01302 
01303     index_t num_str=get_num_vectors();
01304     for (int32_t i=0; i<num_str; i++)
01305     {
01306         if (get_vector_length(i)!=len)
01307             return false;
01308     }
01309 
01310     return true;
01311 }
01312 
01313 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
01314 {
01315     if (m_subset_stack->has_subsets())
01316         SG_NOTIMPLEMENTED
01317 
01318     ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
01319 
01320     order=p_order;
01321     original_num_symbols=alphabet->get_num_symbols();
01322     int32_t max_val=alphabet->get_num_bits();
01323 
01324     if (p_order>1)
01325         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01326     else
01327         num_symbols=original_num_symbols;
01328 
01329     SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
01330 
01331     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01332         SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
01333 
01334     ST mask=0;
01335     for (int32_t i=0; i<p_order*max_val; i++)
01336         mask= (mask<<1) | ((ST) 1);
01337 
01338     for (int32_t i=0; i<num_vectors; i++)
01339     {
01340         int32_t len=features[i].slen;
01341 
01342         if (len < p_order)
01343             SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order)
01344 
01345         ST* str=features[i].string;
01346 
01347         // convert first word
01348         for (int32_t j=0; j<p_order; j++)
01349             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01350         str[0]=embed_word(&str[0], p_order);
01351 
01352         // convert the rest
01353         int32_t idx=0;
01354         for (int32_t j=p_order; j<len; j++)
01355         {
01356             str[j]=(ST) alphabet->remap_to_bin(str[j]);
01357             str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01358             idx++;
01359         }
01360 
01361         features[i].slen=len-p_order+1;
01362     }
01363 
01364     compute_symbol_mask_table(max_val);
01365 }
01366 
01367 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
01368 {
01369     if (m_subset_stack->has_subsets())
01370         SG_NOTIMPLEMENTED
01371 
01372     SG_FREE(symbol_mask_table);
01373     symbol_mask_table=SG_MALLOC(ST, 256);
01374     symbol_mask_table_len=256;
01375 
01376     uint64_t mask=0;
01377     for (int32_t i=0; i< (int64_t) max_val; i++)
01378         mask=(mask<<1) | 1;
01379 
01380     for (int32_t i=0; i<256; i++)
01381     {
01382         uint8_t bits=(uint8_t) i;
01383         symbol_mask_table[i]=0;
01384 
01385         for (int32_t j=0; j<8; j++)
01386         {
01387             if (bits & 1)
01388                 symbol_mask_table[i]|=mask<<(max_val*j);
01389 
01390             bits>>=1;
01391         }
01392     }
01393 }
01394 
01395 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
01396 {
01397     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01398 
01399     ST mask=0;
01400     for (uint32_t i=0; i<nbits; i++)
01401         mask=(mask<<1) | (ST) 1;
01402 
01403     for (int32_t i=0; i<len; i++)
01404     {
01405         ST w=(word & mask);
01406         seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01407         word>>=nbits;
01408     }
01409 }
01410 
01411 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
01412 {
01413     ST value=(ST) 0;
01414     uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01415     for (int32_t i=0; i<len; i++)
01416     {
01417         value<<=nbits;
01418         value|=seq[i];
01419     }
01420 
01421     return value;
01422 }
01423 
01424 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length()
01425 {
01426     max_string_length=0;
01427     index_t num_str=get_num_vectors();
01428 
01429     for (int32_t i=0; i<num_str; i++)
01430     {
01431         max_string_length=CMath::max(max_string_length,
01432             features[m_subset_stack->subset_idx_conversion(i)].slen);
01433     }
01434 }
01435 
01436 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str)
01437 {
01438     int32_t l=str.slen;
01439     ST* s=SG_MALLOC(ST, l+1);
01440     memcpy(s, str.string, sizeof(ST)*l);
01441     s[l]='\0';
01442     return s;
01443 }
01444 
01445 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
01446 {
01447     ASSERT(features)
01448     ASSERT(num<get_num_vectors())
01449 
01450     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01451 
01452 
01453     features[real_num].slen=len ;
01454     features[real_num].string=string ;
01455 
01456     max_string_length=CMath::max(len, max_string_length);
01457 }
01458 
01459 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
01460 {
01461     int32_t nsym=get_num_symbols();
01462     int32_t slen=get_max_vector_length();
01463     int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
01464     float64_t* h= SG_MALLOC(float64_t, sz);
01465     memset(h, 0, sz);
01466 
01467     float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
01468     memset(h_normalizer, 0, slen*sizeof(float64_t));
01469     int32_t num_str=get_num_vectors();
01470     for (int32_t i=0; i<num_str; i++)
01471     {
01472         int32_t len;
01473         bool free_vec;
01474         ST* vec=get_feature_vector(i, len, free_vec);
01475         for (int32_t j=0; j<len; j++)
01476         {
01477             h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
01478             h_normalizer[j]++;
01479         }
01480         free_feature_vector(vec, i, free_vec);
01481     }
01482 
01483     if (normalize)
01484     {
01485         for (int32_t i=0; i<slen; i++)
01486         {
01487             for (int32_t j=0; j<nsym; j++)
01488             {
01489                 if (h_normalizer && h_normalizer[i])
01490                     h[int64_t(i)*nsym+j]/=h_normalizer[i];
01491             }
01492         }
01493     }
01494     SG_FREE(h_normalizer);
01495 
01496     *hist=h;
01497     *rows=nsym;
01498     *cols=slen;
01499 }
01500 
01501 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
01502 {
01503     ASSERT(rows == get_num_symbols())
01504     cleanup();
01505     float64_t* randoms=SG_MALLOC(float64_t, cols);
01506     SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
01507 
01508     for (int32_t i=0; i<num_vec; i++)
01509     {
01510         sf[i].string=SG_MALLOC(ST, cols);
01511         sf[i].slen=cols;
01512 
01513         SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0);
01514 
01515         for (int32_t j=0; j<cols; j++)
01516         {
01517             float64_t lik=hist[int64_t(j)*rows+0];
01518 
01519             int32_t c;
01520             for (c=0; c<rows-1; c++)
01521             {
01522                 if (randoms[j]<=lik)
01523                     break;
01524                 lik+=hist[int64_t(j)*rows+c+1];
01525             }
01526             sf[i].string[j]=alphabet->remap_to_char(c);
01527         }
01528     }
01529     SG_FREE(randoms);
01530     set_features(sf, num_vec, cols);
01531 }
01532 
01533 /*
01534 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
01535 {
01536     int *s;
01537     int32_t nStr=get_num_vectors();
01538 
01539     int32_t nfeat=0;
01540     for (int32_t i=0; i < nStr; ++i)
01541         nfeat += get_vector_length[i] - d1 -d2;
01542     SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
01543     int32_t c=0;
01544     for (int32_t i=0; i < nStr; ++i)
01545     {
01546     int32_t len;
01547     bool free_vec;
01548     ST* S=get_feature_vector(vec_num, len, free_vec);
01549     free_feature_vector(vec, vec_num, free_vec);
01550         int32_t n=len - d1 - d2;
01551         s=S[i];
01552         for (int32_t j=0; j < n; ++j)
01553         {
01554             F[c].feature1=s[j];
01555             F[c].feature2=s[j+d1];
01556             F[c].feature3=s[j+d1+d2];
01557             F[c].group=i;
01558             c++;
01559         }
01560     }
01561     ASSERT(nfeat==c)
01562     return F;
01563 }
01564 
01565 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
01566 {
01567     int i, j;
01568     int n, nfeat;
01569     int *group;
01570     int *features;
01571     int *s;
01572     int c;
01573     SSKFeatures *F;
01574 
01575     nfeat=0;
01576     for (i=0; i < nStr; ++i)
01577         nfeat += len[i] - d1;
01578     group=(int *)SG_MALLOC(nfeat*sizeof(int));
01579     features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
01580     c=0;
01581     for (i=0; i < nStr; ++i)
01582     {
01583         n=len[i] - d1;
01584         s=S[i];
01585         for (j=0; j < n; ++j)
01586         {
01587             features[c]=s[j];
01588             features[c+nfeat]=s[j+d1];
01589             group[c]=i;
01590             c++;
01591         }
01592     }
01593     if (nfeat!=c)
01594         printf("Something is wrong...\n");
01595     F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
01596     (*F).features=features;
01597     (*F).group=group;
01598     (*F).n=nfeat;
01599     return F;
01600 }
01601 */
01602 
01603 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset(
01604         SGVector<index_t> indices)
01605 {
01606     /* string list to create new CStringFeatures from */
01607     SGStringList<ST> list_copy(indices.vlen, max_string_length);
01608 
01609     /* copy all features */
01610     for (index_t i=0; i<indices.vlen; ++i)
01611     {
01612         /* index with respect to possible subset */
01613         index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]);
01614 
01615         /* copy string */
01616         SGString<ST> current_string=features[real_idx];
01617         SGString<ST> string_copy(current_string.slen);
01618         memcpy(string_copy.string, current_string.string,
01619             current_string.slen*sizeof(ST));
01620         list_copy.strings[i]=string_copy;
01621     }
01622 
01623     /* create copy instance */
01624     CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
01625 
01626     /* max string length may have changed */
01627     result->determine_maximum_string_length();
01628 
01629     /* keep things from original features (otherwise assertions in x-val) */
01630     result->order=order;
01631     result->compute_symbol_mask_table(result->alphabet->get_num_symbols());
01632 
01633     SG_REF(result);
01634 
01635     return result;
01636 }
01637 
01638 template<class ST> void CStringFeatures<ST>::subset_changed_post()
01639 {
01640     /* max string length has to be updated */
01641     determine_maximum_string_length();
01642 }
01643 
01644 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
01645 {
01646     ASSERT(features && num<get_num_vectors())
01647 
01648     int32_t real_num=m_subset_stack->subset_idx_conversion(num);
01649 
01650     len=features[real_num].slen;
01651     if (len<=0)
01652         return NULL;
01653 
01654     ST* target=SG_MALLOC(ST, len);
01655     memcpy(target, features[real_num].string, len*sizeof(ST));
01656     return target;
01657 }
01658 
01659 template<class ST> void CStringFeatures<ST>::init()
01660 {
01661     set_generic<ST>();
01662 
01663     alphabet=NULL;
01664     num_vectors=0;
01665     features=NULL;
01666     single_string=NULL;
01667     length_of_single_string=0;
01668     max_string_length=0;
01669     order=0;
01670     preprocess_on_get=false;
01671     feature_cache=NULL;
01672     symbol_mask_table=NULL;
01673     symbol_mask_table_len=0;
01674     num_symbols=0.0;
01675     original_num_symbols=0;
01676 
01677     m_parameters->add((CSGObject**) &alphabet, "alphabet");
01678     m_parameters->add_vector(&features, &num_vectors, "features",
01679             "This contains the array of features.");
01680     m_parameters->add_vector(&single_string,
01681             &length_of_single_string,
01682             "single_string",
01683             "Created by sliding window.");
01684     m_parameters->add(&max_string_length, "max_string_length",
01685             "Length of longest string.");
01686     m_parameters->add(&num_symbols, "num_symbols",
01687             "Number of used symbols.");
01688     m_parameters->add(&original_num_symbols, "original_num_symbols",
01689             "Original number of used symbols.");
01690     m_parameters->add(&order, "order",
01691             "Order used in higher order mapping.");
01692     m_parameters->add(&preprocess_on_get, "preprocess_on_get",
01693             "Preprocess on-the-fly?");
01694 
01695     m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask_table", "Symbol mask table - using in higher order mapping");
01696 }
01697 
01702 template<> EFeatureType CStringFeatures<bool>::get_feature_type() const
01703 {
01704     return F_BOOL;
01705 }
01706 
01711 template<> EFeatureType CStringFeatures<char>::get_feature_type() const
01712 {
01713     return F_CHAR;
01714 }
01715 
01720 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type() const
01721 {
01722     return F_BYTE;
01723 }
01724 
01729 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type() const
01730 {
01731     return F_SHORT;
01732 }
01733 
01738 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type() const
01739 {
01740     return F_WORD;
01741 }
01742 
01747 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type() const
01748 {
01749     return F_INT;
01750 }
01751 
01756 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type() const
01757 {
01758     return F_UINT;
01759 }
01760 
01765 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type() const
01766 {
01767     return F_LONG;
01768 }
01769 
01774 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type() const
01775 {
01776     return F_ULONG;
01777 }
01778 
01783 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type() const
01784 {
01785     return F_SHORTREAL;
01786 }
01787 
01792 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type() const
01793 {
01794     return F_DREAL;
01795 }
01796 
01801 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type() const
01802 {
01803     return F_LONGREAL;
01804 }
01805 
01806 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01807 {
01808     return symbol;
01809 }
01810 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01811 {
01812     return symbol;
01813 }
01814 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01815 {
01816     return symbol;
01817 }
01818 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01819 {
01820     return symbol;
01821 }
01822 
01823 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
01824 {
01825     return false;
01826 }
01827 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
01828 {
01829     return 0;
01830 }
01831 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
01832 {
01833     return 0;
01834 }
01835 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
01836 {
01837     return 0;
01838 }
01839 
01840 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
01841 {
01842     return symbol;
01843 }
01844 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
01845 {
01846     return symbol;
01847 }
01848 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
01849 {
01850     return symbol;
01851 }
01852 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
01853 {
01854     return symbol;
01855 }
01856 
01857 #ifndef SUNOS
01858 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01859 {
01860     return false;
01861 }
01862 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01863 {
01864     return false;
01865 }
01866 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01867 {
01868     return false;
01869 }
01870 #endif
01871 
01872 template<>  void CStringFeatures<float32_t>::embed_features(int32_t p_order)
01873 {
01874 }
01875 template<>  void CStringFeatures<float64_t>::embed_features(int32_t p_order)
01876 {
01877 }
01878 template<>  void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
01879 {
01880 }
01881 
01882 template<>  void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
01883 {
01884 }
01885 template<>  void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
01886 {
01887 }
01888 template<>  void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
01889 {
01890 }
01891 
01892 template<>  float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
01893 {
01894     return 0;
01895 }
01896 template<>  float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
01897 {
01898     return 0;
01899 }
01900 template<>  floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
01901 {
01902     return 0;
01903 }
01904 
01905 template<>  void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
01906 {
01907 }
01908 template<>  void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
01909 {
01910 }
01911 template<>  void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
01912 {
01913 }
01914 #define LOAD(f_load, sg_type)                                               \
01915 template<> void CStringFeatures<sg_type>::load(CFile* loader)       \
01916 {                                                                           \
01917     SG_INFO("loading...\n")                                             \
01918                                                                             \
01919     SG_SET_LOCALE_C;                                                    \
01920     SGString<sg_type>* strs;                                                \
01921     int32_t num_str;                                                        \
01922     int32_t max_len;                                                        \
01923     loader->f_load(strs, num_str, max_len);                                 \
01924     set_features(strs, num_str, max_len);                                   \
01925     SG_RESET_LOCALE;                                                    \
01926 }
01927 
01928 LOAD(get_string_list, bool)
01929 LOAD(get_string_list, char)
01930 LOAD(get_string_list, int8_t)
01931 LOAD(get_string_list, uint8_t)
01932 LOAD(get_string_list, int16_t)
01933 LOAD(get_string_list, uint16_t)
01934 LOAD(get_string_list, int32_t)
01935 LOAD(get_string_list, uint32_t)
01936 LOAD(get_string_list, int64_t)
01937 LOAD(get_string_list, uint64_t)
01938 LOAD(get_string_list, float32_t)
01939 LOAD(get_string_list, float64_t)
01940 LOAD(get_string_list, floatmax_t)
01941 #undef LOAD
01942 
01943 #define SAVE(f_write, sg_type)                                              \
01944 template<> void CStringFeatures<sg_type>::save(CFile* writer)       \
01945 {                                                                           \
01946     if (m_subset_stack->has_subsets())                                                          \
01947         SG_ERROR("save() is not possible on subset")                        \
01948     SG_SET_LOCALE_C;                                                    \
01949     ASSERT(writer)                                                          \
01950     writer->f_write(features, num_vectors);                                 \
01951     SG_RESET_LOCALE;                                                    \
01952 }
01953 
01954 SAVE(set_string_list, bool)
01955 SAVE(set_string_list, char)
01956 SAVE(set_string_list, int8_t)
01957 SAVE(set_string_list, uint8_t)
01958 SAVE(set_string_list, int16_t)
01959 SAVE(set_string_list, uint16_t)
01960 SAVE(set_string_list, int32_t)
01961 SAVE(set_string_list, uint32_t)
01962 SAVE(set_string_list, int64_t)
01963 SAVE(set_string_list, uint64_t)
01964 SAVE(set_string_list, float32_t)
01965 SAVE(set_string_list, float64_t)
01966 SAVE(set_string_list, floatmax_t)
01967 #undef SAVE
01968 
01969 template <class ST> template <class CT>
01970 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
01971         int32_t p_order, int32_t gap, bool rev)
01972 {
01973     remove_all_subsets();
01974     ASSERT(sf)
01975 
01976     CAlphabet* alpha=sf->get_alphabet();
01977     ASSERT(alpha->get_num_symbols_in_histogram() > 0)
01978 
01979     this->order=p_order;
01980     cleanup();
01981 
01982     num_vectors=sf->get_num_vectors();
01983     ASSERT(num_vectors>0)
01984     max_string_length=sf->get_max_vector_length()-start;
01985     features=SG_MALLOC(SGString<ST>, num_vectors);
01986 
01987     SG_DEBUG("%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01988             alpha->get_num_symbols_in_histogram());
01989 
01990     for (int32_t i=0; i<num_vectors; i++)
01991     {
01992         int32_t len=-1;
01993         bool vfree;
01994         CT* c=sf->get_feature_vector(i, len, vfree);
01995         ASSERT(!vfree) // won't work when preprocessors are attached
01996 
01997         features[i].string=SG_MALLOC(ST, len);
01998         features[i].slen=len;
01999 
02000         ST* str=features[i].string;
02001         for (int32_t j=0; j<len; j++)
02002             str[j]=(ST) alpha->remap_to_bin(c[j]);
02003     }
02004 
02005     original_num_symbols=alpha->get_num_symbols();
02006     int32_t max_val=alpha->get_num_bits();
02007 
02008     SG_UNREF(alpha);
02009 
02010     if (p_order>1)
02011         num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
02012     else
02013         num_symbols=original_num_symbols;
02014     SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
02015 
02016     if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
02017     {
02018         SG_ERROR("symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val)
02019         return false;
02020     }
02021 
02022     SG_DEBUG("translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST))
02023     for (int32_t line=0; line<num_vectors; line++)
02024     {
02025         int32_t len=0;
02026         bool vfree;
02027         ST* fv=get_feature_vector(line, len, vfree);
02028         ASSERT(!vfree) // won't work when preprocessors are attached
02029 
02030         if (rev)
02031             CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
02032         else
02033             CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
02034 
02035         /* fix the length of the string -- hacky */
02036         features[line].slen-=start+gap ;
02037         if (features[line].slen<0)
02038             features[line].slen=0 ;
02039     }
02040 
02041     compute_symbol_mask_table(max_val);
02042 
02043     return true;
02044 }
02045 
02046 template class CStringFeatures<bool>;
02047 template class CStringFeatures<char>;
02048 template class CStringFeatures<int8_t>;
02049 template class CStringFeatures<uint8_t>;
02050 template class CStringFeatures<int16_t>;
02051 template class CStringFeatures<uint16_t>;
02052 template class CStringFeatures<int32_t>;
02053 template class CStringFeatures<uint32_t>;
02054 template class CStringFeatures<int64_t>;
02055 template class CStringFeatures<uint64_t>;
02056 template class CStringFeatures<float32_t>;
02057 template class CStringFeatures<float64_t>;
02058 template class CStringFeatures<floatmax_t>;
02059 
02060 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02061 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02062 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02063 
02064 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02065 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02066 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
02067 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines

SHOGUN Machine Learning Toolbox - Documentation