SHOGUN
v3.2.0
|
00001 #include <shogun/features/StringFeatures.h> 00002 #include <shogun/preprocessor/Preprocessor.h> 00003 #include <shogun/preprocessor/StringPreprocessor.h> 00004 #include <shogun/io/MemoryMappedFile.h> 00005 #include <shogun/io/SGIO.h> 00006 #include <shogun/mathematics/Math.h> 00007 #include <shogun/base/Parameter.h> 00008 #include <shogun/lib/SGStringList.h> 00009 00010 #include <sys/types.h> 00011 #include <sys/stat.h> 00012 #include <dirent.h> 00013 #include <stdio.h> 00014 #include <stdlib.h> 00015 #include <unistd.h> 00016 00017 00018 namespace shogun 00019 { 00020 00021 template<class ST> CStringFeatures<ST>::CStringFeatures() : CFeatures(0) 00022 { 00023 init(); 00024 alphabet=new CAlphabet(); 00025 } 00026 00027 template<class ST> CStringFeatures<ST>::CStringFeatures(EAlphabet alpha) : CFeatures(0) 00028 { 00029 init(); 00030 00031 alphabet=new CAlphabet(alpha); 00032 SG_REF(alphabet); 00033 num_symbols=alphabet->get_num_symbols(); 00034 original_num_symbols=num_symbols; 00035 } 00036 00037 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha) 00038 : CFeatures(0) 00039 { 00040 init(); 00041 00042 alphabet=new CAlphabet(alpha); 00043 SG_REF(alphabet); 00044 num_symbols=alphabet->get_num_symbols(); 00045 original_num_symbols=num_symbols; 00046 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length); 00047 } 00048 00049 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha) 00050 : CFeatures(0) 00051 { 00052 init(); 00053 00054 alphabet=new CAlphabet(alpha); 00055 SG_REF(alphabet); 00056 num_symbols=alphabet->get_num_symbols(); 00057 original_num_symbols=num_symbols; 00058 set_features(string_list.strings, string_list.num_strings, string_list.max_string_length); 00059 } 00060 00061 template<class ST> CStringFeatures<ST>::CStringFeatures(CAlphabet* alpha) 00062 : CFeatures(0) 00063 { 00064 init(); 00065 00066 ASSERT(alpha) 00067 SG_REF(alpha); 00068 alphabet=alpha; 00069 num_symbols=alphabet->get_num_symbols(); 00070 original_num_symbols=num_symbols; 00071 } 00072 00073 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig) 00074 : CFeatures(orig), num_vectors(orig.num_vectors), 00075 single_string(orig.single_string), 00076 length_of_single_string(orig.length_of_single_string), 00077 max_string_length(orig.max_string_length), 00078 num_symbols(orig.num_symbols), 00079 original_num_symbols(orig.original_num_symbols), 00080 order(orig.order), preprocess_on_get(false), 00081 feature_cache(NULL) 00082 { 00083 init(); 00084 00085 ASSERT(orig.single_string == NULL) //not implemented 00086 00087 alphabet=orig.alphabet; 00088 SG_REF(alphabet); 00089 00090 if (orig.features) 00091 { 00092 features=SG_MALLOC(SGString<ST>, orig.num_vectors); 00093 00094 for (int32_t i=0; i<num_vectors; i++) 00095 { 00096 features[i].string=SG_MALLOC(ST, orig.features[i].slen); 00097 features[i].slen=orig.features[i].slen; 00098 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen); 00099 } 00100 } 00101 00102 if (orig.symbol_mask_table) 00103 { 00104 symbol_mask_table=SG_MALLOC(ST, 256); 00105 symbol_mask_table_len=256; 00106 00107 for (int32_t i=0; i<256; i++) 00108 symbol_mask_table[i]=orig.symbol_mask_table[i]; 00109 } 00110 00111 m_subset_stack=orig.m_subset_stack; 00112 SG_REF(m_subset_stack); 00113 } 00114 00115 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha) 00116 : CFeatures(), num_vectors(0), 00117 features(NULL), single_string(NULL), length_of_single_string(0), 00118 max_string_length(0), order(0), 00119 preprocess_on_get(false), feature_cache(NULL) 00120 { 00121 init(); 00122 00123 alphabet=new CAlphabet(alpha); 00124 SG_REF(alphabet); 00125 num_symbols=alphabet->get_num_symbols(); 00126 original_num_symbols=num_symbols; 00127 load(loader); 00128 } 00129 00130 template<class ST> CStringFeatures<ST>::~CStringFeatures() 00131 { 00132 cleanup(); 00133 00134 SG_UNREF(alphabet); 00135 } 00136 00137 template<class ST> void CStringFeatures<ST>::cleanup() 00138 { 00139 remove_all_subsets(); 00140 00141 if (single_string) 00142 { 00143 SG_FREE(single_string); 00144 single_string=NULL; 00145 } 00146 else 00147 cleanup_feature_vectors(0, num_vectors-1); 00148 00149 /* 00150 if (single_string) 00151 { 00152 SG_FREE(single_string); 00153 single_string=NULL; 00154 } 00155 else 00156 cleanup_feature_vectors(0, num_vectors-1); 00157 */ 00158 00159 num_vectors=0; 00160 SG_FREE(features); 00161 SG_FREE(symbol_mask_table); 00162 features=NULL; 00163 symbol_mask_table=NULL; 00164 00165 /* start with a fresh alphabet, but instead of emptying the histogram 00166 * create a new object (to leave the alphabet object alone if it is used 00167 * by others) 00168 */ 00169 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00170 SG_UNREF(alphabet); 00171 alphabet=alpha; 00172 SG_REF(alphabet); 00173 } 00174 00175 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num) 00176 { 00177 ASSERT(num<get_num_vectors()) 00178 00179 if (features) 00180 { 00181 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00182 SG_FREE(features[real_num].string); 00183 features[real_num].string=NULL; 00184 features[real_num].slen=0; 00185 00186 determine_maximum_string_length(); 00187 } 00188 } 00189 00190 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop) 00191 { 00192 if (features && get_num_vectors()) 00193 { 00194 ASSERT(start<get_num_vectors()) 00195 ASSERT(stop<get_num_vectors()) 00196 00197 for (int32_t i=start; i<=stop; i++) 00198 { 00199 int32_t real_num=m_subset_stack->subset_idx_conversion(i); 00200 SG_FREE(features[real_num].string); 00201 features[real_num].string=NULL; 00202 features[real_num].slen=0; 00203 } 00204 determine_maximum_string_length(); 00205 } 00206 } 00207 00208 template<class ST> EFeatureClass CStringFeatures<ST>::get_feature_class() const { return C_STRING; } 00209 00210 template<class ST> EFeatureType CStringFeatures<ST>::get_feature_type() const { return F_UNKNOWN; } 00211 00212 template<class ST> CAlphabet* CStringFeatures<ST>::get_alphabet() 00213 { 00214 SG_REF(alphabet); 00215 return alphabet; 00216 } 00217 00218 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const 00219 { 00220 return new CStringFeatures<ST>(*this); 00221 } 00222 00223 template<class ST> SGVector<ST> CStringFeatures<ST>::get_feature_vector(int32_t num) 00224 { 00225 ASSERT(features) 00226 if (num>=get_num_vectors()) 00227 { 00228 SG_ERROR("Index out of bounds (number of strings %d, you " 00229 "requested %d)\n", get_num_vectors(), num); 00230 } 00231 00232 int32_t l; 00233 bool free_vec; 00234 ST* vec=get_feature_vector(num, l, free_vec); 00235 ST* dst=SG_MALLOC(ST, l); 00236 memcpy(dst, vec, l*sizeof(ST)); 00237 free_feature_vector(vec, num, free_vec); 00238 return SGVector<ST>(dst, l, true); 00239 } 00240 00241 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num) 00242 { 00243 ASSERT(features) 00244 00245 if (m_subset_stack->has_subsets()) 00246 SG_ERROR("A subset is set, cannot set feature vector\n") 00247 00248 if (num>=num_vectors) 00249 { 00250 SG_ERROR("Index out of bounds (number of strings %d, you " 00251 "requested %d)\n", num_vectors, num); 00252 } 00253 00254 if (vector.vlen<=0) 00255 SG_ERROR("String has zero or negative length\n") 00256 00257 cleanup_feature_vector(num); 00258 features[num].slen=vector.vlen; 00259 features[num].string=SG_MALLOC(ST, vector.vlen); 00260 memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST)); 00261 00262 determine_maximum_string_length(); 00263 } 00264 00265 template<class ST> void CStringFeatures<ST>::enable_on_the_fly_preprocessing() 00266 { 00267 preprocess_on_get=true; 00268 } 00269 00270 template<class ST> void CStringFeatures<ST>::disable_on_the_fly_preprocessing() 00271 { 00272 preprocess_on_get=false; 00273 } 00274 00275 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree) 00276 { 00277 ASSERT(features) 00278 if (num>=get_num_vectors()) 00279 SG_ERROR("Requested feature vector with index %d while total num is", num, get_num_vectors()) 00280 00281 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00282 00283 if (!preprocess_on_get) 00284 { 00285 dofree=false; 00286 len=features[real_num].slen; 00287 return features[real_num].string; 00288 } 00289 else 00290 { 00291 SG_DEBUG("computing feature vector!\n") 00292 ST* feat=compute_feature_vector(num, len); 00293 dofree=true; 00294 00295 if (get_num_preprocessors()) 00296 { 00297 ST* tmp_feat_before=feat; 00298 00299 for (int32_t i=0; i<get_num_preprocessors(); i++) 00300 { 00301 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i); 00302 feat=p->apply_to_string(tmp_feat_before, len); 00303 SG_UNREF(p); 00304 SG_FREE(tmp_feat_before); 00305 tmp_feat_before=feat; 00306 } 00307 } 00308 // TODO: implement caching 00309 return feat; 00310 } 00311 } 00312 00313 template<class ST> CStringFeatures<ST>* CStringFeatures<ST>::get_transposed() 00314 { 00315 int32_t num_feat; 00316 int32_t num_vec; 00317 SGString<ST>* s=get_transposed(num_feat, num_vec); 00318 SGStringList<ST> string_list; 00319 string_list.strings = s; 00320 string_list.num_strings = num_vec; 00321 string_list.max_string_length = num_feat; 00322 00323 return new CStringFeatures<ST>(string_list, alphabet); 00324 } 00325 00326 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec) 00327 { 00328 num_feat=get_num_vectors(); 00329 num_vec=get_max_vector_length(); 00330 ASSERT(have_same_length()) 00331 00332 SG_DEBUG("Allocating memory for transposed string features of size %ld\n", 00333 int64_t(num_feat)*num_vec); 00334 00335 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec); 00336 00337 for (int32_t i=0; i<num_vec; i++) 00338 { 00339 sf[i].string=SG_MALLOC(ST, num_feat); 00340 sf[i].slen=num_feat; 00341 } 00342 00343 for (int32_t i=0; i<num_feat; i++) 00344 { 00345 int32_t len=0; 00346 bool free_vec=false; 00347 ST* vec=get_feature_vector(i, len, free_vec); 00348 00349 for (int32_t j=0; j<num_vec; j++) 00350 sf[j].string[i]=vec[j]; 00351 00352 free_feature_vector(vec, i, free_vec); 00353 } 00354 return sf; 00355 } 00356 00357 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree) 00358 { 00359 if (num>=get_num_vectors()) 00360 { 00361 SG_ERROR( 00362 "Trying to access string[%d] but num_str=%d\n", num, 00363 get_num_vectors()); 00364 } 00365 00366 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00367 00368 if (feature_cache) 00369 feature_cache->unlock_entry(real_num); 00370 00371 if (dofree) 00372 SG_FREE(feat_vec); 00373 } 00374 00375 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num) 00376 { 00377 if (num>=get_num_vectors()) 00378 { 00379 SG_ERROR( 00380 "Trying to access string[%d] but num_str=%d\n", num, 00381 get_num_vectors()); 00382 } 00383 00384 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 00385 00386 if (feature_cache) 00387 feature_cache->unlock_entry(real_num); 00388 } 00389 00390 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num) 00391 { 00392 ASSERT(vec_num<get_num_vectors()) 00393 00394 int32_t len; 00395 bool free_vec; 00396 ST* vec=get_feature_vector(vec_num, len, free_vec); 00397 ASSERT(feat_num<len) 00398 ST result=vec[feat_num]; 00399 free_feature_vector(vec, vec_num, free_vec); 00400 00401 return result; 00402 } 00403 00404 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num) 00405 { 00406 ASSERT(vec_num<get_num_vectors()) 00407 00408 int32_t len; 00409 bool free_vec; 00410 ST* vec=get_feature_vector(vec_num, len, free_vec); 00411 free_feature_vector(vec, vec_num, free_vec); 00412 return len; 00413 } 00414 00415 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length() 00416 { 00417 return max_string_length; 00418 } 00419 00420 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const 00421 { 00422 return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors; 00423 } 00424 00425 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; } 00426 00427 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); } 00428 00429 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; } 00430 00431 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; } 00432 00433 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask) 00434 { 00435 ASSERT(symbol_mask_table) 00436 return symbol_mask_table[mask] & symbol; 00437 } 00438 00439 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount) 00440 { 00441 ASSERT(alphabet) 00442 return (offset << (amount*alphabet->get_num_bits())); 00443 } 00444 00445 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount) 00446 { 00447 ASSERT(alphabet) 00448 return (symbol >> (amount*alphabet->get_num_bits())); 00449 } 00450 00451 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin, 00452 EAlphabet ascii_alphabet, EAlphabet binary_alphabet) 00453 { 00454 remove_all_subsets(); 00455 00456 size_t blocksize=1024*1024; 00457 size_t required_blocksize=0; 00458 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize); 00459 uint8_t* overflow=NULL; 00460 int32_t overflow_len=0; 00461 00462 cleanup(); 00463 00464 CAlphabet* alpha=new CAlphabet(ascii_alphabet); 00465 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet); 00466 00467 FILE* f=fopen(fname, "ro"); 00468 00469 if (f) 00470 { 00471 num_vectors=0; 00472 max_string_length=0; 00473 00474 SG_INFO("counting line numbers in file %s\n", fname) 00475 size_t block_offs=0; 00476 size_t old_block_offs=0; 00477 fseek(f, 0, SEEK_END); 00478 size_t fsize=ftell(f); 00479 rewind(f); 00480 00481 if (blocksize>fsize) 00482 blocksize=fsize; 00483 00484 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize) 00485 00486 size_t sz=blocksize; 00487 while (sz == blocksize) 00488 { 00489 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00490 for (size_t i=0; i<sz; i++) 00491 { 00492 block_offs++; 00493 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00494 { 00495 num_vectors++; 00496 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00497 old_block_offs=block_offs; 00498 } 00499 } 00500 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t") 00501 } 00502 00503 SG_INFO("found %d strings\n", num_vectors) 00504 SG_FREE(dummy); 00505 blocksize=required_blocksize; 00506 dummy=SG_MALLOC(uint8_t, blocksize); 00507 overflow=SG_MALLOC(uint8_t, blocksize); 00508 features=SG_MALLOC(SGString<ST>, num_vectors); 00509 00510 rewind(f); 00511 sz=blocksize; 00512 int32_t lines=0; 00513 while (sz == blocksize) 00514 { 00515 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00516 00517 size_t old_sz=0; 00518 for (size_t i=0; i<sz; i++) 00519 { 00520 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00521 { 00522 int32_t len=i-old_sz; 00523 //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz) 00524 max_string_length=CMath::max(max_string_length, len+overflow_len); 00525 00526 features[lines].slen=len; 00527 features[lines].string=SG_MALLOC(ST, len); 00528 00529 if (remap_to_bin) 00530 { 00531 for (int32_t j=0; j<overflow_len; j++) 00532 features[lines].string[j]=alpha->remap_to_bin(overflow[j]); 00533 for (int32_t j=0; j<len; j++) 00534 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]); 00535 alpha->add_string_to_histogram(&dummy[old_sz], len); 00536 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen); 00537 } 00538 else 00539 { 00540 for (int32_t j=0; j<overflow_len; j++) 00541 features[lines].string[j]=overflow[j]; 00542 for (int32_t j=0; j<len; j++) 00543 features[lines].string[j+overflow_len]=dummy[old_sz+j]; 00544 alpha->add_string_to_histogram(&dummy[old_sz], len); 00545 alpha->add_string_to_histogram(features[lines].string, features[lines].slen); 00546 } 00547 00548 // clear overflow 00549 overflow_len=0; 00550 00551 //CMath::display_vector(features[lines].string, len); 00552 old_sz=i+1; 00553 lines++; 00554 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t") 00555 } 00556 } 00557 for (size_t i=old_sz; i<sz; i++) 00558 overflow[i-old_sz]=dummy[i]; 00559 00560 overflow_len=sz-old_sz; 00561 } 00562 00563 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00564 { 00565 SG_INFO("file successfully read\n") 00566 SG_INFO("max_string_length=%d\n", max_string_length) 00567 SG_INFO("num_strings=%d\n", num_vectors) 00568 } 00569 fclose(f); 00570 } 00571 00572 SG_FREE(dummy); 00573 00574 SG_UNREF(alphabet); 00575 00576 if (remap_to_bin) 00577 alphabet=alpha_bin; 00578 else 00579 alphabet=alpha; 00580 SG_REF(alphabet); 00581 num_symbols=alphabet->get_num_symbols(); 00582 } 00583 00584 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid) 00585 { 00586 remove_all_subsets(); 00587 00588 int32_t i=0; 00589 uint64_t len=0; 00590 uint64_t offs=0; 00591 int32_t num=0; 00592 int32_t max_len=0; 00593 00594 CMemoryMappedFile<char> f(fname); 00595 00596 while (true) 00597 { 00598 char* s=f.get_line(len, offs); 00599 if (!s) 00600 break; 00601 00602 if (len>0 && s[0]=='>') 00603 num++; 00604 } 00605 00606 if (num==0) 00607 SG_ERROR("No fasta hunks (lines starting with '>') found\n") 00608 00609 cleanup(); 00610 SG_UNREF(alphabet); 00611 alphabet=new CAlphabet(DNA); 00612 num_symbols=alphabet->get_num_symbols(); 00613 00614 SGString<ST>* strings=SG_MALLOC(SGString<ST>, num); 00615 offs=0; 00616 00617 for (i=0;i<num; i++) 00618 { 00619 uint64_t id_len=0; 00620 char* id=f.get_line(id_len, offs); 00621 00622 char* fasta=f.get_line(len, offs); 00623 char* s=fasta; 00624 int32_t fasta_len=0; 00625 int32_t spanned_lines=0; 00626 00627 while (true) 00628 { 00629 if (!s || len==0) 00630 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len) 00631 00632 if (s[0]=='>' || offs==f.get_size()) 00633 { 00634 offs-=len+1; // seek to beginning 00635 if (offs==f.get_size()) 00636 { 00637 SG_DEBUG("at EOF\n") 00638 fasta_len+=len; 00639 } 00640 00641 len=fasta_len-spanned_lines; 00642 strings[i].string=SG_MALLOC(ST, len); 00643 strings[i].slen=len; 00644 00645 ST* str=strings[i].string; 00646 int32_t idx=0; 00647 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines) 00648 00649 for (int32_t j=0; j<fasta_len; j++) 00650 { 00651 if (fasta[j]=='\n') 00652 continue; 00653 00654 ST c=(ST) fasta[j]; 00655 00656 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j])) 00657 c=(ST) 'A'; 00658 00659 if (uint64_t(idx)>=len) 00660 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str) 00661 str[idx++]=c; 00662 } 00663 max_len=CMath::max(max_len, strings[i].slen); 00664 00665 00666 break; 00667 } 00668 00669 spanned_lines++; 00670 fasta_len+=len+1; // including '\n' 00671 s=f.get_line(len, offs); 00672 } 00673 } 00674 return set_features(strings, num, max_len); 00675 } 00676 00677 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname, 00678 bool ignore_invalid, bool bitremap_in_single_string) 00679 { 00680 remove_all_subsets(); 00681 00682 CMemoryMappedFile<char> f(fname); 00683 00684 int32_t i=0; 00685 uint64_t len=0; 00686 uint64_t offs=0; 00687 00688 int32_t num=f.get_num_lines(); 00689 int32_t max_len=0; 00690 00691 if (num%4) 00692 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n") 00693 num/=4; 00694 00695 cleanup(); 00696 SG_UNREF(alphabet); 00697 alphabet=new CAlphabet(DNA); 00698 00699 SGString<ST>* strings; 00700 00701 ST* str=NULL; 00702 if (bitremap_in_single_string) 00703 { 00704 strings=SG_MALLOC(SGString<ST>, 1); 00705 strings[0].string=SG_MALLOC(ST, num); 00706 strings[0].slen=num; 00707 f.get_line(len, offs); 00708 f.get_line(len, offs); 00709 order=len; 00710 max_len=num; 00711 offs=0; 00712 original_num_symbols=alphabet->get_num_symbols(); 00713 str=SG_MALLOC(ST, len); 00714 } 00715 else 00716 strings=SG_MALLOC(SGString<ST>, num); 00717 00718 for (i=0;i<num; i++) 00719 { 00720 if (!f.get_line(len, offs)) 00721 SG_ERROR("Error reading 'read' identifier in line %d", 4*i) 00722 00723 char* s=f.get_line(len, offs); 00724 if (!s || len==0) 00725 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len) 00726 00727 if (bitremap_in_single_string) 00728 { 00729 if (len!=(uint64_t) order) 00730 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len) 00731 for (int32_t j=0; j<order; j++) 00732 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]); 00733 00734 strings[0].string[i]=embed_word(str, order); 00735 } 00736 else 00737 { 00738 strings[i].string=SG_MALLOC(ST, len); 00739 strings[i].slen=len; 00740 str=strings[i].string; 00741 00742 if (ignore_invalid) 00743 { 00744 for (uint64_t j=0; j<len; j++) 00745 { 00746 if (alphabet->is_valid((uint8_t) s[j])) 00747 str[j]= (ST) s[j]; 00748 else 00749 str[j]= (ST) 'A'; 00750 } 00751 } 00752 else 00753 { 00754 for (uint64_t j=0; j<len; j++) 00755 str[j]= (ST) s[j]; 00756 } 00757 max_len=CMath::max(max_len, (int32_t) len); 00758 } 00759 00760 00761 if (!f.get_line(len, offs)) 00762 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2) 00763 00764 if (!f.get_line(len, offs)) 00765 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3) 00766 } 00767 00768 if (bitremap_in_single_string) 00769 num=1; 00770 00771 num_vectors=num; 00772 max_string_length=max_len; 00773 features=strings; 00774 00775 return true; 00776 } 00777 00778 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname) 00779 { 00780 remove_all_subsets(); 00781 00782 struct dirent **namelist; 00783 int32_t n; 00784 00785 SGIO::set_dirname(dirname); 00786 00787 SG_DEBUG("dirname '%s'\n", dirname) 00788 00789 n=scandir(dirname, &namelist, &SGIO::filter, alphasort); 00790 if (n <= 0) 00791 { 00792 SG_ERROR("error calling scandir - no files found\n") 00793 return false; 00794 } 00795 else 00796 { 00797 SGString<ST>* strings=NULL; 00798 00799 int32_t num=0; 00800 int32_t max_len=-1; 00801 00802 //usually n==num_vec, but it might not in race conditions 00803 //(file perms modified, file erased) 00804 strings=SG_MALLOC(SGString<ST>, n); 00805 00806 for (int32_t i=0; i<n; i++) 00807 { 00808 char* fname=SGIO::concat_filename(namelist[i]->d_name); 00809 00810 struct stat s; 00811 off_t filesize=0; 00812 00813 if (!stat(fname, &s) && s.st_size>0) 00814 { 00815 filesize=s.st_size/sizeof(ST); 00816 00817 FILE* f=fopen(fname, "ro"); 00818 if (f) 00819 { 00820 ST* str=SG_MALLOC(ST, filesize); 00821 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize) 00822 if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize) 00823 SG_ERROR("failed to read file\n") 00824 strings[num].string=str; 00825 strings[num].slen=filesize; 00826 max_len=CMath::max(max_len, strings[num].slen); 00827 00828 num++; 00829 fclose(f); 00830 } 00831 } 00832 else 00833 SG_ERROR("empty or non readable file \'%s\'\n", fname) 00834 00835 SG_FREE(namelist[i]); 00836 } 00837 SG_FREE(namelist); 00838 00839 if (num>0 && strings) 00840 { 00841 set_features(strings, num, max_len); 00842 return true; 00843 } 00844 } 00845 return false; 00846 } 00847 00848 template<class ST> void CStringFeatures<ST>::set_features(SGStringList<ST> feats) 00849 { 00850 set_features(feats.strings, feats.num_strings, feats.max_string_length); 00851 } 00852 00853 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 00854 { 00855 if (m_subset_stack->has_subsets()) 00856 SG_ERROR("Cannot call set_features() with subset.\n") 00857 00858 if (p_features) 00859 { 00860 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00861 00862 //compute histogram for char/byte 00863 for (int32_t i=0; i<p_num_vectors; i++) 00864 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00865 00866 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()) 00867 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()) 00868 00869 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00870 { 00871 cleanup(); 00872 SG_UNREF(alphabet); 00873 00874 alphabet=alpha; 00875 SG_REF(alphabet); 00876 00877 // TODO remove copying 00878 features = SG_MALLOC(SGString<ST>,p_num_vectors); 00879 memcpy(features,p_features,sizeof(SGString<ST>)*p_num_vectors); 00880 num_vectors = p_num_vectors; 00881 max_string_length = p_max_string_length; 00882 00883 return true; 00884 } 00885 else 00886 SG_UNREF(alpha); 00887 } 00888 00889 return false; 00890 } 00891 00892 template<class ST> bool CStringFeatures<ST>::append_features(CStringFeatures<ST>* sf) 00893 { 00894 ASSERT(sf) 00895 00896 if (m_subset_stack->has_subsets()) 00897 SG_ERROR("Cannot call set_features() with subset.\n") 00898 00899 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors()); 00900 00901 index_t sf_num_str=sf->get_num_vectors(); 00902 for (int32_t i=0; i<sf_num_str; i++) 00903 { 00904 int32_t real_i = sf->m_subset_stack->subset_idx_conversion(i); 00905 int32_t length=sf->features[real_i].slen; 00906 new_features[i].string=SG_MALLOC(ST, length); 00907 memcpy(new_features[i].string, sf->features[real_i].string, length); 00908 new_features[i].slen=length; 00909 } 00910 return append_features(new_features, sf_num_str, 00911 sf->max_string_length); 00912 } 00913 00914 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 00915 { 00916 if (m_subset_stack->has_subsets()) 00917 SG_ERROR("Cannot call set_features() with subset.\n") 00918 00919 if (!features) 00920 return set_features(p_features, p_num_vectors, p_max_string_length); 00921 00922 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00923 00924 //compute histogram for char/byte 00925 for (int32_t i=0; i<p_num_vectors; i++) 00926 alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00927 00928 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()) 00929 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()) 00930 00931 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00932 { 00933 SG_UNREF(alpha); 00934 for (int32_t i=0; i<p_num_vectors; i++) 00935 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen); 00936 00937 int32_t old_num_vectors=num_vectors; 00938 num_vectors=old_num_vectors+p_num_vectors; 00939 SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors); 00940 00941 for (int32_t i=0; i<num_vectors; i++) 00942 { 00943 if (i<old_num_vectors) 00944 { 00945 new_features[i].string=features[i].string; 00946 new_features[i].slen=features[i].slen; 00947 } 00948 else 00949 { 00950 new_features[i].string=p_features[i-old_num_vectors].string; 00951 new_features[i].slen=p_features[i-old_num_vectors].slen; 00952 } 00953 } 00954 SG_FREE(features); 00955 SG_FREE(p_features); // free now obsolete features 00956 00957 this->features=new_features; 00958 max_string_length=CMath::max(max_string_length, p_max_string_length); 00959 00960 return true; 00961 } 00962 SG_UNREF(alpha); 00963 00964 return false; 00965 } 00966 00967 template<class ST> SGStringList<ST> CStringFeatures<ST>::get_features() 00968 { 00969 SGStringList<ST> sl(NULL,0,0,false); 00970 00971 sl.strings=get_features(sl.num_strings, sl.max_string_length); 00972 return sl; 00973 } 00974 00975 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len) 00976 { 00977 if (m_subset_stack->has_subsets()) 00978 SG_ERROR("get features() is not possible on subset") 00979 00980 num_str=num_vectors; 00981 max_str_len=max_string_length; 00982 return features; 00983 } 00984 00985 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len) 00986 { 00987 ASSERT(num_vectors>0) 00988 00989 num_str=get_num_vectors(); 00990 max_str_len=max_string_length; 00991 SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str); 00992 00993 for (int32_t i=0; i<num_str; i++) 00994 { 00995 int32_t len; 00996 bool free_vec; 00997 ST* vec=get_feature_vector(i, len, free_vec); 00998 new_feat[i].string=SG_MALLOC(ST, len); 00999 new_feat[i].slen=len; 01000 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST)); 01001 free_feature_vector(vec, i, free_vec); 01002 } 01003 01004 return new_feat; 01005 } 01006 01007 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str) 01008 { 01009 int32_t num_vec; 01010 int32_t max_str_len; 01011 *dst=copy_features(num_vec, max_str_len); 01012 *num_str=num_vec; 01013 } 01014 01015 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress) 01016 { 01017 remove_all_subsets(); 01018 01019 FILE* file=NULL; 01020 01021 if (!(file=fopen(src, "r"))) 01022 return false; 01023 cleanup(); 01024 01025 // header shogun v0 01026 char id[4]; 01027 if (fread(&id[0], sizeof(char), 1, file)!=1) 01028 SG_ERROR("failed to read header") 01029 ASSERT(id[0]=='S') 01030 if (fread(&id[1], sizeof(char), 1, file)!=1) 01031 SG_ERROR("failed to read header") 01032 ASSERT(id[1]=='G') 01033 if (fread(&id[2], sizeof(char), 1, file)!=1) 01034 SG_ERROR("failed to read header") 01035 ASSERT(id[2]=='V') 01036 if (fread(&id[3], sizeof(char), 1, file)!=1) 01037 SG_ERROR("failed to read header") 01038 ASSERT(id[3]=='0') 01039 01040 //compression type 01041 uint8_t c; 01042 if (fread(&c, sizeof(uint8_t), 1, file)!=1) 01043 SG_ERROR("failed to read compression type") 01044 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c); 01045 //alphabet 01046 uint8_t a; 01047 delete alphabet; 01048 if (fread(&a, sizeof(uint8_t), 1, file)!=1) 01049 SG_ERROR("failed to read compression alphabet") 01050 alphabet=new CAlphabet((EAlphabet) a); 01051 // number of vectors 01052 if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1) 01053 SG_ERROR("failed to read compression number of vectors") 01054 ASSERT(num_vectors>0) 01055 // maximum string length 01056 if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1) 01057 SG_ERROR("failed to read maximum string length") 01058 ASSERT(max_string_length>0) 01059 01060 features=SG_MALLOC(SGString<ST>, num_vectors); 01061 01062 // vectors 01063 for (int32_t i=0; i<num_vectors; i++) 01064 { 01065 // vector len compressed 01066 int32_t len_compressed; 01067 if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1) 01068 SG_ERROR("failed to read vector length compressed") 01069 // vector len uncompressed 01070 int32_t len_uncompressed; 01071 if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1) 01072 SG_ERROR("failed to read vector length uncompressed") 01073 01074 // vector raw data 01075 if (decompress) 01076 { 01077 features[i].string=SG_MALLOC(ST, len_uncompressed); 01078 features[i].slen=len_uncompressed; 01079 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed); 01080 if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed) 01081 SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed) 01082 uint64_t uncompressed_size=len_uncompressed; 01083 uncompressed_size*=sizeof(ST); 01084 compressor->decompress(compressed, len_compressed, 01085 (uint8_t*) features[i].string, uncompressed_size); 01086 SG_FREE(compressed); 01087 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST)) 01088 } 01089 else 01090 { 01091 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST)); 01092 features[i].string=SG_MALLOC(ST, len_compressed+offs); 01093 features[i].slen=len_compressed+offs; 01094 int32_t* feat32ptr=((int32_t*) (features[i].string)); 01095 memset(features[i].string, 0, offs*sizeof(ST)); 01096 feat32ptr[0]=(int32_t) len_compressed; 01097 feat32ptr[1]=(int32_t) len_uncompressed; 01098 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]); 01099 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed) 01100 SG_ERROR("failed to read uncompressed data") 01101 } 01102 } 01103 01104 delete compressor; 01105 fclose(file); 01106 01107 return false; 01108 } 01109 01110 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level) 01111 { 01112 if (m_subset_stack->has_subsets()) 01113 SG_ERROR("save_compressed() is not possible on subset") 01114 01115 FILE* file=NULL; 01116 01117 if (!(file=fopen(dest, "wb"))) 01118 return false; 01119 01120 CCompressor* compressor= new CCompressor(compression); 01121 01122 // header shogun v0 01123 const char* id="SGV0"; 01124 fwrite(&id[0], sizeof(char), 1, file); 01125 fwrite(&id[1], sizeof(char), 1, file); 01126 fwrite(&id[2], sizeof(char), 1, file); 01127 fwrite(&id[3], sizeof(char), 1, file); 01128 01129 //compression type 01130 uint8_t c=(uint8_t) compression; 01131 fwrite(&c, sizeof(uint8_t), 1, file); 01132 //alphabet 01133 uint8_t a=(uint8_t) alphabet->get_alphabet(); 01134 fwrite(&a, sizeof(uint8_t), 1, file); 01135 // number of vectors 01136 fwrite(&num_vectors, sizeof(int32_t), 1, file); 01137 // maximum string length 01138 fwrite(&max_string_length, sizeof(int32_t), 1, file); 01139 01140 // vectors 01141 for (int32_t i=0; i<num_vectors; i++) 01142 { 01143 int32_t len=-1; 01144 bool vfree; 01145 ST* vec=get_feature_vector(i, len, vfree); 01146 01147 uint8_t* compressed=NULL; 01148 uint64_t compressed_size=0; 01149 01150 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST), 01151 compressed, compressed_size, level); 01152 01153 int32_t len_compressed=(int32_t) compressed_size; 01154 // vector len compressed in bytes 01155 fwrite(&len_compressed, sizeof(int32_t), 1, file); 01156 // vector len uncompressed in number of elements of type ST 01157 fwrite(&len, sizeof(int32_t), 1, file); 01158 // vector raw data 01159 fwrite(compressed, compressed_size, 1, file); 01160 SG_FREE(compressed); 01161 01162 free_feature_vector(vec, i, vfree); 01163 } 01164 01165 delete compressor; 01166 fclose(file); 01167 return true; 01168 } 01169 01170 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing) 01171 { 01172 SG_DEBUG("force: %d\n", force_preprocessing) 01173 01174 for (int32_t i=0; i<get_num_preprocessors(); i++) 01175 { 01176 if ( (!is_preprocessed(i) || force_preprocessing) ) 01177 { 01178 set_preprocessed(i); 01179 CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i); 01180 SG_INFO("preprocessing using preproc %s\n", p->get_name()) 01181 01182 if (!p->apply_to_string_features(this)) 01183 { 01184 SG_UNREF(p); 01185 return false; 01186 } 01187 else 01188 SG_UNREF(p); 01189 } 01190 } 01191 return true; 01192 } 01193 01194 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip) 01195 { 01196 if (m_subset_stack->has_subsets()) 01197 SG_NOTIMPLEMENTED 01198 01199 ASSERT(step_size>0) 01200 ASSERT(window_size>0) 01201 ASSERT(num_vectors==1 || single_string) 01202 ASSERT(max_string_length>=window_size || 01203 (single_string && length_of_single_string>=window_size)); 01204 01205 //in case we are dealing with a single remapped string 01206 //allow remapping 01207 if (single_string) 01208 num_vectors= (length_of_single_string-window_size)/step_size + 1; 01209 else if (num_vectors==1) 01210 { 01211 num_vectors= (max_string_length-window_size)/step_size + 1; 01212 length_of_single_string=max_string_length; 01213 } 01214 01215 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors); 01216 int32_t offs=0; 01217 for (int32_t i=0; i<num_vectors; i++) 01218 { 01219 f[i].string=&features[0].string[offs+skip]; 01220 f[i].slen=window_size-skip; 01221 offs+=step_size; 01222 } 01223 single_string=features[0].string; 01224 SG_FREE(features); 01225 features=f; 01226 max_string_length=window_size-skip; 01227 01228 return num_vectors; 01229 } 01230 01231 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, 01232 int32_t skip) 01233 { 01234 if (m_subset_stack->has_subsets()) 01235 SG_NOTIMPLEMENTED 01236 01237 ASSERT(positions) 01238 ASSERT(window_size>0) 01239 ASSERT(num_vectors==1 || single_string) 01240 ASSERT(max_string_length>=window_size || 01241 (single_string && length_of_single_string>=window_size)); 01242 01243 num_vectors= positions->get_num_elements(); 01244 ASSERT(num_vectors>0) 01245 01246 int32_t len; 01247 01248 //in case we are dealing with a single remapped string 01249 //allow remapping 01250 if (single_string) 01251 len=length_of_single_string; 01252 else 01253 { 01254 single_string=features[0].string; 01255 len=max_string_length; 01256 length_of_single_string=max_string_length; 01257 } 01258 01259 SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors); 01260 for (int32_t i=0; i<num_vectors; i++) 01261 { 01262 int32_t p=positions->get_element(i); 01263 01264 if (p>=0 && p<=len-window_size) 01265 { 01266 f[i].string=&features[0].string[p+skip]; 01267 f[i].slen=window_size-skip; 01268 } 01269 else 01270 { 01271 num_vectors=1; 01272 max_string_length=len; 01273 features[0].slen=len; 01274 single_string=NULL; 01275 SG_FREE(f); 01276 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n", 01277 window_size, i, p, len); 01278 return -1; 01279 } 01280 } 01281 01282 SG_FREE(features); 01283 features=f; 01284 max_string_length=window_size-skip; 01285 01286 return num_vectors; 01287 } 01288 01289 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01290 { 01291 return obtain_from_char_features(sf, start, p_order, gap, rev); 01292 } 01293 01294 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len) 01295 { 01296 if (len!=-1) 01297 { 01298 if (len!=max_string_length) 01299 return false; 01300 } 01301 len=max_string_length; 01302 01303 index_t num_str=get_num_vectors(); 01304 for (int32_t i=0; i<num_str; i++) 01305 { 01306 if (get_vector_length(i)!=len) 01307 return false; 01308 } 01309 01310 return true; 01311 } 01312 01313 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order) 01314 { 01315 if (m_subset_stack->has_subsets()) 01316 SG_NOTIMPLEMENTED 01317 01318 ASSERT(alphabet->get_num_symbols_in_histogram() > 0) 01319 01320 order=p_order; 01321 original_num_symbols=alphabet->get_num_symbols(); 01322 int32_t max_val=alphabet->get_num_bits(); 01323 01324 if (p_order>1) 01325 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01326 else 01327 num_symbols=original_num_symbols; 01328 01329 SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols) 01330 01331 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01332 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val) 01333 01334 ST mask=0; 01335 for (int32_t i=0; i<p_order*max_val; i++) 01336 mask= (mask<<1) | ((ST) 1); 01337 01338 for (int32_t i=0; i<num_vectors; i++) 01339 { 01340 int32_t len=features[i].slen; 01341 01342 if (len < p_order) 01343 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order) 01344 01345 ST* str=features[i].string; 01346 01347 // convert first word 01348 for (int32_t j=0; j<p_order; j++) 01349 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01350 str[0]=embed_word(&str[0], p_order); 01351 01352 // convert the rest 01353 int32_t idx=0; 01354 for (int32_t j=p_order; j<len; j++) 01355 { 01356 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01357 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask; 01358 idx++; 01359 } 01360 01361 features[i].slen=len-p_order+1; 01362 } 01363 01364 compute_symbol_mask_table(max_val); 01365 } 01366 01367 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val) 01368 { 01369 if (m_subset_stack->has_subsets()) 01370 SG_NOTIMPLEMENTED 01371 01372 SG_FREE(symbol_mask_table); 01373 symbol_mask_table=SG_MALLOC(ST, 256); 01374 symbol_mask_table_len=256; 01375 01376 uint64_t mask=0; 01377 for (int32_t i=0; i< (int64_t) max_val; i++) 01378 mask=(mask<<1) | 1; 01379 01380 for (int32_t i=0; i<256; i++) 01381 { 01382 uint8_t bits=(uint8_t) i; 01383 symbol_mask_table[i]=0; 01384 01385 for (int32_t j=0; j<8; j++) 01386 { 01387 if (bits & 1) 01388 symbol_mask_table[i]|=mask<<(max_val*j); 01389 01390 bits>>=1; 01391 } 01392 } 01393 } 01394 01395 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len) 01396 { 01397 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01398 01399 ST mask=0; 01400 for (uint32_t i=0; i<nbits; i++) 01401 mask=(mask<<1) | (ST) 1; 01402 01403 for (int32_t i=0; i<len; i++) 01404 { 01405 ST w=(word & mask); 01406 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w); 01407 word>>=nbits; 01408 } 01409 } 01410 01411 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len) 01412 { 01413 ST value=(ST) 0; 01414 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01415 for (int32_t i=0; i<len; i++) 01416 { 01417 value<<=nbits; 01418 value|=seq[i]; 01419 } 01420 01421 return value; 01422 } 01423 01424 template<class ST> void CStringFeatures<ST>::determine_maximum_string_length() 01425 { 01426 max_string_length=0; 01427 index_t num_str=get_num_vectors(); 01428 01429 for (int32_t i=0; i<num_str; i++) 01430 { 01431 max_string_length=CMath::max(max_string_length, 01432 features[m_subset_stack->subset_idx_conversion(i)].slen); 01433 } 01434 } 01435 01436 template<class ST> ST* CStringFeatures<ST>::get_zero_terminated_string_copy(SGString<ST> str) 01437 { 01438 int32_t l=str.slen; 01439 ST* s=SG_MALLOC(ST, l+1); 01440 memcpy(s, str.string, sizeof(ST)*l); 01441 s[l]='\0'; 01442 return s; 01443 } 01444 01445 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len) 01446 { 01447 ASSERT(features) 01448 ASSERT(num<get_num_vectors()) 01449 01450 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 01451 01452 01453 features[real_num].slen=len ; 01454 features[real_num].string=string ; 01455 01456 max_string_length=CMath::max(len, max_string_length); 01457 } 01458 01459 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize) 01460 { 01461 int32_t nsym=get_num_symbols(); 01462 int32_t slen=get_max_vector_length(); 01463 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t); 01464 float64_t* h= SG_MALLOC(float64_t, sz); 01465 memset(h, 0, sz); 01466 01467 float64_t* h_normalizer=SG_MALLOC(float64_t, slen); 01468 memset(h_normalizer, 0, slen*sizeof(float64_t)); 01469 int32_t num_str=get_num_vectors(); 01470 for (int32_t i=0; i<num_str; i++) 01471 { 01472 int32_t len; 01473 bool free_vec; 01474 ST* vec=get_feature_vector(i, len, free_vec); 01475 for (int32_t j=0; j<len; j++) 01476 { 01477 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++; 01478 h_normalizer[j]++; 01479 } 01480 free_feature_vector(vec, i, free_vec); 01481 } 01482 01483 if (normalize) 01484 { 01485 for (int32_t i=0; i<slen; i++) 01486 { 01487 for (int32_t j=0; j<nsym; j++) 01488 { 01489 if (h_normalizer && h_normalizer[i]) 01490 h[int64_t(i)*nsym+j]/=h_normalizer[i]; 01491 } 01492 } 01493 } 01494 SG_FREE(h_normalizer); 01495 01496 *hist=h; 01497 *rows=nsym; 01498 *cols=slen; 01499 } 01500 01501 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec) 01502 { 01503 ASSERT(rows == get_num_symbols()) 01504 cleanup(); 01505 float64_t* randoms=SG_MALLOC(float64_t, cols); 01506 SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec); 01507 01508 for (int32_t i=0; i<num_vec; i++) 01509 { 01510 sf[i].string=SG_MALLOC(ST, cols); 01511 sf[i].slen=cols; 01512 01513 SGVector<float64_t>::random_vector(randoms, cols, 0.0, 1.0); 01514 01515 for (int32_t j=0; j<cols; j++) 01516 { 01517 float64_t lik=hist[int64_t(j)*rows+0]; 01518 01519 int32_t c; 01520 for (c=0; c<rows-1; c++) 01521 { 01522 if (randoms[j]<=lik) 01523 break; 01524 lik+=hist[int64_t(j)*rows+c+1]; 01525 } 01526 sf[i].string[j]=alphabet->remap_to_char(c); 01527 } 01528 } 01529 SG_FREE(randoms); 01530 set_features(sf, num_vec, cols); 01531 } 01532 01533 /* 01534 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2) 01535 { 01536 int *s; 01537 int32_t nStr=get_num_vectors(); 01538 01539 int32_t nfeat=0; 01540 for (int32_t i=0; i < nStr; ++i) 01541 nfeat += get_vector_length[i] - d1 -d2; 01542 SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat); 01543 int32_t c=0; 01544 for (int32_t i=0; i < nStr; ++i) 01545 { 01546 int32_t len; 01547 bool free_vec; 01548 ST* S=get_feature_vector(vec_num, len, free_vec); 01549 free_feature_vector(vec, vec_num, free_vec); 01550 int32_t n=len - d1 - d2; 01551 s=S[i]; 01552 for (int32_t j=0; j < n; ++j) 01553 { 01554 F[c].feature1=s[j]; 01555 F[c].feature2=s[j+d1]; 01556 F[c].feature3=s[j+d1+d2]; 01557 F[c].group=i; 01558 c++; 01559 } 01560 } 01561 ASSERT(nfeat==c) 01562 return F; 01563 } 01564 01565 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1) 01566 { 01567 int i, j; 01568 int n, nfeat; 01569 int *group; 01570 int *features; 01571 int *s; 01572 int c; 01573 SSKFeatures *F; 01574 01575 nfeat=0; 01576 for (i=0; i < nStr; ++i) 01577 nfeat += len[i] - d1; 01578 group=(int *)SG_MALLOC(nfeat*sizeof(int)); 01579 features=(int *)SG_MALLOC(nfeat*2*sizeof(int *)); 01580 c=0; 01581 for (i=0; i < nStr; ++i) 01582 { 01583 n=len[i] - d1; 01584 s=S[i]; 01585 for (j=0; j < n; ++j) 01586 { 01587 features[c]=s[j]; 01588 features[c+nfeat]=s[j+d1]; 01589 group[c]=i; 01590 c++; 01591 } 01592 } 01593 if (nfeat!=c) 01594 printf("Something is wrong...\n"); 01595 F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures)); 01596 (*F).features=features; 01597 (*F).group=group; 01598 (*F).n=nfeat; 01599 return F; 01600 } 01601 */ 01602 01603 template<class ST> CFeatures* CStringFeatures<ST>::copy_subset( 01604 SGVector<index_t> indices) 01605 { 01606 /* string list to create new CStringFeatures from */ 01607 SGStringList<ST> list_copy(indices.vlen, max_string_length); 01608 01609 /* copy all features */ 01610 for (index_t i=0; i<indices.vlen; ++i) 01611 { 01612 /* index with respect to possible subset */ 01613 index_t real_idx=m_subset_stack->subset_idx_conversion(indices.vector[i]); 01614 01615 /* copy string */ 01616 SGString<ST> current_string=features[real_idx]; 01617 SGString<ST> string_copy(current_string.slen); 01618 memcpy(string_copy.string, current_string.string, 01619 current_string.slen*sizeof(ST)); 01620 list_copy.strings[i]=string_copy; 01621 } 01622 01623 /* create copy instance */ 01624 CStringFeatures* result=new CStringFeatures(list_copy, alphabet); 01625 01626 /* max string length may have changed */ 01627 result->determine_maximum_string_length(); 01628 01629 /* keep things from original features (otherwise assertions in x-val) */ 01630 result->order=order; 01631 result->compute_symbol_mask_table(result->alphabet->get_num_symbols()); 01632 01633 SG_REF(result); 01634 01635 return result; 01636 } 01637 01638 template<class ST> void CStringFeatures<ST>::subset_changed_post() 01639 { 01640 /* max string length has to be updated */ 01641 determine_maximum_string_length(); 01642 } 01643 01644 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len) 01645 { 01646 ASSERT(features && num<get_num_vectors()) 01647 01648 int32_t real_num=m_subset_stack->subset_idx_conversion(num); 01649 01650 len=features[real_num].slen; 01651 if (len<=0) 01652 return NULL; 01653 01654 ST* target=SG_MALLOC(ST, len); 01655 memcpy(target, features[real_num].string, len*sizeof(ST)); 01656 return target; 01657 } 01658 01659 template<class ST> void CStringFeatures<ST>::init() 01660 { 01661 set_generic<ST>(); 01662 01663 alphabet=NULL; 01664 num_vectors=0; 01665 features=NULL; 01666 single_string=NULL; 01667 length_of_single_string=0; 01668 max_string_length=0; 01669 order=0; 01670 preprocess_on_get=false; 01671 feature_cache=NULL; 01672 symbol_mask_table=NULL; 01673 symbol_mask_table_len=0; 01674 num_symbols=0.0; 01675 original_num_symbols=0; 01676 01677 m_parameters->add((CSGObject**) &alphabet, "alphabet"); 01678 m_parameters->add_vector(&features, &num_vectors, "features", 01679 "This contains the array of features."); 01680 m_parameters->add_vector(&single_string, 01681 &length_of_single_string, 01682 "single_string", 01683 "Created by sliding window."); 01684 m_parameters->add(&max_string_length, "max_string_length", 01685 "Length of longest string."); 01686 m_parameters->add(&num_symbols, "num_symbols", 01687 "Number of used symbols."); 01688 m_parameters->add(&original_num_symbols, "original_num_symbols", 01689 "Original number of used symbols."); 01690 m_parameters->add(&order, "order", 01691 "Order used in higher order mapping."); 01692 m_parameters->add(&preprocess_on_get, "preprocess_on_get", 01693 "Preprocess on-the-fly?"); 01694 01695 m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len, "mask_table", "Symbol mask table - using in higher order mapping"); 01696 } 01697 01702 template<> EFeatureType CStringFeatures<bool>::get_feature_type() const 01703 { 01704 return F_BOOL; 01705 } 01706 01711 template<> EFeatureType CStringFeatures<char>::get_feature_type() const 01712 { 01713 return F_CHAR; 01714 } 01715 01720 template<> EFeatureType CStringFeatures<uint8_t>::get_feature_type() const 01721 { 01722 return F_BYTE; 01723 } 01724 01729 template<> EFeatureType CStringFeatures<int16_t>::get_feature_type() const 01730 { 01731 return F_SHORT; 01732 } 01733 01738 template<> EFeatureType CStringFeatures<uint16_t>::get_feature_type() const 01739 { 01740 return F_WORD; 01741 } 01742 01747 template<> EFeatureType CStringFeatures<int32_t>::get_feature_type() const 01748 { 01749 return F_INT; 01750 } 01751 01756 template<> EFeatureType CStringFeatures<uint32_t>::get_feature_type() const 01757 { 01758 return F_UINT; 01759 } 01760 01765 template<> EFeatureType CStringFeatures<int64_t>::get_feature_type() const 01766 { 01767 return F_LONG; 01768 } 01769 01774 template<> EFeatureType CStringFeatures<uint64_t>::get_feature_type() const 01775 { 01776 return F_ULONG; 01777 } 01778 01783 template<> EFeatureType CStringFeatures<float32_t>::get_feature_type() const 01784 { 01785 return F_SHORTREAL; 01786 } 01787 01792 template<> EFeatureType CStringFeatures<float64_t>::get_feature_type() const 01793 { 01794 return F_DREAL; 01795 } 01796 01801 template<> EFeatureType CStringFeatures<floatmax_t>::get_feature_type() const 01802 { 01803 return F_LONGREAL; 01804 } 01805 01806 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask) 01807 { 01808 return symbol; 01809 } 01810 template<> float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask) 01811 { 01812 return symbol; 01813 } 01814 template<> float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask) 01815 { 01816 return symbol; 01817 } 01818 template<> floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask) 01819 { 01820 return symbol; 01821 } 01822 01823 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount) 01824 { 01825 return false; 01826 } 01827 template<> float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount) 01828 { 01829 return 0; 01830 } 01831 template<> float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount) 01832 { 01833 return 0; 01834 } 01835 template<> floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount) 01836 { 01837 return 0; 01838 } 01839 01840 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount) 01841 { 01842 return symbol; 01843 } 01844 template<> float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount) 01845 { 01846 return symbol; 01847 } 01848 template<> float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount) 01849 { 01850 return symbol; 01851 } 01852 template<> floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount) 01853 { 01854 return symbol; 01855 } 01856 01857 #ifndef SUNOS 01858 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01859 { 01860 return false; 01861 } 01862 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01863 { 01864 return false; 01865 } 01866 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01867 { 01868 return false; 01869 } 01870 #endif 01871 01872 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order) 01873 { 01874 } 01875 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order) 01876 { 01877 } 01878 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order) 01879 { 01880 } 01881 01882 template<> void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val) 01883 { 01884 } 01885 template<> void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val) 01886 { 01887 } 01888 template<> void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val) 01889 { 01890 } 01891 01892 template<> float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len) 01893 { 01894 return 0; 01895 } 01896 template<> float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len) 01897 { 01898 return 0; 01899 } 01900 template<> floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len) 01901 { 01902 return 0; 01903 } 01904 01905 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len) 01906 { 01907 } 01908 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len) 01909 { 01910 } 01911 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len) 01912 { 01913 } 01914 #define LOAD(f_load, sg_type) \ 01915 template<> void CStringFeatures<sg_type>::load(CFile* loader) \ 01916 { \ 01917 SG_INFO("loading...\n") \ 01918 \ 01919 SG_SET_LOCALE_C; \ 01920 SGString<sg_type>* strs; \ 01921 int32_t num_str; \ 01922 int32_t max_len; \ 01923 loader->f_load(strs, num_str, max_len); \ 01924 set_features(strs, num_str, max_len); \ 01925 SG_RESET_LOCALE; \ 01926 } 01927 01928 LOAD(get_string_list, bool) 01929 LOAD(get_string_list, char) 01930 LOAD(get_string_list, int8_t) 01931 LOAD(get_string_list, uint8_t) 01932 LOAD(get_string_list, int16_t) 01933 LOAD(get_string_list, uint16_t) 01934 LOAD(get_string_list, int32_t) 01935 LOAD(get_string_list, uint32_t) 01936 LOAD(get_string_list, int64_t) 01937 LOAD(get_string_list, uint64_t) 01938 LOAD(get_string_list, float32_t) 01939 LOAD(get_string_list, float64_t) 01940 LOAD(get_string_list, floatmax_t) 01941 #undef LOAD 01942 01943 #define SAVE(f_write, sg_type) \ 01944 template<> void CStringFeatures<sg_type>::save(CFile* writer) \ 01945 { \ 01946 if (m_subset_stack->has_subsets()) \ 01947 SG_ERROR("save() is not possible on subset") \ 01948 SG_SET_LOCALE_C; \ 01949 ASSERT(writer) \ 01950 writer->f_write(features, num_vectors); \ 01951 SG_RESET_LOCALE; \ 01952 } 01953 01954 SAVE(set_string_list, bool) 01955 SAVE(set_string_list, char) 01956 SAVE(set_string_list, int8_t) 01957 SAVE(set_string_list, uint8_t) 01958 SAVE(set_string_list, int16_t) 01959 SAVE(set_string_list, uint16_t) 01960 SAVE(set_string_list, int32_t) 01961 SAVE(set_string_list, uint32_t) 01962 SAVE(set_string_list, int64_t) 01963 SAVE(set_string_list, uint64_t) 01964 SAVE(set_string_list, float32_t) 01965 SAVE(set_string_list, float64_t) 01966 SAVE(set_string_list, floatmax_t) 01967 #undef SAVE 01968 01969 template <class ST> template <class CT> 01970 bool CStringFeatures<ST>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, 01971 int32_t p_order, int32_t gap, bool rev) 01972 { 01973 remove_all_subsets(); 01974 ASSERT(sf) 01975 01976 CAlphabet* alpha=sf->get_alphabet(); 01977 ASSERT(alpha->get_num_symbols_in_histogram() > 0) 01978 01979 this->order=p_order; 01980 cleanup(); 01981 01982 num_vectors=sf->get_num_vectors(); 01983 ASSERT(num_vectors>0) 01984 max_string_length=sf->get_max_vector_length()-start; 01985 features=SG_MALLOC(SGString<ST>, num_vectors); 01986 01987 SG_DEBUG("%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(), 01988 alpha->get_num_symbols_in_histogram()); 01989 01990 for (int32_t i=0; i<num_vectors; i++) 01991 { 01992 int32_t len=-1; 01993 bool vfree; 01994 CT* c=sf->get_feature_vector(i, len, vfree); 01995 ASSERT(!vfree) // won't work when preprocessors are attached 01996 01997 features[i].string=SG_MALLOC(ST, len); 01998 features[i].slen=len; 01999 02000 ST* str=features[i].string; 02001 for (int32_t j=0; j<len; j++) 02002 str[j]=(ST) alpha->remap_to_bin(c[j]); 02003 } 02004 02005 original_num_symbols=alpha->get_num_symbols(); 02006 int32_t max_val=alpha->get_num_bits(); 02007 02008 SG_UNREF(alpha); 02009 02010 if (p_order>1) 02011 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 02012 else 02013 num_symbols=original_num_symbols; 02014 SG_INFO("max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols) 02015 02016 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 02017 { 02018 SG_ERROR("symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val) 02019 return false; 02020 } 02021 02022 SG_DEBUG("translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) 02023 for (int32_t line=0; line<num_vectors; line++) 02024 { 02025 int32_t len=0; 02026 bool vfree; 02027 ST* fv=get_feature_vector(line, len, vfree); 02028 ASSERT(!vfree) // won't work when preprocessors are attached 02029 02030 if (rev) 02031 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap); 02032 else 02033 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap); 02034 02035 /* fix the length of the string -- hacky */ 02036 features[line].slen-=start+gap ; 02037 if (features[line].slen<0) 02038 features[line].slen=0 ; 02039 } 02040 02041 compute_symbol_mask_table(max_val); 02042 02043 return true; 02044 } 02045 02046 template class CStringFeatures<bool>; 02047 template class CStringFeatures<char>; 02048 template class CStringFeatures<int8_t>; 02049 template class CStringFeatures<uint8_t>; 02050 template class CStringFeatures<int16_t>; 02051 template class CStringFeatures<uint16_t>; 02052 template class CStringFeatures<int32_t>; 02053 template class CStringFeatures<uint32_t>; 02054 template class CStringFeatures<int64_t>; 02055 template class CStringFeatures<uint64_t>; 02056 template class CStringFeatures<float32_t>; 02057 template class CStringFeatures<float64_t>; 02058 template class CStringFeatures<floatmax_t>; 02059 02060 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02061 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02062 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02063 02064 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02065 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02066 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev); 02067 }