SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Soeren Sonnenburg 00008 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <string.h> 00012 #include <math.h> 00013 #include <ctype.h> 00014 00015 #include <shogun/features/Alphabet.h> 00016 #include <shogun/io/SGIO.h> 00017 #include <shogun/base/Parameter.h> 00018 00019 using namespace shogun; 00020 00021 //define numbers for the bases 00022 const uint8_t CAlphabet::B_A=0; 00023 const uint8_t CAlphabet::B_C=1; 00024 const uint8_t CAlphabet::B_G=2; 00025 const uint8_t CAlphabet::B_T=3; 00026 const uint8_t CAlphabet::B_0=4; 00027 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff; 00028 const char* CAlphabet::alphabet_names[18]={ 00029 "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM", 00030 "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID", 00031 "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN", 00032 "SNP", "RAWSNP"}; 00033 00034 00035 CAlphabet::CAlphabet() 00036 : CSGObject() 00037 { 00038 init(); 00039 } 00040 00041 CAlphabet::CAlphabet(char* al, int32_t len) 00042 : CSGObject() 00043 { 00044 init(); 00045 00046 EAlphabet alpha=NONE; 00047 00048 if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA"))) 00049 alpha = DNA; 00050 else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA"))) 00051 alpha = RAWDNA; 00052 else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA"))) 00053 alpha = RNA; 00054 else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN"))) 00055 alpha = PROTEIN; 00056 else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY"))) 00057 alpha = BINARY; 00058 else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM"))) 00059 alpha = ALPHANUM; 00060 else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE"))) 00061 alpha = CUBE; 00062 else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2"))) 00063 alpha = DIGIT2; 00064 else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT"))) 00065 alpha = DIGIT; 00066 else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2"))) 00067 alpha = RAWDIGIT2; 00068 else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT"))) 00069 alpha = RAWDIGIT; 00070 else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP"))) 00071 alpha = SNP; 00072 else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP"))) 00073 alpha = RAWSNP; 00074 else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) || 00075 (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW")))) 00076 alpha = RAWBYTE; 00077 else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID"))) 00078 alpha = IUPAC_NUCLEIC_ACID; 00079 else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID"))) 00080 alpha = IUPAC_AMINO_ACID; 00081 else { 00082 SG_ERROR("unknown alphabet %s\n", al) 00083 } 00084 00085 set_alphabet(alpha); 00086 } 00087 00088 CAlphabet::CAlphabet(EAlphabet alpha) 00089 : CSGObject() 00090 { 00091 init(); 00092 set_alphabet(alpha); 00093 } 00094 00095 CAlphabet::CAlphabet(CAlphabet* a) 00096 : CSGObject() 00097 { 00098 init(); 00099 REQUIRE(a, "No Alphabet specified!\n"); 00100 set_alphabet(a->get_alphabet()); 00101 copy_histogram(a); 00102 } 00103 00104 CAlphabet::~CAlphabet() 00105 { 00106 } 00107 00108 bool CAlphabet::set_alphabet(EAlphabet alpha) 00109 { 00110 bool result=true; 00111 alphabet=alpha; 00112 00113 switch (alphabet) 00114 { 00115 case DNA: 00116 case RAWDNA: 00117 num_symbols = 4; 00118 break; 00119 case RNA: 00120 num_symbols = 4; 00121 break; 00122 case PROTEIN: 00123 num_symbols = 26; 00124 break; 00125 case BINARY: 00126 num_symbols = 2; 00127 break; 00128 case ALPHANUM: 00129 num_symbols = 36; 00130 break; 00131 case CUBE: 00132 num_symbols = 6; 00133 break; 00134 case RAWBYTE: 00135 num_symbols = 256; 00136 break; 00137 case IUPAC_NUCLEIC_ACID: 00138 num_symbols = 16; 00139 break; 00140 case IUPAC_AMINO_ACID: 00141 num_symbols = 23; 00142 break; 00143 case NONE: 00144 num_symbols = 0; 00145 break; 00146 case DIGIT2: 00147 num_symbols = 3; 00148 break; 00149 case DIGIT: 00150 num_symbols = 10; 00151 break; 00152 case RAWDIGIT2: 00153 num_symbols = 3; 00154 break; 00155 case RAWDIGIT: 00156 num_symbols = 10; 00157 break; 00158 case SNP: 00159 num_symbols = 5; 00160 break; 00161 case RAWSNP: 00162 num_symbols = 5; 00163 break; 00164 default: 00165 num_symbols = 0; 00166 result=false; 00167 break; 00168 } 00169 00170 num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2)); 00171 init_map_table(); 00172 clear_histogram(); 00173 00174 SG_DEBUG("initialised alphabet %s\n", get_alphabet_name(alphabet)) 00175 00176 return result; 00177 } 00178 00179 void CAlphabet::init_map_table() 00180 { 00181 for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++) 00182 { 00183 maptable_to_bin[i] = MAPTABLE_UNDEF; 00184 maptable_to_char[i] = MAPTABLE_UNDEF; 00185 valid_chars[i] = false; 00186 } 00187 00188 switch (alphabet) 00189 { 00190 case RAWDIGIT: 00191 for (uint8_t i=0; i<=9; i++) 00192 { 00193 valid_chars[i]=true; 00194 maptable_to_bin[i]=i; 00195 maptable_to_char[i]=i; 00196 } 00197 break; 00198 00199 case RAWDIGIT2: 00200 for (uint8_t i=0; i<=2; i++) 00201 { 00202 valid_chars[i]=true; 00203 maptable_to_bin[i]=i; 00204 maptable_to_char[i]=i; 00205 } 00206 break; 00207 00208 case DIGIT: 00209 valid_chars[(uint8_t) '0']=true; 00210 valid_chars[(uint8_t) '1']=true; 00211 valid_chars[(uint8_t) '2']=true; 00212 valid_chars[(uint8_t) '3']=true; 00213 valid_chars[(uint8_t) '4']=true; 00214 valid_chars[(uint8_t) '5']=true; 00215 valid_chars[(uint8_t) '6']=true; 00216 valid_chars[(uint8_t) '7']=true; 00217 valid_chars[(uint8_t) '8']=true; 00218 valid_chars[(uint8_t) '9']=true; //Translation '0-9' -> 0-9 00219 00220 maptable_to_bin[(uint8_t) '0']=0; 00221 maptable_to_bin[(uint8_t) '1']=1; 00222 maptable_to_bin[(uint8_t) '2']=2; 00223 maptable_to_bin[(uint8_t) '3']=3; 00224 maptable_to_bin[(uint8_t) '4']=4; 00225 maptable_to_bin[(uint8_t) '5']=5; 00226 maptable_to_bin[(uint8_t) '6']=6; 00227 maptable_to_bin[(uint8_t) '7']=7; 00228 maptable_to_bin[(uint8_t) '8']=8; 00229 maptable_to_bin[(uint8_t) '9']=9; //Translation '0-9' -> 0-9 00230 00231 maptable_to_char[(uint8_t) 0]='0'; 00232 maptable_to_char[(uint8_t) 1]='1'; 00233 maptable_to_char[(uint8_t) 2]='2'; 00234 maptable_to_char[(uint8_t) 3]='3'; 00235 maptable_to_char[(uint8_t) 4]='4'; 00236 maptable_to_char[(uint8_t) 5]='5'; 00237 maptable_to_char[(uint8_t) 6]='6'; 00238 maptable_to_char[(uint8_t) 7]='7'; 00239 maptable_to_char[(uint8_t) 8]='8'; 00240 maptable_to_char[(uint8_t) 9]='9'; //Translation 0-9 -> '0-9' 00241 break; 00242 00243 case DIGIT2: 00244 valid_chars[(uint8_t) '0']=true; 00245 valid_chars[(uint8_t) '1']=true; 00246 valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2 00247 00248 maptable_to_bin[(uint8_t) '0']=0; 00249 maptable_to_bin[(uint8_t) '1']=1; 00250 maptable_to_bin[(uint8_t) '2']=2; //Translation '0-2' -> 0-2 00251 00252 maptable_to_char[(uint8_t) 0]='0'; 00253 maptable_to_char[(uint8_t) 1]='1'; 00254 maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2' 00255 break; 00256 00257 case CUBE: 00258 valid_chars[(uint8_t) '1']=true; 00259 valid_chars[(uint8_t) '2']=true; 00260 valid_chars[(uint8_t) '3']=true; 00261 valid_chars[(uint8_t) '4']=true; 00262 valid_chars[(uint8_t) '5']=true; 00263 valid_chars[(uint8_t) '6']=true; //Translation '123456' -> 012345 00264 00265 maptable_to_bin[(uint8_t) '1']=0; 00266 maptable_to_bin[(uint8_t) '2']=1; 00267 maptable_to_bin[(uint8_t) '3']=2; 00268 maptable_to_bin[(uint8_t) '4']=3; 00269 maptable_to_bin[(uint8_t) '5']=4; 00270 maptable_to_bin[(uint8_t) '6']=5; //Translation '123456' -> 012345 00271 00272 maptable_to_char[(uint8_t) 0]='1'; 00273 maptable_to_char[(uint8_t) 1]='2'; 00274 maptable_to_char[(uint8_t) 2]='3'; 00275 maptable_to_char[(uint8_t) 3]='4'; 00276 maptable_to_char[(uint8_t) 4]='5'; 00277 maptable_to_char[(uint8_t) 5]='6'; //Translation 012345->'123456' 00278 break; 00279 00280 case PROTEIN: 00281 { 00282 int32_t skip=0 ; 00283 for (int32_t i=0; i<21; i++) 00284 { 00285 if (i==1) skip++ ; 00286 if (i==8) skip++ ; 00287 if (i==12) skip++ ; 00288 if (i==17) skip++ ; 00289 valid_chars['A'+i+skip]=true; 00290 maptable_to_bin['A'+i+skip]=i ; 00291 maptable_to_char[i]='A'+i+skip ; 00292 } ; //Translation 012345->acde...xy -- the protein code 00293 } ; 00294 break; 00295 00296 case BINARY: 00297 valid_chars[(uint8_t) '0']=true; 00298 valid_chars[(uint8_t) '1']=true; 00299 00300 maptable_to_bin[(uint8_t) '0']=0; 00301 maptable_to_bin[(uint8_t) '1']=1; 00302 00303 maptable_to_char[0]=(uint8_t) '0'; 00304 maptable_to_char[1]=(uint8_t) '1'; 00305 break; 00306 00307 case ALPHANUM: 00308 { 00309 for (int32_t i=0; i<26; i++) 00310 { 00311 valid_chars[(uint8_t) 'A'+i]=true; 00312 maptable_to_bin[(uint8_t) 'A'+i]=i ; 00313 maptable_to_char[i]='A'+i ; 00314 } ; 00315 for (int32_t i=0; i<10; i++) 00316 { 00317 valid_chars[(uint8_t) '0'+i]=true; 00318 maptable_to_bin[(uint8_t) '0'+i]=26+i ; 00319 maptable_to_char[26+i]='0'+i ; 00320 } ; //Translation 012345->acde...xy0123456789 00321 } ; 00322 break; 00323 00324 case RAWBYTE: 00325 { 00326 //identity 00327 for (int32_t i=0; i<256; i++) 00328 { 00329 valid_chars[i]=true; 00330 maptable_to_bin[i]=i; 00331 maptable_to_char[i]=i; 00332 } 00333 } 00334 break; 00335 00336 case DNA: 00337 valid_chars[(uint8_t) 'A']=true; 00338 valid_chars[(uint8_t) 'C']=true; 00339 valid_chars[(uint8_t) 'G']=true; 00340 valid_chars[(uint8_t) 'T']=true; 00341 00342 maptable_to_bin[(uint8_t) 'A']=B_A; 00343 maptable_to_bin[(uint8_t) 'C']=B_C; 00344 maptable_to_bin[(uint8_t) 'G']=B_G; 00345 maptable_to_bin[(uint8_t) 'T']=B_T; 00346 00347 maptable_to_char[B_A]='A'; 00348 maptable_to_char[B_C]='C'; 00349 maptable_to_char[B_G]='G'; 00350 maptable_to_char[B_T]='T'; 00351 break; 00352 case RAWDNA: 00353 { 00354 //identity 00355 for (int32_t i=0; i<4; i++) 00356 { 00357 valid_chars[i]=true; 00358 maptable_to_bin[i]=i; 00359 maptable_to_char[i]=i; 00360 } 00361 } 00362 break; 00363 00364 case SNP: 00365 valid_chars[(uint8_t) 'A']=true; 00366 valid_chars[(uint8_t) 'C']=true; 00367 valid_chars[(uint8_t) 'G']=true; 00368 valid_chars[(uint8_t) 'T']=true; 00369 valid_chars[(uint8_t) '0']=true; 00370 00371 maptable_to_bin[(uint8_t) 'A']=B_A; 00372 maptable_to_bin[(uint8_t) 'C']=B_C; 00373 maptable_to_bin[(uint8_t) 'G']=B_G; 00374 maptable_to_bin[(uint8_t) 'T']=B_T; 00375 maptable_to_bin[(uint8_t) '0']=B_0; 00376 00377 maptable_to_char[B_A]='A'; 00378 maptable_to_char[B_C]='C'; 00379 maptable_to_char[B_G]='G'; 00380 maptable_to_char[B_T]='T'; 00381 maptable_to_char[B_0]='0'; 00382 break; 00383 case RAWSNP: 00384 { 00385 //identity 00386 for (int32_t i=0; i<5; i++) 00387 { 00388 valid_chars[i]=true; 00389 maptable_to_bin[i]=i; 00390 maptable_to_char[i]=i; 00391 } 00392 } 00393 break; 00394 00395 case RNA: 00396 valid_chars[(uint8_t) 'A']=true; 00397 valid_chars[(uint8_t) 'C']=true; 00398 valid_chars[(uint8_t) 'G']=true; 00399 valid_chars[(uint8_t) 'U']=true; 00400 00401 maptable_to_bin[(uint8_t) 'A']=B_A; 00402 maptable_to_bin[(uint8_t) 'C']=B_C; 00403 maptable_to_bin[(uint8_t) 'G']=B_G; 00404 maptable_to_bin[(uint8_t) 'U']=B_T; 00405 00406 maptable_to_char[B_A]='A'; 00407 maptable_to_char[B_C]='C'; 00408 maptable_to_char[B_G]='G'; 00409 maptable_to_char[B_T]='U'; 00410 break; 00411 00412 case IUPAC_NUCLEIC_ACID: 00413 valid_chars[(uint8_t) 'A']=true; // A Adenine 00414 valid_chars[(uint8_t) 'C']=true; // C Cytosine 00415 valid_chars[(uint8_t) 'G']=true; // G Guanine 00416 valid_chars[(uint8_t) 'T']=true; // T Thymine 00417 valid_chars[(uint8_t) 'U']=true; // U Uracil 00418 valid_chars[(uint8_t) 'R']=true; // R Purine (A or G) 00419 valid_chars[(uint8_t) 'Y']=true; // Y Pyrimidine (C, T, or U) 00420 valid_chars[(uint8_t) 'M']=true; // M C or A 00421 valid_chars[(uint8_t) 'K']=true; // K T, U, or G 00422 valid_chars[(uint8_t) 'W']=true; // W T, U, or A 00423 valid_chars[(uint8_t) 'S']=true; // S C or G 00424 valid_chars[(uint8_t) 'B']=true; // B C, T, U, or G (not A) 00425 valid_chars[(uint8_t) 'D']=true; // D A, T, U, or G (not C) 00426 valid_chars[(uint8_t) 'H']=true; // H A, T, U, or C (not G) 00427 valid_chars[(uint8_t) 'V']=true; // V A, C, or G (not T, not U) 00428 valid_chars[(uint8_t) 'N']=true; // N Any base (A, C, G, T, or U) 00429 00430 maptable_to_bin[(uint8_t) 'A']=0; // A Adenine 00431 maptable_to_bin[(uint8_t) 'C']=1; // C Cytosine 00432 maptable_to_bin[(uint8_t) 'G']=2; // G Guanine 00433 maptable_to_bin[(uint8_t) 'T']=3; // T Thymine 00434 maptable_to_bin[(uint8_t) 'U']=4; // U Uracil 00435 maptable_to_bin[(uint8_t) 'R']=5; // R Purine (A or G) 00436 maptable_to_bin[(uint8_t) 'Y']=6; // Y Pyrimidine (C, T, or U) 00437 maptable_to_bin[(uint8_t) 'M']=7; // M C or A 00438 maptable_to_bin[(uint8_t) 'K']=8; // K T, U, or G 00439 maptable_to_bin[(uint8_t) 'W']=9; // W T, U, or A 00440 maptable_to_bin[(uint8_t) 'S']=10; // S C or G 00441 maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A) 00442 maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C) 00443 maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G) 00444 maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U) 00445 maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U) 00446 00447 maptable_to_char[0]=(uint8_t) 'A'; // A Adenine 00448 maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine 00449 maptable_to_char[2]=(uint8_t) 'G'; // G Guanine 00450 maptable_to_char[3]=(uint8_t) 'T'; // T Thymine 00451 maptable_to_char[4]=(uint8_t) 'U'; // U Uracil 00452 maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G) 00453 maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U) 00454 maptable_to_char[7]=(uint8_t) 'M'; // M C or A 00455 maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G 00456 maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A 00457 maptable_to_char[10]=(uint8_t) 'S'; // S C or G 00458 maptable_to_char[11]=(uint8_t) 'B'; // B C, T, U, or G (not A) 00459 maptable_to_char[12]=(uint8_t) 'D'; // D A, T, U, or G (not C) 00460 maptable_to_char[13]=(uint8_t) 'H'; // H A, T, U, or C (not G) 00461 maptable_to_char[14]=(uint8_t) 'V'; // V A, C, or G (not T, not U) 00462 maptable_to_char[15]=(uint8_t) 'N'; // N Any base (A, C, G, T, or U) 00463 break; 00464 00465 case IUPAC_AMINO_ACID: 00466 valid_chars[(uint8_t) 'A']=true; //A Ala Alanine 00467 valid_chars[(uint8_t) 'R']=true; //R Arg Arginine 00468 valid_chars[(uint8_t) 'N']=true; //N Asn Asparagine 00469 valid_chars[(uint8_t) 'D']=true; //D Asp Aspartic acid 00470 valid_chars[(uint8_t) 'C']=true; //C Cys Cysteine 00471 valid_chars[(uint8_t) 'Q']=true; //Q Gln Glutamine 00472 valid_chars[(uint8_t) 'E']=true; //E Glu Glutamic acid 00473 valid_chars[(uint8_t) 'G']=true; //G Gly Glycine 00474 valid_chars[(uint8_t) 'H']=true; //H His Histidine 00475 valid_chars[(uint8_t) 'I']=true; //I Ile Isoleucine 00476 valid_chars[(uint8_t) 'L']=true; //L Leu Leucine 00477 valid_chars[(uint8_t) 'K']=true; //K Lys Lysine 00478 valid_chars[(uint8_t) 'M']=true; //M Met Methionine 00479 valid_chars[(uint8_t) 'F']=true; //F Phe Phenylalanine 00480 valid_chars[(uint8_t) 'P']=true; //P Pro Proline 00481 valid_chars[(uint8_t) 'S']=true; //S Ser Serine 00482 valid_chars[(uint8_t) 'T']=true; //T Thr Threonine 00483 valid_chars[(uint8_t) 'W']=true; //W Trp Tryptophan 00484 valid_chars[(uint8_t) 'Y']=true; //Y Tyr Tyrosine 00485 valid_chars[(uint8_t) 'V']=true; //V Val Valine 00486 valid_chars[(uint8_t) 'B']=true; //B Asx Aspartic acid or Asparagine 00487 valid_chars[(uint8_t) 'Z']=true; //Z Glx Glutamine or Glutamic acid 00488 valid_chars[(uint8_t) 'X']=true; //X Xaa Any amino acid 00489 00490 maptable_to_bin[(uint8_t) 'A']=0; //A Ala Alanine 00491 maptable_to_bin[(uint8_t) 'R']=1; //R Arg Arginine 00492 maptable_to_bin[(uint8_t) 'N']=2; //N Asn Asparagine 00493 maptable_to_bin[(uint8_t) 'D']=3; //D Asp Aspartic acid 00494 maptable_to_bin[(uint8_t) 'C']=4; //C Cys Cysteine 00495 maptable_to_bin[(uint8_t) 'Q']=5; //Q Gln Glutamine 00496 maptable_to_bin[(uint8_t) 'E']=6; //E Glu Glutamic acid 00497 maptable_to_bin[(uint8_t) 'G']=7; //G Gly Glycine 00498 maptable_to_bin[(uint8_t) 'H']=8; //H His Histidine 00499 maptable_to_bin[(uint8_t) 'I']=9; //I Ile Isoleucine 00500 maptable_to_bin[(uint8_t) 'L']=10; //L Leu Leucine 00501 maptable_to_bin[(uint8_t) 'K']=11; //K Lys Lysine 00502 maptable_to_bin[(uint8_t) 'M']=12; //M Met Methionine 00503 maptable_to_bin[(uint8_t) 'F']=13; //F Phe Phenylalanine 00504 maptable_to_bin[(uint8_t) 'P']=14; //P Pro Proline 00505 maptable_to_bin[(uint8_t) 'S']=15; //S Ser Serine 00506 maptable_to_bin[(uint8_t) 'T']=16; //T Thr Threonine 00507 maptable_to_bin[(uint8_t) 'W']=17; //W Trp Tryptophan 00508 maptable_to_bin[(uint8_t) 'Y']=18; //Y Tyr Tyrosine 00509 maptable_to_bin[(uint8_t) 'V']=19; //V Val Valine 00510 maptable_to_bin[(uint8_t) 'B']=20; //B Asx Aspartic acid or Asparagine 00511 maptable_to_bin[(uint8_t) 'Z']=21; //Z Glx Glutamine or Glutamic acid 00512 maptable_to_bin[(uint8_t) 'X']=22; //X Xaa Any amino acid 00513 00514 maptable_to_char[0]=(uint8_t) 'A'; //A Ala Alanine 00515 maptable_to_char[1]=(uint8_t) 'R'; //R Arg Arginine 00516 maptable_to_char[2]=(uint8_t) 'N'; //N Asn Asparagine 00517 maptable_to_char[3]=(uint8_t) 'D'; //D Asp Aspartic acid 00518 maptable_to_char[4]=(uint8_t) 'C'; //C Cys Cysteine 00519 maptable_to_char[5]=(uint8_t) 'Q'; //Q Gln Glutamine 00520 maptable_to_char[6]=(uint8_t) 'E'; //E Glu Glutamic acid 00521 maptable_to_char[7]=(uint8_t) 'G'; //G Gly Glycine 00522 maptable_to_char[8]=(uint8_t) 'H'; //H His Histidine 00523 maptable_to_char[9]=(uint8_t) 'I'; //I Ile Isoleucine 00524 maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine 00525 maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine 00526 maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine 00527 maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine 00528 maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline 00529 maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine 00530 maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine 00531 maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan 00532 maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine 00533 maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine 00534 maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine 00535 maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid 00536 maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid 00537 break; 00538 00539 default: 00540 break; //leave uninitialised 00541 }; 00542 } 00543 00544 void CAlphabet::clear_histogram() 00545 { 00546 memset(histogram, 0, sizeof(histogram)); 00547 print_histogram(); 00548 } 00549 00550 int32_t CAlphabet::get_max_value_in_histogram() 00551 { 00552 int32_t max_sym=-1; 00553 for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--) 00554 { 00555 if (histogram[i]) 00556 { 00557 max_sym=i; 00558 break; 00559 } 00560 } 00561 00562 return max_sym; 00563 } 00564 00565 int32_t CAlphabet::get_num_symbols_in_histogram() 00566 { 00567 int32_t num_sym=0; 00568 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00569 { 00570 if (histogram[i]) 00571 num_sym++; 00572 } 00573 00574 return num_sym; 00575 } 00576 00577 int32_t CAlphabet::get_num_bits_in_histogram() 00578 { 00579 int32_t num_sym=get_num_symbols_in_histogram(); 00580 if (num_sym>0) 00581 return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2)); 00582 else 00583 return 0; 00584 } 00585 00586 00587 void CAlphabet::print_histogram() 00588 { 00589 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00590 { 00591 if (histogram[i]) 00592 { 00593 if (isprint(i)) 00594 SG_PRINT("hist['%c']=%lld", i, histogram[i]) 00595 else if (i == '\t') 00596 SG_PRINT("hist['\\t']=%lld", histogram[i]) 00597 else if (i == '\n') 00598 SG_PRINT("hist['\\n']=%lld", histogram[i]) 00599 else if (i == '\r') 00600 SG_PRINT("hist['\\r']=%lld", histogram[i]) 00601 else 00602 SG_PRINT("hist[%d]=%lld", i, histogram[i]) 00603 00604 if (!valid_chars[i]) 00605 SG_PRINT(" - Character not in Alphabet.\n") 00606 else 00607 SG_PRINT("\n"); 00608 } 00609 } 00610 } 00611 00612 SGVector<int64_t> CAlphabet::get_histogram() 00613 { 00614 return SGVector<int64_t>(&histogram[0], 1 << (sizeof(uint8_t)*8), false); 00615 } 00616 00617 bool CAlphabet::check_alphabet(bool print_error) 00618 { 00619 bool result = true; 00620 00621 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00622 { 00623 if (histogram[i]>0 && valid_chars[i]==0) 00624 { 00625 result=false; 00626 break; 00627 } 00628 } 00629 00630 if (!result && print_error) 00631 { 00632 print_histogram(); 00633 SG_ERROR("ALPHABET does not contain all symbols in histogram\n") 00634 } 00635 00636 return result; 00637 } 00638 00639 bool CAlphabet::check_alphabet_size(bool print_error) 00640 { 00641 if (get_num_bits_in_histogram() > get_num_bits()) 00642 { 00643 if (print_error) 00644 { 00645 print_histogram(); 00646 fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ; 00647 SG_ERROR("ALPHABET too small to contain all symbols in histogram\n") 00648 } 00649 return false; 00650 } 00651 else 00652 return true; 00653 00654 } 00655 00656 void CAlphabet::copy_histogram(CAlphabet* a) 00657 { 00658 SGVector<int64_t> h=a->get_histogram(); 00659 00660 if (h.vlen != sizeof(histogram)/sizeof(histogram[0])) 00661 { 00662 SG_ERROR("Histogram has %d elements, but %d elements where expected\n", 00663 h.vlen, sizeof(histogram)/sizeof(histogram[0])); 00664 } 00665 00666 memcpy(histogram, h.vector, sizeof(histogram)); 00667 } 00668 00669 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet) 00670 { 00671 00672 int32_t idx; 00673 switch (alphabet) 00674 { 00675 case DNA: 00676 idx=0; 00677 break; 00678 case RAWDNA: 00679 idx=1; 00680 break; 00681 case RNA: 00682 idx=2; 00683 break; 00684 case PROTEIN: 00685 idx=3; 00686 break; 00687 case BINARY: 00688 idx=4; 00689 break; 00690 case ALPHANUM: 00691 idx=5; 00692 break; 00693 case CUBE: 00694 idx=6; 00695 break; 00696 case RAWBYTE: 00697 idx=7; 00698 break; 00699 case IUPAC_NUCLEIC_ACID: 00700 idx=8; 00701 break; 00702 case IUPAC_AMINO_ACID: 00703 idx=9; 00704 break; 00705 case NONE: 00706 idx=10; 00707 break; 00708 case DIGIT: 00709 idx=11; 00710 break; 00711 case DIGIT2: 00712 idx=12; 00713 break; 00714 default: 00715 idx=13; 00716 break; 00717 } 00718 return alphabet_names[idx]; 00719 } 00720 00721 void CAlphabet::init() 00722 { 00723 alphabet = NONE; 00724 num_symbols = 0; 00725 num_bits = 0; 00726 00727 memset(valid_chars, 0, sizeof (valid_chars)); 00728 memset(maptable_to_bin, 0, sizeof (maptable_to_bin)); 00729 memset(maptable_to_char, 0, sizeof (maptable_to_char)); 00730 memset(histogram, 0, sizeof (histogram)); 00731 00732 00733 m_parameters->add((machine_int_t*) &alphabet, "alphabet", 00734 "Alphabet enum."); 00735 m_parameters->add(&num_symbols, "num_symbols", 00736 "Number of symbols."); 00737 m_parameters->add(&num_bits, "num_bits", 00738 "Number of bits."); 00739 00740 /* We don't need to serialize the mapping tables / they can be computed 00741 * after de-serializing. Lets not serialize the histogram for now. Doesn't 00742 * really make sense. */ 00743 00744 /* m_parameters->add_histogram(&histogram, sizeof(histogram), 00745 "histogram", 00746 "Histogram."); */ 00747 } 00748 00749 void CAlphabet::load_serializable_post() throw (ShogunException) 00750 { 00751 CSGObject::load_serializable_post(); 00752 00753 init_map_table(); 00754 } 00755 00756 00757 namespace shogun 00758 { 00759 template <class ST> 00760 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) 00761 { 00762 int32_t i,j; 00763 ST value=0; 00764 00765 for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T 00766 { 00767 value=0; 00768 for (j=i; j>=i-p_order+1; j--) 00769 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1))); 00770 00771 obs[i]= (ST) value; 00772 } 00773 00774 for (i=p_order-2;i>=0;i--) 00775 { 00776 if (i>=sequence_length) 00777 continue; 00778 00779 value=0; 00780 for (j=i; j>=i-p_order+1; j--) 00781 { 00782 value= (value >> max_val); 00783 if (j>=0 && j<sequence_length) 00784 value|=obs[j] << (max_val * (p_order-1)); 00785 } 00786 obs[i]=value; 00787 } 00788 00789 // TODO we should get rid of this loop! 00790 if (start>0) 00791 { 00792 for (i=start; i<sequence_length; i++) 00793 obs[i-start]=obs[i]; 00794 } 00795 } 00796 00797 template <class ST> 00798 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) 00799 { 00800 int32_t i,j; 00801 ST value=0; 00802 00803 for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T 00804 { 00805 value=0; 00806 for (j=i; j>=i-p_order+1; j--) 00807 value= (value << max_val) | obs[j]; 00808 00809 obs[i]= (ST) value; 00810 } 00811 00812 for (i=p_order-2;i>=0;i--) 00813 { 00814 if (i>=sequence_length) 00815 continue; 00816 00817 value=0; 00818 for (j=i; j>=i-p_order+1; j--) 00819 { 00820 value= (value << max_val); 00821 if (j>=0 && j<sequence_length) 00822 value|=obs[j]; 00823 } 00824 obs[i]=value; 00825 } 00826 00827 // TODO we should get rid of this loop! 00828 if (start>0) 00829 { 00830 for (i=start; i<sequence_length; i++) 00831 obs[i-start]=obs[i]; 00832 } 00833 } 00834 00835 template <class ST> 00836 void CAlphabet::translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00837 { 00838 ASSERT(gap>=0) 00839 00840 const int32_t start_gap=(p_order-gap)/2; 00841 const int32_t end_gap=start_gap+gap; 00842 00843 int32_t i,j; 00844 ST value=0; 00845 00846 // almost all positions 00847 for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T 00848 { 00849 value=0; 00850 for (j=i; j>=i-p_order+1; j--) 00851 { 00852 if (i-j<start_gap) 00853 { 00854 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap))); 00855 } 00856 else if (i-j>=end_gap) 00857 { 00858 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap))); 00859 } 00860 } 00861 obs[i]= (ST) value; 00862 } 00863 00864 // the remaining `order` positions 00865 for (i=p_order-2;i>=0;i--) 00866 { 00867 if (i>=sequence_length) 00868 continue; 00869 00870 value=0; 00871 for (j=i; j>=i-p_order+1; j--) 00872 { 00873 if (i-j<start_gap) 00874 { 00875 value= (value >> max_val); 00876 if (j>=0 && j<sequence_length) 00877 value|=obs[j] << (max_val * (p_order-1-gap)); 00878 } 00879 else if (i-j>=end_gap) 00880 { 00881 value= (value >> max_val); 00882 if (j>=0 && j<sequence_length) 00883 value|=obs[j] << (max_val * (p_order-1-gap)); 00884 } 00885 } 00886 obs[i]=value; 00887 } 00888 00889 // TODO we should get rid of this loop! 00890 if (start>0) 00891 { 00892 for (i=start; i<sequence_length; i++) 00893 obs[i-start]=obs[i]; 00894 } 00895 } 00896 00897 template <class ST> 00898 void CAlphabet::translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00899 { 00900 ASSERT(gap>=0) 00901 00902 const int32_t start_gap=(p_order-gap)/2; 00903 const int32_t end_gap=start_gap+gap; 00904 00905 int32_t i,j; 00906 ST value=0; 00907 00908 // almost all positions 00909 for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T 00910 { 00911 value=0; 00912 for (j=i; j>=i-p_order+1; j--) 00913 { 00914 if (i-j<start_gap) 00915 value= (value << max_val) | obs[j]; 00916 else if (i-j>=end_gap) 00917 value= (value << max_val) | obs[j]; 00918 } 00919 obs[i]= (ST) value; 00920 } 00921 00922 // the remaining `order` positions 00923 for (i=p_order-2;i>=0;i--) 00924 { 00925 if (i>=sequence_length) 00926 continue; 00927 00928 value=0; 00929 for (j=i; j>=i-p_order+1; j--) 00930 { 00931 if (i-j<start_gap) 00932 { 00933 value= value << max_val; 00934 if (j>=0 && j<sequence_length) 00935 value|=obs[j]; 00936 } 00937 else if (i-j>=end_gap) 00938 { 00939 value= value << max_val; 00940 if (j>=0 && j<sequence_length) 00941 value|=obs[j]; 00942 } 00943 } 00944 obs[i]=value; 00945 } 00946 00947 // TODO we should get rid of this loop! 00948 if (start>0) 00949 { 00950 for (i=start; i<sequence_length; i++) 00951 obs[i-start]=obs[i]; 00952 } 00953 } 00954 00955 template<> void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00956 { 00957 } 00958 00959 template<> void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00960 { 00961 } 00962 00963 template<> void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00964 { 00965 } 00966 00967 template<> void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00968 { 00969 } 00970 00971 template<> void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00972 { 00973 } 00974 00975 template<> void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00976 { 00977 } 00978 00979 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00980 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00981 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00982 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00983 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00984 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00985 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00986 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00987 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00988 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 00989 00990 template void CAlphabet::translate_from_single_order<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00991 template void CAlphabet::translate_from_single_order<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00992 template void CAlphabet::translate_from_single_order<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00993 template void CAlphabet::translate_from_single_order<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00994 template void CAlphabet::translate_from_single_order<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00995 template void CAlphabet::translate_from_single_order<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00996 template void CAlphabet::translate_from_single_order<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00997 template void CAlphabet::translate_from_single_order<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00998 template void CAlphabet::translate_from_single_order<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 00999 template void CAlphabet::translate_from_single_order<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01000 01001 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01002 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01003 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01004 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01005 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01006 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01007 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01008 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01009 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01010 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val); 01011 01012 template void CAlphabet::translate_from_single_order_reversed<bool>(bool* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01013 template void CAlphabet::translate_from_single_order_reversed<char>(char* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01014 template void CAlphabet::translate_from_single_order_reversed<int8_t>(int8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01015 template void CAlphabet::translate_from_single_order_reversed<uint8_t>(uint8_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01016 template void CAlphabet::translate_from_single_order_reversed<int16_t>(int16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01017 template void CAlphabet::translate_from_single_order_reversed<uint16_t>(uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01018 template void CAlphabet::translate_from_single_order_reversed<int32_t>(int32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01019 template void CAlphabet::translate_from_single_order_reversed<uint32_t>(uint32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01020 template void CAlphabet::translate_from_single_order_reversed<int64_t>(int64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01021 template void CAlphabet::translate_from_single_order_reversed<uint64_t>(uint64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01022 template void CAlphabet::translate_from_single_order_reversed<float32_t>(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01023 template void CAlphabet::translate_from_single_order_reversed<float64_t>(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01024 template void CAlphabet::translate_from_single_order_reversed<floatmax_t>(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap); 01025 }