SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <vector> 00013 00014 #include <shogun/lib/common.h> 00015 #include <shogun/io/SGIO.h> 00016 #include <shogun/lib/Signal.h> 00017 #include <shogun/lib/Trie.h> 00018 #include <shogun/base/Parallel.h> 00019 00020 #include <shogun/kernel/string/SpectrumRBFKernel.h> 00021 #include <shogun/features/Features.h> 00022 #include <shogun/features/StringFeatures.h> 00023 #include <shogun/lib/SGStringList.h> 00024 #include <math.h> 00025 00026 #include <vector> 00027 #include <string> 00028 #include <fstream> 00029 #include <cmath> 00030 00031 #include <assert.h> 00032 00033 #ifdef HAVE_PTHREAD 00034 #include <pthread.h> 00035 #endif 00036 00037 00038 using namespace shogun; 00039 00040 CSpectrumRBFKernel::CSpectrumRBFKernel() 00041 : CStringKernel<char>(0) 00042 { 00043 init(); 00044 register_param(); 00045 } 00046 00047 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_) 00048 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00049 { 00050 init(); 00051 register_param(); 00052 00053 target_letter_0=-1 ; 00054 00055 AA_matrix=SGMatrix<float64_t>(128,128); 00056 00057 memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00058 00059 read_profiles_and_sequences(); 00060 SGStringList<char> string_list; 00061 string_list.strings = sequences; 00062 string_list.num_strings = nof_sequences; 00063 string_list.max_string_length = max_sequence_length; 00064 00065 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00066 string_features = new CStringFeatures<char>(string_list, IUPAC_AMINO_ACID); 00067 SG_REF(string_features) 00068 init(string_features, string_features); 00069 } 00070 00071 CSpectrumRBFKernel::CSpectrumRBFKernel( 00072 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_) 00073 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00074 { 00075 target_letter_0=-1 ; 00076 00077 AA_matrix=SGMatrix<float64_t>(128,128); 00078 memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00079 00080 init(l, r); 00081 register_param(); 00082 } 00083 00084 CSpectrumRBFKernel::~CSpectrumRBFKernel() 00085 { 00086 cleanup(); 00087 SG_UNREF(string_features); 00088 SG_FREE(sequences); 00089 } 00090 00091 void CSpectrumRBFKernel::read_profiles_and_sequences() 00092 { 00093 00094 int32_t aa_to_index[128];//profile 00095 aa_to_index[(uint8_t) 'A'] = 0; 00096 aa_to_index[(uint8_t) 'R'] = 1; 00097 aa_to_index[(uint8_t) 'N'] = 2; 00098 aa_to_index[(uint8_t) 'D'] = 3; 00099 aa_to_index[(uint8_t) 'C'] = 4; 00100 aa_to_index[(uint8_t) 'Q'] = 5; 00101 aa_to_index[(uint8_t) 'E'] = 6; 00102 aa_to_index[(uint8_t) 'G'] = 7; 00103 aa_to_index[(uint8_t) 'H'] = 8; 00104 aa_to_index[(uint8_t) 'I'] = 9; 00105 aa_to_index[(uint8_t) 'L'] = 10; 00106 aa_to_index[(uint8_t) 'K'] = 11; 00107 aa_to_index[(uint8_t) 'M'] = 12; 00108 aa_to_index[(uint8_t) 'F'] = 13; 00109 aa_to_index[(uint8_t) 'P'] = 14; 00110 aa_to_index[(uint8_t) 'S'] = 15; 00111 aa_to_index[(uint8_t) 'T'] = 16; 00112 aa_to_index[(uint8_t) 'W'] = 17; 00113 aa_to_index[(uint8_t) 'Y'] = 18; 00114 aa_to_index[(uint8_t) 'V'] = 19; 00115 SG_DEBUG("initializing background\n") 00116 double background[20]; // profile 00117 background[0]=0.0799912015849807; //A 00118 background[1]=0.0484482507611578;//R 00119 background[2]=0.044293531582512;//N 00120 background[3]=0.0578891399707563;//D 00121 background[4]=0.0171846021407367;//C 00122 background[5]=0.0380578923048682;//Q 00123 background[6]=0.0638169929675978;//E 00124 background[7]=0.0760659374742852;//G 00125 background[8]=0.0223465499452473;//H 00126 background[9]=0.0550905793661343;//I 00127 background[10]=0.0866897071203864;//L 00128 background[11]=0.060458245507428;//K 00129 background[12]=0.0215379186368154;//M 00130 background[13]=0.0396348024787477;//F 00131 background[14]=0.0465746314476874;//P 00132 background[15]=0.0630028230885602;//S 00133 background[16]=0.0580394726014824;//T 00134 background[17]=0.0144991866213453;//W 00135 background[18]=0.03635438623143;//Y 00136 background[19]=0.0700241481678408;//V 00137 00138 00139 std::vector<std::string> seqs; 00140 //int32_t nof_sequences = 7329; 00141 00142 double C = 0.8; 00143 const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles"; 00144 std::ifstream fin(filename); 00145 00146 SG_DEBUG("Reading profiles from %s\n", filename) 00147 std::string line; 00148 while (!fin.eof()) 00149 { 00150 std::getline(fin, line); 00151 00152 if (line[0] == '>') // new sequence 00153 { 00154 int idx = line.find_first_of(' '); 00155 sequence_labels.push_back(line.substr(1,idx-1)); 00156 std::getline(fin, line); 00157 std::string orig_sequence = line; 00158 std::string sequence=""; 00159 00160 int len_line = line.length(); 00161 00162 // skip 3 lines 00163 00164 std::getline(fin, line); 00165 std::getline(fin, line); 00166 std::getline(fin, line); 00167 00168 profiles.push_back(std::vector<double>()); 00169 00170 std::vector<double>& curr_profile = profiles.back(); 00171 for (int i=0; i < len_line; ++i) 00172 { 00173 std::getline(fin, line); 00174 int a = line.find_first_not_of(' '); // index position 00175 int b = line.find_first_of(' ', a); // index position 00176 a = line.find_first_not_of(' ', b); // aa position 00177 b = line.find_first_of(' ', a); // aa position 00178 std::string aa=line.substr(a,b-a); 00179 if (0) //(aa =="B" || aa == "X" || aa == "Z") 00180 { 00181 int pos = seqs.size()+1; 00182 SG_DEBUG("Skipping aa in sequence %d\n", pos) 00183 continue; 00184 } 00185 else 00186 { 00187 sequence += aa; 00188 00189 a = line.find_first_not_of(' ', b); // beginning of block to ignore 00190 b = line.find_first_of(' ', a); // aa position 00191 00192 for (int j=0; j < 19; ++j) 00193 { 00194 a = line.find_first_not_of(' ', b); 00195 b = line.find_first_of(' ', a); 00196 } 00197 00198 int all_zeros = 1; 00199 // interesting block 00200 for (int j=0; j < 20; ++j) 00201 { 00202 a = line.find_first_not_of(' ', b); 00203 b = line.find_first_of(' ', a); 00204 double p = atof(line.substr(a, b-a).c_str()); 00205 if (p > 0) 00206 { 00207 all_zeros = 0; 00208 } 00209 double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00210 curr_profile.push_back(value); 00211 //SG_DEBUG("seq %d aa %d value %f p %f bg %f\n", i, j, value,p, background[j]) 00212 } 00213 00214 if (all_zeros) 00215 { 00216 SG_DEBUG(">>>>>>>>>>>>>>> all zeros") 00217 if (aa !="B" && aa != "X" && aa != "Z") 00218 { 00219 //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]); 00220 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]]; 00221 double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00222 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]) 00223 curr_profile[(i*20) + aa_index] = value; 00224 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value) 00225 00226 /* 00227 for (int z=0; z <20; ++z) 00228 { 00229 SG_DEBUG(" %d \t %f\t", z, curr_profile[z]) 00230 } 00231 SG_DEBUG("\n") 00232 */ 00233 } 00234 } 00235 } 00236 } 00237 00238 if (curr_profile.size() != 20 * sequence.length()) 00239 { 00240 SG_ERROR("Something's wrong with the profile.\n") 00241 break; 00242 } 00243 00244 seqs.push_back(sequence); 00245 00246 00247 /* 00248 // 6 irrelevant lines 00249 for (int i=0; i < 6; ++i) 00250 { 00251 std::getline(fin, line); 00252 } 00253 // 00254 */ 00255 } 00256 } 00257 00258 fin.close(); 00259 00260 nof_sequences = seqs.size(); 00261 sequences = SG_MALLOC(SGString<char>, nof_sequences); 00262 00263 int max_len = 0; 00264 for (int i=0; i < nof_sequences; ++i) 00265 { 00266 int len = seqs[i].length(); 00267 sequences[i].string = SG_MALLOC(char, len+1); 00268 sequences[i].slen = len; 00269 strcpy(sequences[i].string, seqs[i].c_str()); 00270 00271 if (len > max_len) max_len = len; 00272 } 00273 00274 max_sequence_length = max_len; 00275 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00276 00277 } 00278 00279 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r) 00280 { 00281 // >> profile 00282 /* 00283 read_profiles_and_sequences(); 00284 l = string_features; 00285 r = string_features; 00286 */ 00287 // << profile 00288 00289 int32_t lhs_changed=(lhs!=l); 00290 int32_t rhs_changed=(rhs!=r); 00291 00292 CStringKernel<char>::init(l,r); 00293 00294 SG_DEBUG("lhs_changed: %i\n", lhs_changed) 00295 SG_DEBUG("rhs_changed: %i\n", rhs_changed) 00296 00297 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00298 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00299 00300 SG_UNREF(alphabet); 00301 alphabet=sf_l->get_alphabet(); 00302 CAlphabet* ralphabet=sf_r->get_alphabet(); 00303 00304 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00305 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00306 00307 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()) 00308 SG_UNREF(ralphabet); 00309 00310 00311 return init_normalizer(); 00312 } 00313 00314 void CSpectrumRBFKernel::cleanup() 00315 { 00316 00317 SG_UNREF(alphabet); 00318 alphabet=NULL; 00319 00320 CKernel::cleanup(); 00321 } 00322 00323 inline bool isaa(char c) 00324 { 00325 if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z') 00326 return false ; 00327 return true ; 00328 } 00329 00330 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index) 00331 { 00332 //const char* AA = "ARNDCQEGHILKMFPSTWYV"; 00333 float64_t diff=0.0 ; 00334 00335 for (int i=0; i<seq_degree; i++) 00336 { 00337 if (!isaa(path[i])||!isaa(joint_seq[index+i])) 00338 diff+=1.4 ; 00339 else 00340 { 00341 diff += AA_matrix.matrix[ (path[i]-1)*128 + path[i] - 1] ; 00342 diff -= 2*AA_matrix.matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ; 00343 diff += AA_matrix.matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ; 00344 if (CMath::is_nan(diff)) 00345 fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ; 00346 } 00347 } 00348 00349 return exp( - diff/width) ; 00350 } 00351 00352 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b) 00353 { 00354 int32_t alen, blen; 00355 bool afree, bfree; 00356 00357 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree); 00358 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree); 00359 00360 float64_t result=0; 00361 for (int32_t i=0; i<alen; i++) 00362 { 00363 for (int32_t j=0; j<blen; j++) 00364 { 00365 if ((i+degree<=alen) && (j+degree<=blen)) 00366 result += AA_helper(&(avec[i]), degree, bvec, j) ; 00367 } 00368 } 00369 00370 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree); 00371 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree); 00372 return result; 00373 } 00374 00375 bool CSpectrumRBFKernel::set_AA_matrix( 00376 float64_t* AA_matrix_) 00377 { 00378 00379 if (AA_matrix_) 00380 { 00381 SG_DEBUG("Setting AA_matrix\n") 00382 memcpy(AA_matrix.matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00383 return true ; 00384 } 00385 00386 return false; 00387 } 00388 00389 void CSpectrumRBFKernel::register_param() 00390 { 00391 SG_ADD(°ree, "degree", "degree of the kernel", MS_AVAILABLE); 00392 SG_ADD(&AA_matrix, "AA_matrix", "128*128 scalar product matrix", MS_NOT_AVAILABLE); 00393 SG_ADD(&width, "width", "width of Gaussian", MS_AVAILABLE); 00394 SG_ADD(&nof_sequences, "nof_sequences", "length of the sequence", 00395 MS_NOT_AVAILABLE); 00396 m_parameters->add_vector(&sequences, &nof_sequences, "sequences", "the sequences as a part of profile"); 00397 SG_ADD(&max_sequence_length, 00398 "max_sequence_length", "max length of the sequence", MS_NOT_AVAILABLE); 00399 } 00400 00401 void CSpectrumRBFKernel::register_alphabet() 00402 { 00403 SG_ADD((CSGObject**)&alphabet, "alphabet", "the alphabet used by kernel", 00404 MS_NOT_AVAILABLE); 00405 } 00406 00407 void CSpectrumRBFKernel::init() 00408 { 00409 alphabet = NULL; 00410 degree = 0; 00411 width = 0.0; 00412 sequences = NULL; 00413 string_features = NULL; 00414 nof_sequences = 0; 00415 max_sequence_length = 0; 00416 00417 initialized = false; 00418 00419 max_mismatch = 0; 00420 target_letter_0 = 0; 00421 }