SHOGUN
v3.2.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/lib/common.h> 00013 #include <shogun/io/SGIO.h> 00014 #include <shogun/lib/Signal.h> 00015 #include <shogun/lib/Trie.h> 00016 #include <shogun/base/Parameter.h> 00017 #include <shogun/base/Parallel.h> 00018 00019 #include <shogun/kernel/string/WeightedDegreeStringKernel.h> 00020 #include <shogun/kernel/normalizer/FirstElementKernelNormalizer.h> 00021 #include <shogun/features/Features.h> 00022 #include <shogun/features/StringFeatures.h> 00023 00024 #ifndef WIN32 00025 #include <pthread.h> 00026 #endif 00027 00028 using namespace shogun; 00029 00030 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00031 struct S_THREAD_PARAM_WD 00032 { 00033 00034 int32_t* vec; 00035 float64_t* result; 00036 float64_t* weights; 00037 CWeightedDegreeStringKernel* kernel; 00038 CTrie<DNATrie>* tries; 00039 float64_t factor; 00040 int32_t j; 00041 int32_t start; 00042 int32_t end; 00043 int32_t length; 00044 int32_t* vec_idx; 00045 }; 00046 #endif // DOXYGEN_SHOULD_SKIP_THIS 00047 00048 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel () 00049 : CStringKernel<char>() 00050 { 00051 init(); 00052 } 00053 00054 00055 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel ( 00056 int32_t d, EWDKernType t) 00057 : CStringKernel<char>() 00058 { 00059 init(); 00060 00061 degree=d; 00062 type=t; 00063 00064 if (type!=E_EXTERNAL) 00065 set_wd_weights_by_type(type); 00066 } 00067 00068 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(SGVector<float64_t> w) 00069 : CStringKernel<char>(10) 00070 { 00071 init(); 00072 00073 type=E_EXTERNAL; 00074 degree=w.vlen; 00075 00076 weights=SG_MALLOC(float64_t, degree*(1+max_mismatch)); 00077 weights_degree=degree; 00078 weights_length=(1+max_mismatch); 00079 00080 for (int32_t i=0; i<degree*(1+max_mismatch); i++) 00081 weights[i]=w.vector[i]; 00082 } 00083 00084 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel( 00085 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d) 00086 : CStringKernel<char>(10) 00087 { 00088 init(); 00089 degree=d; 00090 type=E_WD; 00091 set_wd_weights_by_type(type); 00092 set_normalizer(new CFirstElementKernelNormalizer()); 00093 init(l, r); 00094 } 00095 00096 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel() 00097 { 00098 cleanup(); 00099 00100 SG_FREE(weights); 00101 weights=NULL; 00102 weights_degree=0; 00103 weights_length=0; 00104 00105 SG_FREE(block_weights); 00106 block_weights=NULL; 00107 00108 SG_FREE(position_weights); 00109 position_weights=NULL; 00110 00111 SG_FREE(weights_buffer); 00112 weights_buffer=NULL; 00113 } 00114 00115 00116 void CWeightedDegreeStringKernel::remove_lhs() 00117 { 00118 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n") 00119 delete_optimization(); 00120 00121 if (tries!=NULL) 00122 tries->destroy(); 00123 00124 CKernel::remove_lhs(); 00125 } 00126 00127 void CWeightedDegreeStringKernel::create_empty_tries() 00128 { 00129 ASSERT(lhs) 00130 00131 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length(); 00132 00133 if (tries!=NULL) 00134 { 00135 tries->destroy() ; 00136 tries->create(seq_length, max_mismatch==0) ; 00137 } 00138 } 00139 00140 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r) 00141 { 00142 int32_t lhs_changed=(lhs!=l); 00143 int32_t rhs_changed=(rhs!=r); 00144 00145 CStringKernel<char>::init(l,r); 00146 00147 SG_DEBUG("lhs_changed: %i\n", lhs_changed) 00148 SG_DEBUG("rhs_changed: %i\n", rhs_changed) 00149 00150 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00151 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00152 00153 int32_t len=sf_l->get_max_vector_length(); 00154 if (lhs_changed && !sf_l->have_same_length(len)) 00155 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n") 00156 00157 if (rhs_changed && !sf_r->have_same_length(len)) 00158 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n") 00159 00160 SG_UNREF(alphabet); 00161 alphabet=sf_l->get_alphabet(); 00162 CAlphabet* ralphabet=sf_r->get_alphabet(); 00163 00164 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00165 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00166 00167 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()) 00168 SG_UNREF(ralphabet); 00169 00170 if (tries!=NULL) { 00171 tries->delete_trees(max_mismatch==0); 00172 SG_UNREF(tries); 00173 } 00174 tries=new CTrie<DNATrie>(degree, max_mismatch==0); 00175 create_empty_tries(); 00176 00177 init_block_weights(); 00178 00179 return init_normalizer(); 00180 } 00181 00182 void CWeightedDegreeStringKernel::cleanup() 00183 { 00184 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n") 00185 delete_optimization(); 00186 00187 SG_FREE(block_weights); 00188 block_weights=NULL; 00189 00190 if (tries!=NULL) 00191 { 00192 tries->destroy(); 00193 SG_UNREF(tries); 00194 tries=NULL; 00195 } 00196 00197 seq_length=0; 00198 tree_initialized = false; 00199 00200 SG_UNREF(alphabet); 00201 alphabet=NULL; 00202 00203 CKernel::cleanup(); 00204 } 00205 00206 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num) 00207 { 00208 if (tree_num<0) 00209 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n") 00210 00211 delete_optimization(); 00212 00213 if (tree_num<0) 00214 SG_DEBUG("initializing CWeightedDegreeStringKernel optimization\n") 00215 00216 for (int32_t i=0; i<count; i++) 00217 { 00218 if (tree_num<0) 00219 { 00220 if ( (i % (count/10+1)) == 0) 00221 SG_PROGRESS(i, 0, count) 00222 00223 if (max_mismatch==0) 00224 add_example_to_tree(IDX[i], alphas[i]) ; 00225 else 00226 add_example_to_tree_mismatch(IDX[i], alphas[i]) ; 00227 00228 //SG_DEBUG("number of used trie nodes: %i\n", tries.get_num_used_nodes()) 00229 } 00230 else 00231 { 00232 if (max_mismatch==0) 00233 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ; 00234 else 00235 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ; 00236 } 00237 } 00238 00239 if (tree_num<0) 00240 SG_DONE() 00241 00242 //tries.compact_nodes(NO_CHILD, 0, weights) ; 00243 00244 set_is_initialized(true) ; 00245 return true ; 00246 } 00247 00248 bool CWeightedDegreeStringKernel::delete_optimization() 00249 { 00250 if (get_is_initialized()) 00251 { 00252 if (tries!=NULL) 00253 tries->delete_trees(max_mismatch==0); 00254 set_is_initialized(false); 00255 return true; 00256 } 00257 00258 return false; 00259 } 00260 00261 00262 float64_t CWeightedDegreeStringKernel::compute_with_mismatch( 00263 char* avec, int32_t alen, char* bvec, int32_t blen) 00264 { 00265 float64_t sum = 0.0; 00266 00267 for (int32_t i=0; i<alen; i++) 00268 { 00269 float64_t sumi = 0.0; 00270 int32_t mismatches=0; 00271 00272 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00273 { 00274 if (avec[i+j]!=bvec[i+j]) 00275 { 00276 mismatches++ ; 00277 if (mismatches>max_mismatch) 00278 break ; 00279 } ; 00280 sumi += weights[j+degree*mismatches]; 00281 } 00282 if (position_weights!=NULL) 00283 sum+=position_weights[i]*sumi ; 00284 else 00285 sum+=sumi ; 00286 } 00287 return sum ; 00288 } 00289 00290 float64_t CWeightedDegreeStringKernel::compute_using_block( 00291 char* avec, int32_t alen, char* bvec, int32_t blen) 00292 { 00293 ASSERT(alen==blen) 00294 00295 float64_t sum=0; 00296 int32_t match_len=-1; 00297 00298 for (int32_t i=0; i<alen; i++) 00299 { 00300 if (avec[i]==bvec[i]) 00301 match_len++; 00302 else 00303 { 00304 if (match_len>=0) 00305 sum+=block_weights[match_len]; 00306 match_len=-1; 00307 } 00308 } 00309 00310 if (match_len>=0) 00311 sum+=block_weights[match_len]; 00312 00313 return sum; 00314 } 00315 00316 float64_t CWeightedDegreeStringKernel::compute_without_mismatch( 00317 char* avec, int32_t alen, char* bvec, int32_t blen) 00318 { 00319 float64_t sum = 0.0; 00320 00321 for (int32_t i=0; i<alen; i++) 00322 { 00323 float64_t sumi = 0.0; 00324 00325 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00326 { 00327 if (avec[i+j]!=bvec[i+j]) 00328 break ; 00329 sumi += weights[j]; 00330 } 00331 if (position_weights!=NULL) 00332 sum+=position_weights[i]*sumi ; 00333 else 00334 sum+=sumi ; 00335 } 00336 return sum ; 00337 } 00338 00339 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix( 00340 char* avec, int32_t alen, char* bvec, int32_t blen) 00341 { 00342 float64_t sum = 0.0; 00343 00344 for (int32_t i=0; i<alen; i++) 00345 { 00346 float64_t sumi=0.0; 00347 for (int32_t j=0; (i+j<alen) && (j<degree); j++) 00348 { 00349 if (avec[i+j]!=bvec[i+j]) 00350 break; 00351 sumi += weights[i*degree+j]; 00352 } 00353 if (position_weights!=NULL) 00354 sum += position_weights[i]*sumi ; 00355 else 00356 sum += sumi ; 00357 } 00358 00359 return sum ; 00360 } 00361 00362 00363 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b) 00364 { 00365 int32_t alen, blen; 00366 bool free_avec, free_bvec; 00367 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00368 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00369 float64_t result=0; 00370 00371 if (max_mismatch==0 && length==0 && block_computation) 00372 result=compute_using_block(avec, alen, bvec, blen); 00373 else 00374 { 00375 if (max_mismatch>0) 00376 result=compute_with_mismatch(avec, alen, bvec, blen); 00377 else if (length==0) 00378 result=compute_without_mismatch(avec, alen, bvec, blen); 00379 else 00380 result=compute_without_mismatch_matrix(avec, alen, bvec, blen); 00381 } 00382 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00383 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00384 00385 return result; 00386 } 00387 00388 00389 void CWeightedDegreeStringKernel::add_example_to_tree( 00390 int32_t idx, float64_t alpha) 00391 { 00392 ASSERT(alphabet) 00393 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA) 00394 00395 int32_t len=0; 00396 bool free_vec; 00397 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00398 ASSERT(max_mismatch==0) 00399 int32_t *vec=SG_MALLOC(int32_t, len); 00400 00401 for (int32_t i=0; i<len; i++) 00402 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00403 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00404 00405 if (length == 0 || max_mismatch > 0) 00406 { 00407 for (int32_t i=0; i<len; i++) 00408 { 00409 float64_t alpha_pw=alpha; 00410 /*if (position_weights!=NULL) 00411 alpha_pw *= position_weights[i] ;*/ 00412 if (alpha_pw==0.0) 00413 continue; 00414 ASSERT(tries) 00415 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0)); 00416 } 00417 } 00418 else 00419 { 00420 for (int32_t i=0; i<len; i++) 00421 { 00422 float64_t alpha_pw=alpha; 00423 /*if (position_weights!=NULL) 00424 alpha_pw = alpha*position_weights[i] ;*/ 00425 if (alpha_pw==0.0) 00426 continue ; 00427 ASSERT(tries) 00428 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0)); 00429 } 00430 } 00431 SG_FREE(vec); 00432 tree_initialized=true ; 00433 } 00434 00435 void CWeightedDegreeStringKernel::add_example_to_single_tree( 00436 int32_t idx, float64_t alpha, int32_t tree_num) 00437 { 00438 ASSERT(alphabet) 00439 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA) 00440 00441 int32_t len; 00442 bool free_vec; 00443 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00444 ASSERT(max_mismatch==0) 00445 int32_t *vec = SG_MALLOC(int32_t, len); 00446 00447 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++) 00448 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00449 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00450 00451 00452 ASSERT(tries) 00453 if (alpha!=0.0) 00454 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0)); 00455 00456 SG_FREE(vec); 00457 tree_initialized=true ; 00458 } 00459 00460 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha) 00461 { 00462 ASSERT(tries) 00463 ASSERT(alphabet) 00464 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA) 00465 00466 int32_t len ; 00467 bool free_vec; 00468 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00469 00470 int32_t *vec = SG_MALLOC(int32_t, len); 00471 00472 for (int32_t i=0; i<len; i++) 00473 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00474 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00475 00476 for (int32_t i=0; i<len; i++) 00477 { 00478 if (alpha!=0.0) 00479 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights); 00480 } 00481 00482 SG_FREE(vec); 00483 tree_initialized=true ; 00484 } 00485 00486 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch( 00487 int32_t idx, float64_t alpha, int32_t tree_num) 00488 { 00489 ASSERT(tries) 00490 ASSERT(alphabet) 00491 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA) 00492 00493 int32_t len=0; 00494 bool free_vec; 00495 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec); 00496 int32_t *vec=SG_MALLOC(int32_t, len); 00497 00498 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++) 00499 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00500 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00501 00502 if (alpha!=0.0) 00503 { 00504 tries->add_example_to_tree_mismatch_recursion( 00505 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num, 00506 0, 0, max_mismatch, weights); 00507 } 00508 00509 SG_FREE(vec); 00510 tree_initialized=true; 00511 } 00512 00513 00514 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx) 00515 { 00516 ASSERT(alphabet) 00517 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA) 00518 00519 int32_t len=0; 00520 bool free_vec; 00521 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec); 00522 ASSERT(char_vec && len>0) 00523 int32_t *vec=SG_MALLOC(int32_t, len); 00524 00525 for (int32_t i=0; i<len; i++) 00526 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00527 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00528 00529 float64_t sum=0; 00530 ASSERT(tries) 00531 for (int32_t i=0; i<len; i++) 00532 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)); 00533 00534 SG_FREE(vec); 00535 return normalizer->normalize_rhs(sum, idx); 00536 } 00537 00538 void CWeightedDegreeStringKernel::compute_by_tree( 00539 int32_t idx, float64_t* LevelContrib) 00540 { 00541 ASSERT(alphabet) 00542 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA) 00543 00544 int32_t len ; 00545 bool free_vec; 00546 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec); 00547 00548 int32_t *vec = SG_MALLOC(int32_t, len); 00549 00550 for (int32_t i=0; i<len; i++) 00551 vec[i]=alphabet->remap_to_bin(char_vec[i]); 00552 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec); 00553 00554 ASSERT(tries) 00555 for (int32_t i=0; i<len; i++) 00556 { 00557 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib, 00558 normalizer->normalize_rhs(1.0, idx), 00559 mkl_stepsize, weights, (length!=0)); 00560 } 00561 00562 SG_FREE(vec); 00563 } 00564 00565 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len) 00566 { 00567 ASSERT(tries) 00568 return tries->compute_abs_weights(len); 00569 } 00570 00571 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type) 00572 { 00573 ASSERT(degree>0) 00574 ASSERT(p_type==E_WD) 00575 00576 SG_FREE(weights); 00577 weights=SG_MALLOC(float64_t, degree); 00578 weights_degree=degree; 00579 weights_length=1; 00580 00581 if (weights) 00582 { 00583 int32_t i; 00584 float64_t sum=0; 00585 for (i=0; i<degree; i++) 00586 { 00587 weights[i]=degree-i; 00588 sum+=weights[i]; 00589 } 00590 for (i=0; i<degree; i++) 00591 weights[i]/=sum; 00592 00593 for (i=0; i<degree; i++) 00594 { 00595 for (int32_t j=1; j<=max_mismatch; j++) 00596 { 00597 if (j<i+1) 00598 { 00599 int32_t nk=CMath::nchoosek(i+1, j); 00600 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j)); 00601 } 00602 else 00603 weights[i+j*degree]= 0; 00604 } 00605 } 00606 00607 if (which_degree>=0) 00608 { 00609 ASSERT(which_degree<degree) 00610 for (i=0; i<degree; i++) 00611 { 00612 if (i!=which_degree) 00613 weights[i]=0; 00614 else 00615 weights[i]=1; 00616 } 00617 } 00618 return true; 00619 } 00620 else 00621 return false; 00622 } 00623 00624 bool CWeightedDegreeStringKernel::set_weights(SGMatrix<float64_t> new_weights) 00625 { 00626 float64_t* ws=new_weights.matrix; 00627 int32_t d=new_weights.num_rows; 00628 int32_t len=new_weights.num_cols; 00629 00630 if (d!=degree || len<0) 00631 SG_ERROR("WD: Dimension mismatch (should be (seq_length | 1) x degree) got (%d x %d)\n", len, degree) 00632 00633 degree=d; 00634 length=len; 00635 00636 if (len <= 0) 00637 len=1; 00638 00639 weights_degree=degree; 00640 weights_length=len+max_mismatch; 00641 00642 00643 SG_DEBUG("Creating weights of size %dx%d\n", weights_degree, weights_length) 00644 int32_t num_weights=weights_degree*weights_length; 00645 SG_FREE(weights); 00646 weights=SG_MALLOC(float64_t, num_weights); 00647 00648 for (int32_t i=0; i<degree*len; i++) 00649 weights[i]=ws[i]; 00650 00651 return true; 00652 } 00653 00654 bool CWeightedDegreeStringKernel::set_position_weights( 00655 float64_t* pws, int32_t len) 00656 { 00657 if (len==0) 00658 { 00659 SG_FREE(position_weights); 00660 position_weights=NULL; 00661 ASSERT(tries) 00662 tries->set_position_weights(position_weights); 00663 } 00664 00665 if (seq_length!=len) 00666 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len) 00667 00668 SG_FREE(position_weights); 00669 position_weights=SG_MALLOC(float64_t, len); 00670 position_weights_len=len; 00671 ASSERT(tries) 00672 tries->set_position_weights(position_weights); 00673 00674 if (position_weights) 00675 { 00676 for (int32_t i=0; i<len; i++) 00677 position_weights[i]=pws[i]; 00678 return true; 00679 } 00680 else 00681 return false; 00682 } 00683 00684 bool CWeightedDegreeStringKernel::init_block_weights_from_wd() 00685 { 00686 SG_FREE(block_weights); 00687 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree)); 00688 00689 int32_t k; 00690 float64_t d=degree; // use float to evade rounding errors below 00691 00692 for (k=0; k<degree; k++) 00693 block_weights[k]= 00694 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1)); 00695 for (k=degree; k<seq_length; k++) 00696 block_weights[k]=(-d+3*k+4)/3; 00697 00698 return true; 00699 } 00700 00701 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external() 00702 { 00703 ASSERT(weights) 00704 SG_FREE(block_weights); 00705 block_weights=SG_MALLOC(float64_t, CMath::max(seq_length,degree)); 00706 00707 int32_t i=0; 00708 block_weights[0]=weights[0]; 00709 for (i=1; i<CMath::max(seq_length,degree); i++) 00710 block_weights[i]=0; 00711 00712 for (i=1; i<CMath::max(seq_length,degree); i++) 00713 { 00714 block_weights[i]=block_weights[i-1]; 00715 00716 float64_t contrib=0; 00717 for (int32_t j=0; j<CMath::min(degree,i+1); j++) 00718 contrib+=weights[j]; 00719 00720 block_weights[i]+=contrib; 00721 } 00722 return true; 00723 } 00724 00725 bool CWeightedDegreeStringKernel::init_block_weights_const() 00726 { 00727 SG_FREE(block_weights); 00728 block_weights=SG_MALLOC(float64_t, seq_length); 00729 00730 for (int32_t i=1; i<seq_length+1 ; i++) 00731 block_weights[i-1]=1.0/seq_length; 00732 return true; 00733 } 00734 00735 bool CWeightedDegreeStringKernel::init_block_weights_linear() 00736 { 00737 SG_FREE(block_weights); 00738 block_weights=SG_MALLOC(float64_t, seq_length); 00739 00740 for (int32_t i=1; i<seq_length+1 ; i++) 00741 block_weights[i-1]=degree*i; 00742 00743 return true; 00744 } 00745 00746 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly() 00747 { 00748 SG_FREE(block_weights); 00749 block_weights=SG_MALLOC(float64_t, seq_length); 00750 00751 for (int32_t i=1; i<degree+1 ; i++) 00752 block_weights[i-1]=((float64_t) i)*i; 00753 00754 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00755 block_weights[i-1]=i; 00756 00757 return true; 00758 } 00759 00760 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly() 00761 { 00762 SG_FREE(block_weights); 00763 block_weights=SG_MALLOC(float64_t, seq_length); 00764 00765 for (int32_t i=1; i<degree+1 ; i++) 00766 block_weights[i-1]=((float64_t) i)*i*i; 00767 00768 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00769 block_weights[i-1]=i; 00770 return true; 00771 } 00772 00773 bool CWeightedDegreeStringKernel::init_block_weights_exp() 00774 { 00775 SG_FREE(block_weights); 00776 block_weights=SG_MALLOC(float64_t, seq_length); 00777 00778 for (int32_t i=1; i<degree+1 ; i++) 00779 block_weights[i-1]=exp(((float64_t) i/10.0)); 00780 00781 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00782 block_weights[i-1]=i; 00783 00784 return true; 00785 } 00786 00787 bool CWeightedDegreeStringKernel::init_block_weights_log() 00788 { 00789 SG_FREE(block_weights); 00790 block_weights=SG_MALLOC(float64_t, seq_length); 00791 00792 for (int32_t i=1; i<degree+1 ; i++) 00793 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2); 00794 00795 for (int32_t i=degree+1; i<seq_length+1 ; i++) 00796 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2); 00797 00798 return true; 00799 } 00800 00801 bool CWeightedDegreeStringKernel::init_block_weights() 00802 { 00803 switch (type) 00804 { 00805 case E_WD: 00806 return init_block_weights_from_wd(); 00807 case E_EXTERNAL: 00808 return init_block_weights_from_wd_external(); 00809 case E_BLOCK_CONST: 00810 return init_block_weights_const(); 00811 case E_BLOCK_LINEAR: 00812 return init_block_weights_linear(); 00813 case E_BLOCK_SQPOLY: 00814 return init_block_weights_sqpoly(); 00815 case E_BLOCK_CUBICPOLY: 00816 return init_block_weights_cubicpoly(); 00817 case E_BLOCK_EXP: 00818 return init_block_weights_exp(); 00819 case E_BLOCK_LOG: 00820 return init_block_weights_log(); 00821 }; 00822 return false; 00823 } 00824 00825 00826 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p) 00827 { 00828 S_THREAD_PARAM_WD* params = (S_THREAD_PARAM_WD*) p; 00829 int32_t j=params->j; 00830 CWeightedDegreeStringKernel* wd=params->kernel; 00831 CTrie<DNATrie>* tries=params->tries; 00832 float64_t* weights=params->weights; 00833 int32_t length=params->length; 00834 int32_t* vec=params->vec; 00835 float64_t* result=params->result; 00836 float64_t factor=params->factor; 00837 int32_t* vec_idx=params->vec_idx; 00838 00839 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs()); 00840 CAlphabet* alpha=wd->alphabet; 00841 00842 for (int32_t i=params->start; i<params->end; i++) 00843 { 00844 int32_t len=0; 00845 bool free_vec; 00846 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec); 00847 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++) 00848 vec[k]=alpha->remap_to_bin(char_vec[k]); 00849 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec); 00850 00851 ASSERT(tries) 00852 00853 result[i]+=factor* 00854 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]); 00855 } 00856 00857 SG_UNREF(rhs_feat); 00858 00859 return NULL; 00860 } 00861 00862 void CWeightedDegreeStringKernel::compute_batch( 00863 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec, 00864 int32_t* IDX, float64_t* alphas, float64_t factor) 00865 { 00866 ASSERT(tries) 00867 ASSERT(alphabet) 00868 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA) 00869 ASSERT(rhs) 00870 ASSERT(num_vec<=rhs->get_num_vectors()) 00871 ASSERT(num_vec>0) 00872 ASSERT(vec_idx) 00873 ASSERT(result) 00874 create_empty_tries(); 00875 00876 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length(); 00877 ASSERT(num_feat>0) 00878 int32_t num_threads=parallel->get_num_threads(); 00879 ASSERT(num_threads>0) 00880 int32_t* vec=SG_MALLOC(int32_t, num_threads*num_feat); 00881 00882 if (num_threads < 2) 00883 { 00884 #ifdef CYGWIN 00885 for (int32_t j=0; j<num_feat; j++) 00886 #else 00887 CSignal::clear_cancel(); 00888 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++) 00889 #endif 00890 { 00891 init_optimization(num_suppvec, IDX, alphas, j); 00892 S_THREAD_PARAM_WD params; 00893 params.vec=vec; 00894 params.result=result; 00895 params.weights=weights; 00896 params.kernel=this; 00897 params.tries=tries; 00898 params.factor=factor; 00899 params.j=j; 00900 params.start=0; 00901 params.end=num_vec; 00902 params.length=length; 00903 params.vec_idx=vec_idx; 00904 compute_batch_helper((void*) ¶ms); 00905 00906 SG_PROGRESS(j,0,num_feat) 00907 } 00908 } 00909 #ifdef HAVE_PTHREAD 00910 else 00911 { 00912 CSignal::clear_cancel(); 00913 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++) 00914 { 00915 init_optimization(num_suppvec, IDX, alphas, j); 00916 pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1); 00917 S_THREAD_PARAM_WD* params = SG_MALLOC(S_THREAD_PARAM_WD, num_threads); 00918 int32_t step= num_vec/num_threads; 00919 int32_t t; 00920 00921 for (t=0; t<num_threads-1; t++) 00922 { 00923 params[t].vec=&vec[num_feat*t]; 00924 params[t].result=result; 00925 params[t].weights=weights; 00926 params[t].kernel=this; 00927 params[t].tries=tries; 00928 params[t].factor=factor; 00929 params[t].j=j; 00930 params[t].start = t*step; 00931 params[t].end = (t+1)*step; 00932 params[t].length=length; 00933 params[t].vec_idx=vec_idx; 00934 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]); 00935 } 00936 params[t].vec=&vec[num_feat*t]; 00937 params[t].result=result; 00938 params[t].weights=weights; 00939 params[t].kernel=this; 00940 params[t].tries=tries; 00941 params[t].factor=factor; 00942 params[t].j=j; 00943 params[t].start=t*step; 00944 params[t].end=num_vec; 00945 params[t].length=length; 00946 params[t].vec_idx=vec_idx; 00947 compute_batch_helper((void*) ¶ms[t]); 00948 00949 for (t=0; t<num_threads-1; t++) 00950 pthread_join(threads[t], NULL); 00951 SG_PROGRESS(j,0,num_feat) 00952 00953 SG_FREE(params); 00954 SG_FREE(threads); 00955 } 00956 } 00957 #endif 00958 00959 SG_FREE(vec); 00960 00961 //really also free memory as this can be huge on testing especially when 00962 //using the combined kernel 00963 create_empty_tries(); 00964 } 00965 00966 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max) 00967 { 00968 if (type==E_EXTERNAL && max!=0) 00969 return false; 00970 00971 max_mismatch=max; 00972 00973 if (lhs!=NULL && rhs!=NULL) 00974 return init(lhs, rhs); 00975 else 00976 return true; 00977 } 00978 00979 void CWeightedDegreeStringKernel::init() 00980 { 00981 weights=NULL; 00982 weights_degree=0; 00983 weights_length=0; 00984 00985 position_weights=NULL; 00986 position_weights_len=0; 00987 00988 weights_buffer=NULL; 00989 mkl_stepsize=1; 00990 degree=1; 00991 length=0; 00992 00993 max_mismatch=0; 00994 seq_length=0; 00995 00996 block_weights=NULL; 00997 block_computation=true; 00998 type=E_WD; 00999 which_degree=-1; 01000 tries=NULL; 01001 01002 tree_initialized=false; 01003 alphabet=NULL; 01004 01005 lhs=NULL; 01006 rhs=NULL; 01007 01008 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION; 01009 01010 set_normalizer(new CFirstElementKernelNormalizer()); 01011 01012 m_parameters->add_matrix(&weights, &weights_degree, &weights_length, 01013 "weights", "WD Kernel weights."); 01014 m_parameters->add_vector(&position_weights, &position_weights_len, 01015 "position_weights", 01016 "Weights per position."); 01017 SG_ADD(&mkl_stepsize, "mkl_stepsize", "MKL step size.", MS_AVAILABLE); 01018 SG_ADD(°ree, "degree", "Order of WD kernel.", MS_AVAILABLE); 01019 SG_ADD(&max_mismatch, "max_mismatch", 01020 "Number of allowed mismatches.", MS_AVAILABLE); 01021 SG_ADD(&block_computation, "block_computation", 01022 "If block computation shall be used.", MS_NOT_AVAILABLE); 01023 SG_ADD((machine_int_t*) &type, "type", 01024 "WeightedDegree kernel type.", MS_AVAILABLE); 01025 SG_ADD(&which_degree, "which_degree", 01026 "The selected degree. All degrees are used by default (for value -1).", 01027 MS_AVAILABLE); 01028 SG_ADD((CSGObject**) &alphabet, "alphabet", 01029 "Alphabet of Features.", MS_NOT_AVAILABLE); 01030 }