// SHOGUN v3.2.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/ui/GUIFeatures.h> 00013 #include <shogun/ui/SGInterface.h> 00014 00015 #include <shogun/lib/config.h> 00016 #include <shogun/io/SGIO.h> 00017 #include <shogun/io/CSVFile.h> 00018 00019 using namespace shogun; 00020 00021 CGUIFeatures::CGUIFeatures(CSGInterface* ui_) 00022 : CSGObject(), ui(ui_), train_features(NULL), test_features(NULL), 00023 ref_features(NULL) 00024 { 00025 } 00026 00027 CGUIFeatures::~CGUIFeatures() 00028 { 00029 SG_UNREF(train_features); 00030 SG_UNREF(test_features); 00031 SG_UNREF(ref_features); 00032 } 00033 00034 void CGUIFeatures::invalidate_train() 00035 { 00036 CKernel *k = ui->ui_kernel->get_kernel(); 00037 if (k) 00038 k->remove_lhs(); 00039 } 00040 00041 void CGUIFeatures::invalidate_test() 00042 { 00043 CKernel *k = ui->ui_kernel->get_kernel(); 00044 if (k) 00045 k->remove_rhs(); 00046 } 00047 00048 bool CGUIFeatures::load( 00049 char* filename, char* fclass, char* type, char* target, int32_t size, 00050 int32_t comp_features) 00051 { 00052 bool result=false; 00053 CFeatures** f_ptr=NULL; 00054 00055 if (strncmp(target, "TRAIN", 5)==0) 00056 { 00057 f_ptr=&train_features; 00058 invalidate_train(); 00059 } 00060 else if (strncmp(target, "TEST", 4)==0) 00061 { 00062 f_ptr=&test_features; 00063 invalidate_test(); 00064 } 00065 else 00066 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target) 00067 00068 SG_UNREF(*f_ptr); 00069 *f_ptr=NULL; 00070 00071 CCSVFile* file=new CCSVFile(filename); 00072 if (strncmp(fclass, "SIMPLE", 6)==0) 
00073 { 00074 if (strncmp(type, "REAL", 4)==0) 00075 { 00076 *f_ptr=new CDenseFeatures<float64_t>(file); 00077 } 00078 else if (strncmp(type, "BYTE", 4)==0) 00079 { 00081 *f_ptr=new CDenseFeatures<uint8_t>(file); 00082 } 00083 else if (strncmp(type, "CHAR", 4)==0) 00084 { 00086 *f_ptr=new CDenseFeatures<char>(file); 00087 } 00088 else if (strncmp(type, "SHORT", 5)==0) 00089 { 00090 *f_ptr=new CDenseFeatures<int16_t>(file); 00091 } 00092 else 00093 { 00094 SG_ERROR("Unknown type.\n") 00095 return false; 00096 } 00097 } 00098 else if (strncmp(fclass, "SPARSE", 6)==0) 00099 { 00100 SG_NOTIMPLEMENTED 00101 } 00102 else if (strncmp(fclass, "STRING", 6)==0) 00103 { 00104 if (strncmp(type, "REAL", 4)==0) 00105 { 00106 *f_ptr=new CStringFeatures<float64_t>(file); 00107 } 00108 else if (strncmp(type, "BYTE", 4)==0) 00109 { 00111 *f_ptr=new CStringFeatures<uint8_t>(file, DNA); 00112 } 00113 else if (strncmp(type, "CHAR", 4)==0) 00114 { 00116 *f_ptr=new CStringFeatures<char>(file, DNA); 00117 } 00118 else if (strncmp(type, "SHORT", 5)==0) 00119 { 00120 *f_ptr=new CStringFeatures<int16_t>(file); 00121 } 00122 else if (strncmp(type, "WORD", 4)==0) 00123 { 00124 *f_ptr=new CStringFeatures<uint16_t>(file); 00125 } 00126 else if (strncmp(type, "ULONG", 5)==0) 00127 { 00128 *f_ptr=new CStringFeatures<uint64_t>(file); 00129 } 00130 else 00131 { 00132 SG_ERROR("Unknown type.\n") 00133 return false; 00134 } 00135 } 00136 SG_UNREF(file); 00137 00138 return result; 00139 } 00140 00141 bool CGUIFeatures::save(char* filename, char* type, char* target) 00142 { 00143 bool result=false; 00144 00145 CFeatures** f_ptr=NULL; 00146 00147 if (strncmp(target, "TRAIN", 5)==0) 00148 { 00149 f_ptr=&train_features; 00150 } 00151 else if (strncmp(target, "TEST", 4)==0) 00152 { 00153 f_ptr=&test_features; 00154 } 00155 else 00156 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target) 00157 00158 if (*f_ptr) 00159 { 00160 try 00161 { 00162 CCSVFile* file=new CCSVFile(filename, 'w'); 00163 if 
(strncmp(type, "REAL", 4)==0) 00164 { 00165 ((CDenseFeatures<float64_t>*) (*f_ptr))->save(file); 00166 } 00167 else if (strncmp(type, "BYTE", 4)==0) 00168 { 00169 ((CDenseFeatures<uint8_t>*) (*f_ptr))->save(file); 00170 } 00171 else if (strncmp(type, "CHAR", 4)==0) 00172 { 00173 ((CDenseFeatures<char>*) (*f_ptr))->save(file); 00174 } 00175 else if (strncmp(type, "SHORT", 5)==0) 00176 { 00177 ((CDenseFeatures<int16_t>*) (*f_ptr))->save(file); 00178 } 00179 else if (strncmp(type, "WORD", 4)==0) 00180 { 00181 ((CDenseFeatures<uint16_t>*) (*f_ptr))->save(file); 00182 } 00183 else 00184 { 00185 SG_ERROR("Unknown type.\n") 00186 return false; 00187 } 00188 SG_UNREF(file); 00189 } 00190 catch (...) 00191 { 00192 SG_ERROR("Writing to file %s failed!\n", filename) 00193 } 00194 00195 SG_INFO("Successfully written features into \"%s\" !\n", filename) 00196 result=true; 00197 00198 } else 00199 SG_ERROR("Set features first.\n") 00200 00201 return result; 00202 } 00203 00204 bool CGUIFeatures::clean(char* target) 00205 { 00206 if (strncmp(target, "TRAIN", 5)==0) 00207 set_train_features(NULL); 00208 else if (strncmp(target, "TEST", 4)==0) 00209 set_test_features(NULL); 00210 else 00211 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target) 00212 00213 return true; 00214 } 00215 00216 bool CGUIFeatures::reshape(char* target, int32_t num_feat, int32_t num_vec) 00217 { 00218 CFeatures** f_ptr=NULL; 00219 00220 if (strncmp(target, "TRAIN", 5)==0) 00221 { 00222 f_ptr=&train_features; 00223 invalidate_train(); 00224 } 00225 else if (strncmp(target, "TEST", 4)==0) 00226 { 00227 f_ptr=&test_features; 00228 invalidate_test(); 00229 } 00230 else 00231 { 00232 SG_ERROR("Invalid target %s\n", target) 00233 return false; 00234 } 00235 00236 bool result=false; 00237 if (f_ptr) 00238 { 00239 SG_INFO("reshape data to %d x %d\n", num_feat, num_vec) 00240 result=(*f_ptr)->reshape(num_feat, num_vec); 00241 00242 if (!result) 00243 SG_ERROR("Reshaping failed.\n") 00244 } 00245 00246 
return result; 00247 } 00248 00249 CFeatures* CGUIFeatures::get_convert_features(char* target) 00250 { 00251 CFeatures* features; 00252 00253 if (strncmp(target, "TEST", 4)==0) 00254 features=get_test_features(); 00255 else if (strncmp(target, "TRAIN", 5)==0) 00256 features=get_train_features(); 00257 else 00258 return NULL; 00259 00260 if (features->get_feature_class()==C_COMBINED) 00261 features=((CCombinedFeatures*) features)->get_last_feature_obj(); 00262 00263 return features; 00264 } 00265 00266 bool CGUIFeatures::set_convert_features(CFeatures* features, char* target) 00267 { 00268 CFeatures* features_prev; 00269 00270 if (strncmp(target, "TEST", 4)==0) 00271 features_prev=get_test_features(); 00272 else if (strncmp(target, "TRAIN", 5)==0) 00273 features_prev=get_train_features(); 00274 else 00275 return false; 00276 00277 // in case of combined features delete current (==last) feature obj 00278 // pointer from list (feature object got deleted already above) 00279 // and append *f_ptr which holds the newly created feature object 00280 if (features_prev->get_feature_class()==C_COMBINED) 00281 { 00282 CCombinedFeatures* combined=(CCombinedFeatures*) features_prev; 00283 combined->delete_feature_obj(combined->get_num_feature_obj()-1); 00284 combined->append_feature_obj(features); 00285 combined->list_feature_objs(); 00286 } 00287 else // set features to new test/train features 00288 { 00289 if (strncmp(target, "TEST", 4)==0) 00290 set_test_features(features); 00291 else 00292 set_train_features(features); 00293 } 00294 00295 return true; 00296 } 00297 00298 CSparseFeatures<float64_t>* CGUIFeatures::convert_simple_real_to_sparse_real( 00299 CDenseFeatures<float64_t>* src) 00300 { 00301 if (src && 00302 src->get_feature_class()==C_DENSE && 00303 src->get_feature_type()==F_DREAL) 00304 { 00305 //create sparse features with 0 cache 00306 SG_INFO("Attempting to convert dense feature matrix to a sparse one.\n") 00307 CSparseFeatures<float64_t>* target=new 
CSparseFeatures<float64_t>(0); 00308 int32_t num_f=0; 00309 int32_t num_v=0; 00310 float64_t* feats=src->get_feature_matrix(num_f, num_v); 00311 target->set_full_feature_matrix(SGMatrix<float64_t>(feats, num_f, num_v)); 00312 return target; 00313 } 00314 else 00315 SG_ERROR("No SIMPLE DREAL features available.\n") 00316 00317 return NULL; 00318 } 00319 00320 CStringFeatures<char>* CGUIFeatures::convert_simple_char_to_string_char( 00321 CDenseFeatures<char>* src) 00322 { 00323 if (src && src->get_feature_class()==C_DENSE) 00324 { 00325 int32_t num_vec=src->get_num_vectors(); 00326 SGString<char>* strings=SG_MALLOC(SGString<char>, num_vec); 00327 int32_t max_len=-1; 00328 00329 for (int32_t i=0; i<num_vec; i++) 00330 { 00331 bool to_free=false; 00332 int32_t len=0; 00333 char* str=src->get_feature_vector(i, len, to_free); 00334 strings[i].slen=len ; 00335 for (int32_t j=0; j<len; j++) 00336 if (str[j]==0) 00337 { 00338 strings[i].slen=j ; 00339 break ; 00340 } ; 00341 strings[i].string=SG_MALLOC(char, strings[i].slen); 00342 00343 for (int32_t j=0; j<strings[i].slen; j++) 00344 strings[i].string[j]=str[j]; 00345 00346 if (strings[i].slen> max_len) 00347 max_len=strings[i].slen; 00348 00349 src->free_feature_vector(str, i, to_free); 00350 } 00351 00352 CStringFeatures<char>* target=new CStringFeatures<char>(new CAlphabet(DNA)); 00353 target->set_features(strings, num_vec, max_len); 00354 return target; 00355 } 00356 else 00357 SG_ERROR("No features of class/type SIMPLE/CHAR available.\n") 00358 00359 return NULL; 00360 } 00361 00362 CDenseFeatures<float64_t>* CGUIFeatures::convert_simple_word_to_simple_salzberg( 00363 CDenseFeatures<uint16_t>* src) 00364 { 00365 CPluginEstimate* pie=ui->ui_pluginestimate->get_estimator(); 00366 00367 if (src && 00368 src->get_feature_type()==F_WORD && 00369 src->get_feature_class()==C_DENSE && 00370 pie) 00371 { 00372 CDenseFeatures<float64_t>* target=new CDenseFeatures<float64_t>(0); 00373 int32_t num_feat=src->get_num_features(); 
00374 int32_t num_vec=src->get_num_vectors(); 00375 float64_t* fm=SG_MALLOC(float64_t, num_vec*num_feat); 00376 00377 if (fm) 00378 { 00379 for (int32_t i=0; i<num_vec; i++) 00380 { 00381 int32_t len=0; 00382 bool to_free=false; 00383 uint16_t* vec = src->get_feature_vector(i, len, to_free); 00384 ASSERT(num_feat==len) 00385 00386 for (int32_t j=0; j<num_feat; j++) 00387 fm[i*num_feat+j]= 00388 pie->get_parameterwise_log_odds(vec[j], j); 00389 00390 src->free_feature_vector(vec, i, to_free); 00391 } 00392 target->set_feature_matrix(SGMatrix<float64_t>(fm, num_feat, num_vec)); 00393 00394 } 00395 return target; 00396 } 00397 else 00398 SG_ERROR("No SIMPLE WORD features or PluginEstimator available.\n") 00399 00400 return NULL; 00401 } 00402 00403 00404 CTOPFeatures* CGUIFeatures::convert_string_word_to_simple_top( 00405 CStringFeatures<uint16_t>* src) 00406 { 00407 CTOPFeatures* tf=NULL; 00408 00409 if (src && 00410 src->get_feature_class()==C_DENSE && 00411 src->get_feature_type()==F_WORD) 00412 { 00413 SG_INFO("Converting to TOP features.\n") 00414 00415 if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg()) 00416 { 00417 ui->ui_hmm->get_pos()->set_observations(src); 00418 ui->ui_hmm->get_neg()->set_observations(src); 00419 00420 bool neglinear=false; 00421 bool poslinear=false; 00422 00423 tf=new CTOPFeatures( 00424 0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg(), 00425 neglinear, poslinear); 00426 ASSERT(tf->set_feature_matrix()) 00427 } 00428 else 00429 SG_ERROR("HMMs not correctly assigned!\n") 00430 } 00431 else 00432 SG_ERROR("No SIMPLE WORD features available.\n") 00433 00434 return tf; 00435 } 00436 00437 CFKFeatures* CGUIFeatures::convert_string_word_to_simple_fk( 00438 CStringFeatures<uint16_t>* src) 00439 { 00440 CFKFeatures* fkf=NULL; 00441 00442 SG_INFO("Converting to FK features.\n") 00443 00444 if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg()) 00445 { 00446 CStringFeatures<uint16_t>* old_obs_pos= 00447 ui->ui_hmm->get_pos()->get_observations(); 00448 
CStringFeatures<uint16_t>* old_obs_neg= 00449 ui->ui_hmm->get_neg()->get_observations(); 00450 00451 CStringFeatures<uint16_t>* string_feat=src; 00452 ui->ui_hmm->get_pos()->set_observations(string_feat); 00453 ui->ui_hmm->get_neg()->set_observations(string_feat); 00454 00455 fkf=new CFKFeatures( 00456 0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg()); 00457 //, neglinear, poslinear); 00458 if (train_features) 00459 fkf->set_opt_a(((CFKFeatures*) train_features)->get_weight_a()); 00460 else 00461 SG_ERROR("Need train features to set optimal a.\n") 00462 00463 ASSERT(fkf->set_feature_matrix()) 00464 00465 ui->ui_hmm->get_pos()->set_observations(old_obs_pos); 00466 ui->ui_hmm->get_neg()->set_observations(old_obs_neg); 00467 } 00468 else 00469 SG_ERROR("HMMs not correctly assigned!\n") 00470 00471 return fkf; 00472 } 00473 00474 00475 CDenseFeatures<float64_t>* CGUIFeatures::convert_sparse_real_to_simple_real( 00476 CSparseFeatures<float64_t>* src) 00477 { 00478 if (src && 00479 src->get_feature_class()==C_SPARSE && 00480 src->get_feature_type() == F_DREAL) 00481 { 00482 //create dense features with 0 cache 00483 SG_INFO("Attempting to convert sparse feature matrix to a dense one.\n") 00484 CDenseFeatures<float64_t>* rf=new CDenseFeatures<float64_t>(0); 00485 if (rf) 00486 { 00487 SGMatrix<float64_t> feats=src->get_full_feature_matrix(); 00488 rf->set_feature_matrix(feats); 00489 return rf; 00490 } 00491 } 00492 else 00493 SG_ERROR("No SPARSE REAL features available.\n") 00494 00495 return NULL; 00496 } 00497 00498 CExplicitSpecFeatures* CGUIFeatures::convert_string_byte_to_spec_word( 00499 CStringFeatures<uint16_t>* src, bool use_norm) 00500 { 00501 return new CExplicitSpecFeatures(src, use_norm); 00502 } 00503 00504 CDenseFeatures<float64_t>* CGUIFeatures::convert_simple_char_to_simple_align( 00505 CDenseFeatures<char>* src, float64_t gap_cost) 00506 { 00507 if (src && 00508 src->get_feature_class()==C_DENSE && 00509 src->get_feature_type()==F_CHAR) 00510 { 00511 
//create dense features with 0 cache 00512 SG_INFO("Converting CHAR features to REAL ones.\n") 00513 00514 CDenseFeatures<float64_t>* rf=new CDenseFeatures<float64_t>(0); 00515 if (rf) 00516 { 00517 SG_INFO("Start aligment with gapCost=%1.2f.\n", gap_cost) 00518 /*rf->Align_char_features( 00519 src, (CDenseFeatures<char>*) ref_features, gap_cost);*/ 00520 SG_INFO("Conversion was successful.\n") 00521 return rf; 00522 } 00523 } 00524 else 00525 SG_ERROR("No SIMPLE CHAR features available.\n") 00526 00527 SG_ERROR("Conversion failed.\n") 00528 return NULL; 00529 } 00530 00531 bool CGUIFeatures::set_reference_features(char* target) 00532 { 00533 if (strncmp(target, "TRAIN", 5)==0) 00534 { 00535 SG_UNREF(ref_features); 00536 ref_features=train_features; 00537 train_features=NULL; 00538 invalidate_train(); 00539 return true; 00540 } 00541 else if (strncmp(target, "TEST", 4)==0) 00542 { 00543 SG_UNREF(ref_features); 00544 ref_features=test_features; 00545 test_features=NULL; 00546 invalidate_test(); 00547 return true; 00548 } 00549 00550 return false; 00551 } 00552 00553 void CGUIFeatures::add_train_features(CFeatures* f) 00554 { 00555 ASSERT(f) 00556 invalidate_train(); 00557 00558 if (!train_features) 00559 { 00560 train_features=new CCombinedFeatures(); 00561 SG_REF(train_features); 00562 } 00563 00564 if (train_features->get_feature_class()!=C_COMBINED) 00565 { 00566 CFeatures* first_elem=train_features; 00567 train_features=new CCombinedFeatures(); 00568 SG_REF(train_features); 00569 ((CCombinedFeatures*) train_features)->append_feature_obj(first_elem); 00570 ((CCombinedFeatures*) train_features)->list_feature_objs(); 00571 SG_UNREF(first_elem); 00572 } 00573 00574 bool result=((CCombinedFeatures*) train_features)->append_feature_obj(f); 00575 if (result) 00576 ((CCombinedFeatures*) train_features)->list_feature_objs(); 00577 else 00578 SG_ERROR("appending feature object failed\n") 00579 } 00580 00581 void CGUIFeatures::add_train_dotfeatures(CDotFeatures* f) 00582 { 
00583 ASSERT(f) 00584 SG_PRINT("DOTFVEC %d\n", f->get_num_vectors()) 00585 invalidate_train(); 00586 00587 if (!train_features) 00588 { 00589 train_features=new CCombinedDotFeatures(); 00590 SG_REF(train_features); 00591 } 00592 00593 if (train_features->get_feature_class()!=C_COMBINED_DOT) 00594 { 00595 if (!train_features->has_property(FP_DOT)) 00596 SG_ERROR("Trainfeatures not based on DotFeatures.\n") 00597 00598 CDotFeatures* first_elem=(CDotFeatures*) train_features; 00599 train_features=new CCombinedDotFeatures(); 00600 SG_REF(train_features); 00601 ((CCombinedDotFeatures*) train_features)->append_feature_obj(first_elem); 00602 ((CCombinedDotFeatures*) train_features)->list_feature_objs(); 00603 SG_UNREF(first_elem); 00604 } 00605 00606 bool result=((CCombinedDotFeatures*) train_features)->append_feature_obj(f); 00607 if (result) 00608 ((CCombinedDotFeatures*) train_features)->list_feature_objs(); 00609 else 00610 SG_ERROR("appending dot feature object failed\n") 00611 } 00612 00613 void CGUIFeatures::add_test_dotfeatures(CDotFeatures* f) 00614 { 00615 ASSERT(f) 00616 invalidate_test(); 00617 00618 if (!test_features) 00619 { 00620 test_features=new CCombinedDotFeatures(); 00621 SG_REF(test_features); 00622 } 00623 00624 if (test_features->get_feature_class()!=C_COMBINED_DOT) 00625 { 00626 if (!test_features->has_property(FP_DOT)) 00627 SG_ERROR("Trainfeatures not based on DotFeatures.\n") 00628 00629 CDotFeatures* first_elem=(CDotFeatures*) test_features; 00630 test_features=new CCombinedDotFeatures(); 00631 SG_REF(test_features); 00632 ((CCombinedDotFeatures*) test_features)->append_feature_obj(first_elem); 00633 ((CCombinedDotFeatures*) test_features)->list_feature_objs(); 00634 SG_UNREF(first_elem); 00635 } 00636 00637 bool result=((CCombinedDotFeatures*) test_features)->append_feature_obj(f); 00638 if (result) 00639 ((CCombinedDotFeatures*) test_features)->list_feature_objs(); 00640 else 00641 SG_ERROR("Appending feature object failed.\n") 00642 } 00643 
00644 void CGUIFeatures::add_test_features(CFeatures* f) 00645 { 00646 ASSERT(f) 00647 invalidate_test(); 00648 00649 if (!test_features) 00650 { 00651 test_features=new CCombinedFeatures(); 00652 SG_REF(test_features); 00653 } 00654 00655 if (test_features->get_feature_class()!=C_COMBINED) 00656 { 00657 CFeatures* first_elem=test_features; 00658 test_features=new CCombinedFeatures(); 00659 SG_REF(test_features); 00660 ((CCombinedFeatures*) test_features)->append_feature_obj(first_elem); 00661 ((CCombinedFeatures*) test_features)->list_feature_objs(); 00662 SG_UNREF(first_elem); 00663 } 00664 00665 bool result=((CCombinedFeatures*) test_features)->append_feature_obj(f); 00666 if (result) 00667 ((CCombinedFeatures*) test_features)->list_feature_objs(); 00668 else 00669 SG_ERROR("Appending feature object failed.\n") 00670 } 00671 00672 bool CGUIFeatures::del_last_feature_obj(char* target) 00673 { 00674 CCombinedFeatures* cf=NULL; 00675 if (strncmp(target, "TRAIN", 5)==0) 00676 { 00677 if (!train_features) 00678 SG_ERROR("No train features available.\n") 00679 if (train_features->get_feature_class()!=C_COMBINED) 00680 SG_ERROR("Train features are not combined features.\n") 00681 00682 cf=(CCombinedFeatures*) train_features; 00683 } 00684 else if (strncmp(target, "TEST", 4)==0) 00685 { 00686 if (!test_features) 00687 SG_ERROR("No test features available.\n") 00688 if (test_features->get_feature_class()!=C_COMBINED) 00689 SG_ERROR("Test features are not combined features.\n") 00690 00691 cf=(CCombinedFeatures*) test_features; 00692 } 00693 else 00694 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target) 00695 00696 if (!cf->delete_feature_obj(cf->get_num_feature_obj()-1)) 00697 SG_ERROR("No features available to delete.\n") 00698 00699 return false; 00700 }