Marsyas
0.6.0-alpha
|
00001 00008 #include "WekaSource.h" 00009 #include "../common_source.h" 00010 #include <stdexcept> 00011 #include <sstream> 00012 #include <string> 00013 #include <vector> 00014 00015 using namespace std; 00016 using namespace Marsyas; 00017 00018 class argument_list_stream 00019 { 00020 istringstream m_stream; 00021 bool m_ok; 00022 00023 public: 00024 argument_list_stream(const string & text): 00025 m_stream(text), 00026 m_ok(true) 00027 {} 00028 00029 operator bool() { return m_ok; } 00030 00031 template <typename T> 00032 argument_list_stream & operator>>(T & value) 00033 { 00034 string part; 00035 if (std::getline(m_stream, part, ',')) 00036 { 00037 istringstream part_stream(part); 00038 if (part_stream >> value) 00039 { 00040 istringstream::sentry can_read_more(part_stream); 00041 if (!can_read_more) 00042 return *this; 00043 } 00044 } 00045 00046 m_ok = false; 00047 value = T(); 00048 return *this; 00049 } 00050 }; 00051 00052 WekaSource::WekaSource(mrs_string name):MarSystem("WekaSource",name) 00053 { 00054 addControls(); 00055 validationModeEnum_ = None; 00056 currentIndex_ = 0; 00057 } 00058 00059 WekaSource::~WekaSource() 00060 { 00061 data_.Clear(); 00062 useTestSetData_.Clear(); 00063 } 00064 00065 WekaSource::WekaSource(const WekaSource& a) : MarSystem(a) { 00066 validationModeEnum_ = None; 00067 currentIndex_ = 0; 00068 ctrl_regression_ = getctrl("mrs_bool/regression"); 00069 } 00070 00071 MarSystem *WekaSource::clone() const 00072 { 00073 return new WekaSource(*this); 00074 } 00075 00076 void 00077 WekaSource::addControls() 00078 { 00079 addctrl("mrs_string/filename", ""); 00080 setctrlState("mrs_string/filename", true); 00081 00082 //comma seperated list of attributes to extract from the feature data 00083 //Can be attribute name, index, or range of indexes. 00084 //ie: "Mean_Mem40_Centroid, Mean_Mem40_Kurtosis, 4-7, 9, .... " 00085 addctrl("mrs_string/attributesToInclude", ""); 00086 setctrlState("mrs_string/attributesToInclude", true); 00087 00088 //comma seperated list of class names found in feature file 00089 //ie: "Music, Speech, .... " 00090 addctrl("mrs_string/classNames", ""); 00091 00092 //number of classes found 00093 addctrl("mrs_natural/nClasses", 0); 00094 addctrl("mrs_bool/regression", false, ctrl_regression_); 00095 00096 //The mode that the weka source is currently in. 00097 //Can be "train" or "predict" 00098 addctrl("mrs_string/mode", "train"); 00099 00100 //number of output samples will always be 1, regardless of the input samples 00101 setctrl("mrs_natural/onSamples", 1 ); //FIXME: this should not be done here but in myProcess instead... 00102 00103 //number of attributes and attribute names that will be reported. 00104 addctrl("mrs_natural/nAttributes", 0); 00105 addctrl("mrs_string/attributeNames", ""); 00106 00107 //type of classifier validation to do. 00108 //Blank or not set means none. 00109 //Other supported types: 00110 //"kFold,[S,NS],xx" where xx is an integer 2-10 00111 //if S is specified, use Stratified 00112 //if NS is specified, use Non-Stratified 00113 //"UseTestSet,wekafilename" 00114 //"PercentageSplit,percent" where percent is 1-99 00115 //others to come 00116 addctrl("mrs_string/validationMode", ""); 00117 addctrl("mrs_bool/done", false); 00118 00119 addctrl("mrs_natural/nInstances", 0); 00120 00121 // The current filename that we are processing 00122 // We get this from looking for comment strings that begin with "% 00123 // filename" 00124 addctrl("mrs_string/currentFilename", ""); 00125 addctrl("mrs_real/currentSrate", 22050.0); 00126 00127 //TODO: lmartins: document... 00128 addctrl("mrs_realvec/instanceIndexes", realvec()); 00129 00130 addctrl("mrs_bool/normMaxMin", false); 00131 00132 } 00133 00134 void 00135 WekaSource::myUpdate(MarControlPtr sender) 00136 { 00137 (void) sender; //suppress warning of unused parameter(s) 00138 MRSDIAG("WekaSource.cpp - WekaSource:myUpdate"); 00139 00140 // If 'filename' was updated, or the attributes desired from the Weka file has changed, 00141 // parse the header portion of the file to get the required attribute names and possible output labels (if any)... 00142 if (filename_ == getctrl("mrs_string/filename")->to<mrs_string>()) 00143 return; 00144 00145 this->updControl("mrs_bool/done", false); 00146 filename_ = getctrl("mrs_string/filename")->to<mrs_string>(); 00147 attributesToInclude_ = getctrl("mrs_string/attributesToInclude")->to<mrs_string>(); 00148 00149 mrs_bool normMaxMin = getctrl("mrs_bool/normMaxMin")->to<mrs_bool>(); 00150 00151 loadFile(filename_, attributesToInclude_, data_); 00152 if (normMaxMin) 00153 { 00154 data_.NormMaxMin(); 00155 } 00156 00157 // data_.Dump("org.txt", classesFound_); 00158 00159 mrs_string names; 00160 bool first = true; 00161 for(vector<mrs_string>::const_iterator citer = classesFound_.begin(); citer!= classesFound_.end(); citer++) 00162 { 00163 if(!first) 00164 names += ","; 00165 00166 names += (*citer); 00167 first = false; 00168 } 00169 setctrl("mrs_string/classNames", names); 00170 setctrl("mrs_natural/nClasses", (mrs_natural)classesFound_.size()); 00171 00172 names = ""; 00173 first = true; 00174 mrs_natural index = 0; 00175 for(vector<mrs_string>::const_iterator citer = attributesFound_.begin(); citer!= attributesFound_.end(); citer++,index++) 00176 { 00177 if(attributesIncluded_[index]) 00178 { 00179 if(!first) 00180 names += ","; 00181 00182 names += (*citer); 00183 first = false; 00184 }//if 00185 } 00186 MRSASSERT(index == (mrs_natural)attributesIncluded_.size()); 00187 00188 setctrl("mrs_string/attributeNames", names); 00189 ctrl_onObsNames_->setValue(names); 00190 setctrl("mrs_natural/onSamples", 1); 00191 setctrl("mrs_natural/nAttributes", (mrs_natural)attributesFound_.size()); 00192 setctrl("mrs_natural/onObservations", (mrs_natural)attributesFound_.size()+1); 00193 setctrl("mrs_natural/nInstances", (mrs_natural)data_.getRows()); 00194 00195 00196 validationModeEnum_ = None; 00197 currentIndex_ = 0; 00198 00199 const mrs_string & validation_mode_spec = getctrl("mrs_string/validationMode")->to<mrs_string>(); 00200 if (validation_mode_spec.empty()) 00201 return; 00202 00203 argument_list_stream validation_mode_args(validation_mode_spec); 00204 00205 string v_mode_error_msg; 00206 { 00207 ostringstream text; 00208 text << "WekaSource: Error in value of control 'validationMode' = " 00209 << "\"" << validation_mode_spec << "\":"; 00210 v_mode_error_msg = text.str(); 00211 } 00212 00213 string validation_mode; 00214 if (!(validation_mode_args >> validation_mode)) 00215 { 00216 MRSERR(v_mode_error_msg << " Can not parse validation mode."); 00217 return; 00218 } 00219 00220 if (validation_mode == "OutputInstancePair") 00221 { 00222 validationModeEnum_ = OutputInstancePair; 00223 MarControlAccessor acc(getctrl("mrs_realvec/instanceIndexes")); 00224 realvec& instIdxs = acc.to<mrs_realvec>(); 00225 instIdxs.create(0.0, 1, 2); //init row vector 00226 setctrl("mrs_natural/onSamples", 2); 00227 return; 00228 } 00229 00230 if(validation_mode == "kFold") 00231 { //Validation mode is Folding, now extract the fold count. 00232 00233 string fold_type_str; 00234 if (!(validation_mode_args >> fold_type_str)) 00235 { 00236 MRSERR(v_mode_error_msg << " Could not parse fold type."); 00237 return; 00238 } 00239 00240 ValidationModeEnum fold_type; 00241 if(fold_type_str == "NS") 00242 fold_type = kFoldNonStratified; 00243 else if(fold_type_str == "S") 00244 fold_type = kFoldStratified; 00245 else 00246 { 00247 MRSERR(v_mode_error_msg << " Invalid fold type: " << fold_type_str); 00248 return; 00249 } 00250 00251 mrs_natural fold_count = -1; 00252 if ( !(validation_mode_args >> fold_count) ) 00253 { 00254 MRSERR(v_mode_error_msg << " Can not parse fold count."); 00255 return; 00256 } 00257 if ( !(fold_count >= 2 && fold_count <= 10) ) 00258 { 00259 MRSERR(v_mode_error_msg << " Invalid fold count: " << fold_count); 00260 return; 00261 } 00262 00263 validationModeEnum_ = fold_type; 00264 foldCount_ = fold_count; 00265 00266 data_.Shuffle(); 00267 // data_.Dump("shuffle.txt", classesFound_); 00268 00269 if( validationModeEnum_ != kFoldStratified) 00270 { 00271 cout << "=== Non-Stratified cross-validation (" << foldCount_ << " folds) ===" << endl; 00272 00273 //in non-stratified mode we simply use all the available data 00274 foldData_.SetupkFoldSections(data_, foldCount_); 00275 } 00276 else 00277 { 00278 cout << "=== Stratified cross-validation (" << foldCount_ << " folds) ===" << endl; 00279 00280 //in non-stratified we seperate the data according to class 00281 foldClassData_.clear(); 00282 foldClassData_.resize(classesFound_.size()); 00283 00284 //load each dataset with rows for each class 00285 for(mrs_natural ii=0; ii<(mrs_natural)classesFound_.size(); ++ii) 00286 { 00287 WekaFoldData data; 00288 data.setFold(true); 00289 data.SetupkFoldSections(data_, foldCount_, ii); 00290 foldClassData_[ii] = data; 00291 } 00292 foldClassDataIndex_ = 0; 00293 } 00294 00295 foldCurrentMode_ = foldNextMode_ = WekaFoldData::Training; 00296 00297 }//if "kFold" 00298 else if(validation_mode == "UseTestSet") 00299 { 00300 string test_set_filename; 00301 validation_mode_args >> test_set_filename; 00302 if (test_set_filename.empty()) 00303 { 00304 MRSERR(v_mode_error_msg << " Can not parse test set filename."); 00305 return; 00306 } 00307 00308 loadFile(test_set_filename, attributesToInclude_, useTestSetData_); 00309 MRSASSERT(data_.getCols()==useTestSetData_.getCols()); 00310 00311 cout << "=== Evaluation on test set === (" << test_set_filename << ") ===" << endl; 00312 00313 validationModeEnum_ = UseTestSet; 00314 currentIndex_ = 0; 00315 00316 }//else if "UseTestSet" 00317 else if(validation_mode == "PercentageSplit") 00318 { 00319 mrs_natural percentage_split = -1; 00320 00321 if ( !(validation_mode_args >> percentage_split) ) 00322 { 00323 MRSERR(v_mode_error_msg << " Can not parse percentage split."); 00324 return; 00325 } 00326 00327 if ( !(percentage_split > 0 && percentage_split < 100) ) 00328 { 00329 MRSERR(v_mode_error_msg << " Invalid percentage split: " << percentage_split); 00330 return; 00331 } 00332 00333 cout << "=== Evaluation on percentage split " << percentage_split << "% ===" << endl; 00334 00335 data_.Shuffle(); 00336 data_.Dump("shuffle.txt", classesFound_); 00337 00338 percentageIndex_ = ((mrs_natural)data_.size() * percentage_split) / 100; 00339 percentageIndex_--; //adjust to count from 0 00340 if(percentageIndex_ < 1) percentageIndex_ = 1; 00341 00342 validationModeEnum_ = PercentageSplit; 00343 currentIndex_ = 0; 00344 } //else if "PercentageSplit" 00345 else 00346 { 00347 MRSERR("Invalid validation mode: " << validation_mode); 00348 return; 00349 } 00350 }//myUpdate 00351 00352 void WekaSource::myProcess(realvec& in,realvec &out) 00353 { 00354 (void) in; 00355 00356 if(getctrl("mrs_bool/done")->to<mrs_bool>()) return; 00357 bool trainMode = (strcmp(getctrl("mrs_string/mode")->to<mrs_string>().c_str(), "train") == 0); 00358 switch(validationModeEnum_) 00359 { 00360 case kFoldNonStratified: 00361 handleFoldingNonStratifiedValidation(trainMode, out); 00362 break; 00363 case kFoldStratified: 00364 handleFoldingStratifiedValidation(trainMode, out); 00365 break; 00366 case UseTestSet: 00367 handleUseTestSet(trainMode, out); 00368 break; 00369 case PercentageSplit: 00370 handlePercentageSplit(trainMode, out); 00371 break; 00372 case OutputInstancePair: 00373 handleInstancePair(out); 00374 break; 00375 default: 00376 handleDefault(trainMode, out); 00377 }//switch 00378 }//myProcess 00379 00380 00381 void 00382 WekaSource::handleDefault(bool trainMode, realvec &out) 00383 { 00384 //FIXME: Unused parameter 00385 (void) trainMode; 00386 00387 if(currentIndex_ >= (mrs_natural)data_.size()) 00388 { 00389 this->updControl("mrs_bool/done", true); 00390 return; 00391 } 00392 00393 vector<mrs_real> *row = NULL; 00394 mrs_string fname = data_.GetFilename(currentIndex_); 00395 row = data_.at(currentIndex_++); 00396 00397 for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii) 00398 { 00399 out(ii, 0) = row->at(ii); 00400 this->updControl("mrs_string/currentFilename", fname); //???: why are we always updating this control to fname?? (which does not change inside the for loop...) 00401 } 00402 } 00403 00404 void 00405 WekaSource::handleInstancePair(realvec& out) 00406 { 00407 const realvec& instIdxs = getctrl("mrs_realvec/instanceIndexes")->to<mrs_realvec>(); 00408 00409 mrs_natural i = (mrs_natural)instIdxs(0); 00410 mrs_natural j = (mrs_natural)instIdxs(1); 00411 00412 if(i >= (mrs_natural)data_.size() || j >= (mrs_natural)data_.size()) 00413 { 00414 //this->updControl("mrs_bool/done", true); //!!!: done? 00415 MRSWARN("WekaSource::handlePair - out of bound file indexes!"); 00416 return; 00417 } 00418 00419 vector<mrs_real> *rowi = NULL; 00420 vector<mrs_real> *rowj = NULL; 00421 00422 mrs_string fnamei = data_.GetFilename(i); 00423 mrs_string fnamej = data_.GetFilename(j); 00424 00425 rowi = data_.at(i); 00426 rowj = data_.at(j); 00427 00428 for(mrs_natural ii=0; ii<(mrs_natural)rowi->size(); ++ii) 00429 { 00430 out(ii, 0) = rowi->at(ii); 00431 out(ii, 1) = rowj->at(ii); 00432 } 00433 this->updControl("mrs_string/currentFilename", fnamei+"_"+fnamej); 00434 00435 } 00436 00437 void WekaSource::handlePercentageSplit(bool trainMode, realvec &out) 00438 { 00439 vector<mrs_real> *row = NULL; 00440 00441 if(trainMode) 00442 { 00443 if (currentIndex_ >= percentageIndex_) 00444 { 00445 this->updControl("mrs_string/mode", "predict"); 00446 trainMode = false; 00447 } 00448 else 00449 { 00450 row = data_.at(currentIndex_++); 00451 } 00452 } 00453 00454 if(!trainMode) 00455 { 00456 if(currentIndex_ >= (mrs_natural)data_.size()) 00457 { 00458 this->updControl("mrs_bool/done", true); 00459 return; 00460 } 00461 else 00462 { 00463 row = data_.at(currentIndex_++); 00464 } 00465 } 00466 00467 // MRSASSERT(row->size()==out.getCols()); //[!] 00468 for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii) 00469 { 00470 out(ii, 0) = row->at(ii); 00471 } 00472 } //handlePercentageSplit 00473 00474 void WekaSource::handleUseTestSet(bool trainMode, realvec &out) 00475 { 00476 vector<mrs_real> *row = NULL; 00477 if(trainMode) 00478 { 00479 if(currentIndex_ >= (mrs_natural)data_.size()) 00480 { 00481 this->updControl("mrs_string/mode", "predict"); 00482 trainMode = false; 00483 currentIndex_ = 0; 00484 } 00485 else 00486 { 00487 row = data_.at(currentIndex_++); 00488 } 00489 } 00490 00491 if(!trainMode) 00492 { 00493 if(currentIndex_ >= (mrs_natural)useTestSetData_.size()) 00494 { 00495 this->updControl("mrs_bool/done", true); 00496 currentIndex_ = 0; 00497 return; 00498 } 00499 else 00500 { 00501 row = useTestSetData_.at(currentIndex_++); 00502 } 00503 } 00504 00505 MRSASSERT((mrs_natural)row->size() == out.getCols()); 00506 for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii) 00507 { 00508 out(ii, 0 ) = row->at(ii); 00509 } 00510 }//handleUseTestSet 00511 00512 void WekaSource::handleFoldingStratifiedValidation(bool trainMode, realvec &out) 00513 { 00514 (void) trainMode; 00515 00516 if (foldCurrentMode_ != foldNextMode_) 00517 { 00518 foldClassDataIndex_++; 00519 if(foldClassDataIndex_ >= (mrs_natural)foldClassData_.size()) 00520 { 00521 foldClassDataIndex_ = 0; 00522 foldCurrentMode_ = foldNextMode_; 00523 switch(foldCurrentMode_) 00524 { 00525 case WekaFoldData::Training: 00526 updControl("mrs_string/mode", "train"); 00527 break; 00528 case WekaFoldData::Predict: 00529 updControl("mrs_string/mode", "predict"); 00530 break; 00531 case WekaFoldData::None: 00532 updControl("mrs_bool/done", true); 00533 return; 00534 } 00535 } 00536 } 00537 00538 vector<mrs_real> *row = foldClassData_[foldClassDataIndex_].Next(foldNextMode_); 00539 00540 MRSASSERT((mrs_natural)row->size() == out.getRows()); 00541 for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii) 00542 { 00543 out(ii, 0) = row->at(ii); 00544 } 00545 } 00546 00547 void WekaSource::handleFoldingNonStratifiedValidation(bool trainMode, realvec &out) 00548 { 00549 (void) trainMode; 00550 00551 if( foldCurrentMode_ != foldNextMode_ ) 00552 { 00553 foldCurrentMode_ = foldNextMode_; 00554 switch (foldCurrentMode_) 00555 { 00556 case WekaFoldData::Training: 00557 updControl("mrs_string/mode", "train"); 00558 break; 00559 case WekaFoldData::Predict: 00560 updControl("mrs_string/mode", "predict"); 00561 break; 00562 case WekaFoldData::None: 00563 updControl("mrs_bool/done", true); 00564 return; 00565 } 00566 } 00567 00568 vector<mrs_real> *row = foldData_.Next(foldNextMode_); 00569 00570 MRSASSERT((mrs_natural) row->size() == out.getRows()); 00571 00572 for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii) 00573 { 00574 out(ii, 0) = row->at(ii); 00575 } 00576 } 00577 00578 void WekaSource::loadFile(const std::string& filename, const std::string& attributesToExtract, WekaData& data) 00579 { 00580 ifstream mis; 00581 00582 mis.open(filename.c_str()); 00583 00584 if (!mis.is_open()) { 00585 std::string msg = std::string("WekaSource: could not open file: ") + filename; 00586 MRSERR(msg); 00587 throw std::runtime_error(msg); 00588 } 00589 00590 data_.Clear(); 00591 00592 parseHeader(mis, filename, attributesToExtract); 00593 00594 parseData(mis, filename, data); 00595 00596 mis.close(); 00597 }//loadFile 00598 00599 void WekaSource::parseHeader(ifstream& mis, const mrs_string& filename, const std::string& attributesToExtract) 00600 { 00601 (void) attributesToExtract; // FIXME: suspiciously not used! 00602 (void) filename; // only used for debug messages; see ifstream& mis 00603 // FIXME: This method does not parse all valid relation or attribute names. 00604 // The ARFF spec allows for names that include spaces, iff those 00605 // names are quoted. 00606 // FIXME: Parsing errors should probably be fatal. 00607 00608 char str[1024]; 00609 // skip comment lines 00610 while (mis.peek() == '%') 00611 { 00612 mis.getline(str, 1023); 00613 } 00614 00615 00616 mrs_string token1,token2,token3; 00617 mrs_string whitespace = " \t\v\f\r\n"; 00618 mrs_string::size_type startpos; 00619 00620 // Read in the relation line 00621 mis >> token1; 00622 getline(mis, token2); 00623 00624 // Strip leading whitespace from the relation name 00625 startpos = token2.find_first_not_of(whitespace); 00626 if (startpos != mrs_string::npos) 00627 { 00628 token2 = token2.substr(startpos); 00629 } 00630 00631 if ((token1 != "@relation")&&(token1 != "@RELATION")) 00632 { 00633 MRSERR("Badly formatted .arff file: file must begin with @relation." + token1); 00634 return; 00635 } 00636 if (token2.find("\t") != mrs_string::npos) 00637 { 00638 MRSERR("Badly formatted .arff file: Relation name cannot contain tab characters."); 00639 return; 00640 } 00641 if (token2.find_first_of(whitespace) != mrs_string::npos) 00642 { 00643 MRSERR("Badly formatted .arff file: Marsyas cannot handle relation names with whitespace."); 00644 return; 00645 } 00646 relation_ = token2; 00647 00648 attributesFound_.clear(); 00649 attributesIncluded_.clear(); 00650 classesFound_.clear(); 00651 attributesIncludedList_.clear(); 00652 00653 // Parse the attribute definitions and store their names... 00654 //ie: @attribute Mean_Mem40_Centroid real 00655 while( mis >> token1 && (token1 == "@attribute" || (token1 == "@ATTRIBUTE"))) 00656 { 00657 mis >> token2; 00658 getline(mis, token3); 00659 00660 // skip leading spaces of token3 00661 startpos = token3.find_first_not_of(" \t"); 00662 if (mrs_string::npos != startpos) 00663 token3 = token3.substr(startpos); 00664 00665 if ((token3 == "real") || (token3 == "REAL")) 00666 { 00667 attributesFound_.push_back(token2); 00668 attributesIncluded_.push_back(true); 00669 } 00670 else if (token3[0] == '{') 00671 { 00672 mrs_string token = token3.substr( 1, token3.length()-2 ); // Remove curly braces 00673 00674 std::stringstream tokenStream(token); 00675 std::string cell; 00676 while(std::getline(tokenStream,cell,',')) 00677 { 00678 classesFound_.push_back(cell); 00679 } 00680 } 00681 else 00682 { 00683 attributesFound_.push_back(token2); 00684 attributesIncluded_.push_back(false); 00685 MRSWARN("Incompatible datatype " + token3 + " found in file '" + filename + "'. " + 00686 "attribute " + token2 + "will be ignored!"); 00687 }//else 00688 }//while 00689 00690 if ((token1 != "@data") && (token1 != "@DATA")) { 00691 MRSERR("Badly formatted .arff file: Finished parsing attributes but did not find @data section."); 00692 } 00693 00694 //Now we parse the attributes to include string and decide which attributes 00695 //are to be extracted from the arff file. An empty include list means all 00696 //attributes. 00697 00698 for(vector<mrs_string>::const_iterator citer = attributesFound_.begin(); citer!= attributesFound_.end(); citer++) 00699 { 00700 } 00701 00702 if (classesFound_.size() == 0) { 00703 ctrl_regression_->setValue(true); 00704 // remove the final "output" attribute 00705 attributesFound_.pop_back(); 00706 attributesIncluded_.pop_back(); 00707 } 00708 00709 parseAttributesToInclude(attributesToInclude_); 00710 }//parseHeader 00711 00712 void WekaSource::parseData(ifstream& mis, const mrs_string& filename, WekaData& data) 00713 { 00714 // FIXME Unused parameter 00715 (void) filename; 00716 mrs_string currentFname; 00717 00718 MRSASSERT(!mis.eof()); 00719 00720 data.Create((mrs_natural)attributesIncludedList_.size()+1); 00721 00722 char str[1024]; 00723 00724 while (mis.peek() == '%') // skip comment lines 00725 { 00726 mis.getline(str, 1023); 00727 } 00728 00729 00730 mrs_string token; 00731 // mis >> token; 00732 00733 while (token == "") 00734 getline(mis, token); 00735 00736 mrs_natural lineCount = 0; 00737 while(!mis.eof()) 00738 { 00739 char *cp = (char *)token.c_str(); 00740 if (cp[0] != '%') 00741 { 00742 cp = strtok(cp, ","); 00743 00744 vector<mrs_real> *lineBuffer = new vector<mrs_real>(attributesIncludedList_.size()+1); 00745 00746 mrs_natural index = 0; 00747 for(mrs_natural ii=0; ii < (mrs_natural)attributesFound_.size(); ++ii) 00748 { 00749 MRSASSERT( cp!=NULL ); 00750 if(attributesIncluded_[ii]) 00751 { 00752 lineBuffer->at(index++) = ::atof( cp ); 00753 } 00754 cp = strtok(NULL, ","); 00755 }//for index 00756 MRSASSERT(index == (mrs_natural)lineBuffer->size()-1); 00757 00758 if (ctrl_regression_->isTrue()) { 00759 // no change needed 00760 lineBuffer->at(index) = ::atof( cp ); 00761 } else { 00762 //now extract the class name for this record 00763 MRSASSERT( cp!=NULL ); 00764 00765 mrs_natural classIndex = findClass(cp); 00766 MRSASSERT(classIndex>=0); 00767 lineBuffer->at(index) = (mrs_real)classIndex; 00768 } 00769 00770 data.Append(lineBuffer); 00771 data.AppendFilename(currentFname); 00772 lineCount++; 00773 00774 // apparently not copied? 00775 //delete lineBuffer; // info was copied in Append() 00776 } 00777 else // skip comment line 00778 { 00779 // mis.getline(str, 1023); 00780 // If the line starts with "% filename" set the current_filename 00781 if (strncmp(token.c_str(),"% filename",10) == 0) { 00782 currentFname = token.substr(11); 00783 } 00784 00785 if (strncmp(token.c_str(),"% srate",7) == 0) { 00786 mrs_real currentSrate = atof(token.substr(8).c_str()); 00787 this->updControl("mrs_real/currentSrate", currentSrate); 00788 } 00789 } 00790 getline(mis,token); 00791 }//while 00792 00793 00794 00795 } 00796 00797 //Given a string, check if it is an class found in the arff file header. 00798 //If it is, return its index, otherwise return -1 00799 mrs_natural WekaSource::findClass(const char *className) const 00800 { 00801 MRSASSERT(className!=NULL); 00802 mrs_natural index = 0; 00803 for(vector<mrs_string>::const_iterator citer = classesFound_.begin(); citer!= classesFound_.end(); citer++,index++) 00804 { 00805 if(*citer == className) 00806 return index; 00807 }//for citer 00808 return -1; 00809 }//FindClass 00810 00811 //Given a string, check if it is an attribute found in the arff file header. 00812 //If it is, return its index, otherwise return -1 00813 mrs_natural WekaSource::findAttribute(const char *attribute) const 00814 { 00815 MRSASSERT(attribute!=NULL); 00816 00817 mrs_natural index = 0; 00818 for(vector<mrs_string>::const_iterator citer = attributesFound_.begin(); citer!= attributesFound_.end(); citer++,index++) 00819 { 00820 if(*citer == attribute) 00821 return index; 00822 }//for citer 00823 return -1; 00824 }//FindAttribute 00825 00826 //Given a string, determine if it is an attribute name or an integer 00827 //It is an attribute name if it is contained in the list of attributes 00828 //parsed from the header of the arff file. 00829 //If it is neither 00830 mrs_natural WekaSource::parseAttribute(const char *attribute) const 00831 { 00832 MRSASSERT(attribute!=NULL); 00833 00834 //check for attribute in list found in header and if found, 00835 //return its index. 00836 mrs_natural ret = findAttribute(attribute); 00837 if(ret >= 0) return ret; 00838 00839 //otherwise, check if the string is a valid integer. If not return -1 00840 for(mrs_natural ii=0; attribute[ii]!='\0'; ++ii) 00841 if(!isdigit(attribute[ii])) 00842 return -1; 00843 00844 //otherwise return the index. 00845 return ::atoi(attribute); 00846 }//parseAttribute 00847 00848 //Given an attribute string check if it is a single attribute name, or range of attributes. 00849 //Attributes can be expressed as a string or an integer index. 00850 //Some examples are: 00851 // 1-5 ; attributes at index 1 to 5 inclusive 00852 // Mean_Mem40_MFCC_1 - Mean_Mem40_MFCC_3 ; these attributes and all between inclusive 00853 // Mean_Mem40_MFCC_1 - 7 00854 // Mean_Mem40_Centroid 00855 // Note that the attribute index on the left must be <= attribute index on the right 00856 // Once the indexes are known, set those index values to true in the attributes to include array 00857 void WekaSource::parseAttributesToInclude(const std::string& attributesToInclude) 00858 { 00859 // FIXME Unused parameter. 00860 (void) attributesToInclude; 00861 //resize the included attribute bool array to the same size as the actual number of attributes 00862 //in the arff file. 00863 attributesIncluded_.resize(attributesFound_.size()); 00864 00865 //if null string specified, set all attributes to include to true 00866 //and set attributes to include list the same as attributes found 00867 if(attributesToInclude_.size()==0) 00868 { 00869 00870 attributesIncludedList_.assign(attributesFound_.begin(), attributesFound_.end()); 00871 for(mrs_natural ii=0; ii<(mrs_natural)attributesIncluded_.size(); ++ii) 00872 { 00873 attributesIncluded_[ii] = true; 00874 00875 } 00876 00877 return; 00878 }//if 00879 00880 //Otherwise lets assume all attributes are out for now 00881 for(mrs_natural ii=0; ii<(mrs_natural)attributesIncluded_.size(); ++ii) 00882 attributesIncluded_[ii] = false; 00883 00884 //get a copy of the attributes to include list and start parsing for the "," seperators 00885 mrs_string str = attributesToInclude_; 00886 char *cp = strtok((char *)str.c_str(), ","); 00887 00888 //find each string seperated by a "," and parse it for attributes 00889 while(cp) 00890 { 00891 //check if this string has a "-" seperator 00892 char *mp = strstr(cp,"-"); 00893 if(mp) 00894 { 00895 //yes it does, so lets parse each side of the "-" 00896 *mp++ = '\0'; 00897 00898 //check the left side. Check for valid 00899 mrs_natural left = parseAttribute(cp); 00900 MRSASSERT(left>=0&&left<(mrs_natural)attributesFound_.size()); 00901 00902 //check the right side. Check for valid 00903 mrs_natural right = parseAttribute(mp); 00904 MRSASSERT(right>=0&&right<(mrs_natural)attributesFound_.size()); 00905 00906 //make sure numbers are in the right order 00907 MRSASSERT(right>=left); 00908 00909 //and set the attributes included flag for this range of attributes 00910 for(mrs_natural ii=left; ii<=right; ++ii) 00911 attributesIncluded_[ii] = true; 00912 }//if 00913 //No "-" seperator, just parse this one attribute or index 00914 else 00915 { 00916 mrs_natural index = parseAttribute(cp); 00917 MRSASSERT(index>=0&&index<(mrs_natural)attributesFound_.size()); 00918 attributesIncluded_[index] = true; 00919 }//else 00920 00921 //next token 00922 cp = strtok(NULL, ","); 00923 }//while 00924 00925 //Now build the attributes included list from the original attributes found list. 00926 //Use the included flags array to determine which attributes to copy 00927 attributesIncludedList_.clear(); 00928 for(mrs_natural ii=0; ii<(mrs_natural)attributesIncluded_.size(); ++ii) 00929 { 00930 if(attributesIncluded_[ii]) 00931 attributesIncludedList_.push_back(attributesFound_[ii]); 00932 }//for ii 00933 }//parseAttributesToExtract