Marsyas  0.6.0-alpha
/usr/src/RPM/BUILD/marsyas-0.6.0/src/marsyas/marsystems/WekaSource.cpp
Go to the documentation of this file.
00001 
00008 #include "WekaSource.h"
00009 #include "../common_source.h"
00010 #include <stdexcept>
00011 #include <sstream>
00012 #include <string>
00013 #include <vector>
00014 
00015 using namespace std;
00016 using namespace Marsyas;
00017 
// Reads the comma separated fields of an argument string one at a time.
//
// Each operator>> call consumes the next field and converts it to the
// requested type. The conversion only counts as successful when the field
// holds exactly one value (surrounding whitespace is tolerated, trailing
// garbage is not). After the first failure the object converts to false and
// stays that way, although later fields can still be extracted.
class argument_list_stream
{
  std::istringstream m_stream; // remaining, not yet consumed argument text
  bool m_ok;                   // false once any extraction has failed

public:
  argument_list_stream(const std::string & text):
    m_stream(text),
    m_ok(true)
  {}

  // True while every extraction so far has succeeded.
  operator bool() { return m_ok; }

  // Extracts the next comma separated field into 'value'.
  // On failure 'value' is reset to T() and the stream is marked as failed.
  template <typename T>
  argument_list_stream & operator>>(T & value)
  {
    bool parsed = false;

    std::string field;
    if (std::getline(m_stream, field, ','))
    {
      std::istringstream field_stream(field);
      if (field_stream >> value)
      {
        // The sentry skips whitespace; it tests false only when nothing
        // else is left in the field, which is what a clean parse requires.
        std::istringstream::sentry leftover(field_stream);
        parsed = !leftover;
      }
    }

    if (!parsed)
    {
      m_ok = false;
      value = T();
    }
    return *this;
  }
};
00051 
00052 WekaSource::WekaSource(mrs_string name):MarSystem("WekaSource",name)
00053 {
00054   addControls();
00055   validationModeEnum_ = None;
00056   currentIndex_ = 0;
00057 }
00058 
00059 WekaSource::~WekaSource()
00060 {
00061   data_.Clear();
00062   useTestSetData_.Clear();
00063 }
00064 
00065 WekaSource::WekaSource(const WekaSource& a) : MarSystem(a) {
00066   validationModeEnum_ = None;
00067   currentIndex_  = 0;
00068   ctrl_regression_ = getctrl("mrs_bool/regression");
00069 }
00070 
00071 MarSystem *WekaSource::clone() const
00072 {
00073   return new WekaSource(*this);
00074 }
00075 
00076 void
00077 WekaSource::addControls()
00078 {
00079   addctrl("mrs_string/filename", "");
00080   setctrlState("mrs_string/filename", true);
00081 
00082   //comma seperated list of attributes to extract from the feature data
00083   //Can be attribute name, index, or range of indexes.
00084   //ie: "Mean_Mem40_Centroid, Mean_Mem40_Kurtosis, 4-7, 9,  .... "
00085   addctrl("mrs_string/attributesToInclude", "");
00086   setctrlState("mrs_string/attributesToInclude", true);
00087 
00088   //comma seperated list of class names found in feature file
00089   //ie: "Music, Speech, .... "
00090   addctrl("mrs_string/classNames", "");
00091 
00092   //number of classes found
00093   addctrl("mrs_natural/nClasses", 0);
00094   addctrl("mrs_bool/regression", false, ctrl_regression_);
00095 
00096   //The mode that the weka source is currently in.
00097   //Can be  "train" or "predict"
00098   addctrl("mrs_string/mode", "train");
00099 
00100   //number of output samples will always be 1, regardless of the input samples
00101   setctrl("mrs_natural/onSamples", 1 ); //FIXME: this should not be done here but in myProcess instead...
00102 
00103   //number of attributes and attribute names that will be reported.
00104   addctrl("mrs_natural/nAttributes", 0);
00105   addctrl("mrs_string/attributeNames", "");
00106 
00107   //type of classifier validation to do.
00108   //Blank or not set means none.
00109   //Other supported types:
00110   //"kFold,[S,NS],xx"  where xx is an integer 2-10
00111   //if S is specified, use Stratified
00112   //if NS is specified, use Non-Stratified
00113   //"UseTestSet,wekafilename"
00114   //"PercentageSplit,percent" where percent is 1-99
00115   //others to come
00116   addctrl("mrs_string/validationMode", "");
00117   addctrl("mrs_bool/done", false);
00118 
00119   addctrl("mrs_natural/nInstances", 0);
00120 
00121   // The current filename that we are processing
00122   // We get this from looking for comment strings that begin with "%
00123   // filename"
00124   addctrl("mrs_string/currentFilename", "");
00125   addctrl("mrs_real/currentSrate", 22050.0);
00126 
00127   //TODO: lmartins: document...
00128   addctrl("mrs_realvec/instanceIndexes", realvec());
00129 
00130   addctrl("mrs_bool/normMaxMin", false);
00131 
00132 }
00133 
00134 void
00135 WekaSource::myUpdate(MarControlPtr sender)
00136 {
00137   (void) sender;  //suppress warning of unused parameter(s)
00138   MRSDIAG("WekaSource.cpp - WekaSource:myUpdate");
00139 
00140   // If 'filename' was updated, or the attributes desired from the Weka file has changed,
00141   // parse the header portion of the file to get the required attribute names and possible output labels (if any)...
00142   if (filename_ == getctrl("mrs_string/filename")->to<mrs_string>())
00143     return;
00144 
00145   this->updControl("mrs_bool/done", false);
00146   filename_ = getctrl("mrs_string/filename")->to<mrs_string>();
00147   attributesToInclude_ = getctrl("mrs_string/attributesToInclude")->to<mrs_string>();
00148 
00149   mrs_bool normMaxMin = getctrl("mrs_bool/normMaxMin")->to<mrs_bool>();
00150 
00151   loadFile(filename_, attributesToInclude_, data_);
00152   if (normMaxMin)
00153   {
00154     data_.NormMaxMin();
00155   }
00156 
00157   // data_.Dump("org.txt", classesFound_);
00158 
00159   mrs_string names;
00160   bool first = true;
00161   for(vector<mrs_string>::const_iterator citer = classesFound_.begin(); citer!= classesFound_.end(); citer++)
00162   {
00163     if(!first)
00164       names += ",";
00165 
00166     names += (*citer);
00167     first = false;
00168   }
00169   setctrl("mrs_string/classNames", names);
00170   setctrl("mrs_natural/nClasses", (mrs_natural)classesFound_.size());
00171 
00172   names = "";
00173   first = true;
00174   mrs_natural index = 0;
00175   for(vector<mrs_string>::const_iterator citer = attributesFound_.begin(); citer!= attributesFound_.end(); citer++,index++)
00176   {
00177     if(attributesIncluded_[index])
00178     {
00179       if(!first)
00180         names += ",";
00181 
00182       names += (*citer);
00183       first = false;
00184     }//if
00185   }
00186   MRSASSERT(index == (mrs_natural)attributesIncluded_.size());
00187 
00188   setctrl("mrs_string/attributeNames", names);
00189   ctrl_onObsNames_->setValue(names);
00190   setctrl("mrs_natural/onSamples", 1);
00191   setctrl("mrs_natural/nAttributes", (mrs_natural)attributesFound_.size());
00192   setctrl("mrs_natural/onObservations", (mrs_natural)attributesFound_.size()+1);
00193   setctrl("mrs_natural/nInstances", (mrs_natural)data_.getRows());
00194 
00195 
00196   validationModeEnum_ = None;
00197   currentIndex_ = 0;
00198 
00199   const mrs_string & validation_mode_spec = getctrl("mrs_string/validationMode")->to<mrs_string>();
00200   if (validation_mode_spec.empty())
00201     return;
00202 
00203   argument_list_stream validation_mode_args(validation_mode_spec);
00204 
00205   string v_mode_error_msg;
00206   {
00207     ostringstream text;
00208     text << "WekaSource: Error in value of control 'validationMode' = "
00209          << "\"" << validation_mode_spec << "\":";
00210     v_mode_error_msg = text.str();
00211   }
00212 
00213   string validation_mode;
00214   if (!(validation_mode_args >> validation_mode))
00215   {
00216     MRSERR(v_mode_error_msg << " Can not parse validation mode.");
00217     return;
00218   }
00219 
00220   if (validation_mode == "OutputInstancePair")
00221   {
00222     validationModeEnum_ = OutputInstancePair;
00223     MarControlAccessor acc(getctrl("mrs_realvec/instanceIndexes"));
00224     realvec& instIdxs = acc.to<mrs_realvec>();
00225     instIdxs.create(0.0, 1, 2); //init row vector
00226     setctrl("mrs_natural/onSamples", 2);
00227     return;
00228   }
00229 
00230   if(validation_mode == "kFold")
00231   { //Validation mode is Folding, now extract the fold count.
00232 
00233     string fold_type_str;
00234     if (!(validation_mode_args >> fold_type_str))
00235     {
00236       MRSERR(v_mode_error_msg << " Could not parse fold type.");
00237       return;
00238     }
00239 
00240     ValidationModeEnum fold_type;
00241     if(fold_type_str == "NS")
00242       fold_type = kFoldNonStratified;
00243     else if(fold_type_str == "S")
00244       fold_type = kFoldStratified;
00245     else
00246     {
00247       MRSERR(v_mode_error_msg << " Invalid fold type: " << fold_type_str);
00248       return;
00249     }
00250 
00251     mrs_natural fold_count = -1;
00252     if ( !(validation_mode_args >> fold_count) )
00253     {
00254       MRSERR(v_mode_error_msg << " Can not parse fold count.");
00255       return;
00256     }
00257     if ( !(fold_count >= 2 && fold_count <= 10) )
00258     {
00259       MRSERR(v_mode_error_msg << " Invalid fold count: " << fold_count);
00260       return;
00261     }
00262 
00263     validationModeEnum_ = fold_type;
00264     foldCount_ = fold_count;
00265 
00266     data_.Shuffle();
00267     // data_.Dump("shuffle.txt", classesFound_);
00268 
00269     if( validationModeEnum_ != kFoldStratified)
00270     {
00271       cout << "=== Non-Stratified cross-validation (" <<  foldCount_ << " folds) ===" << endl;
00272 
00273       //in non-stratified mode we simply use all the available data
00274       foldData_.SetupkFoldSections(data_, foldCount_);
00275     }
00276     else
00277     {
00278       cout << "=== Stratified cross-validation (" <<  foldCount_ << " folds) ===" << endl;
00279 
00280       //in non-stratified we seperate the data according to class
00281       foldClassData_.clear();
00282       foldClassData_.resize(classesFound_.size());
00283 
00284       //load each dataset with rows for each class
00285       for(mrs_natural ii=0; ii<(mrs_natural)classesFound_.size(); ++ii)
00286       {
00287         WekaFoldData data;
00288         data.setFold(true);
00289         data.SetupkFoldSections(data_, foldCount_, ii);
00290         foldClassData_[ii] = data;
00291       }
00292       foldClassDataIndex_ = 0;
00293     }
00294 
00295     foldCurrentMode_ = foldNextMode_ = WekaFoldData::Training;
00296 
00297   }//if "kFold"
00298   else if(validation_mode == "UseTestSet")
00299   {
00300     string test_set_filename;
00301     validation_mode_args >> test_set_filename;
00302     if (test_set_filename.empty())
00303     {
00304       MRSERR(v_mode_error_msg << " Can not parse test set filename.");
00305       return;
00306     }
00307 
00308     loadFile(test_set_filename, attributesToInclude_, useTestSetData_);
00309     MRSASSERT(data_.getCols()==useTestSetData_.getCols());
00310 
00311     cout << "=== Evaluation on test set === (" <<  test_set_filename << ") ===" << endl;
00312 
00313     validationModeEnum_ = UseTestSet;
00314     currentIndex_ = 0;
00315 
00316   }//else if "UseTestSet"
00317   else if(validation_mode == "PercentageSplit")
00318   {
00319     mrs_natural percentage_split = -1;
00320 
00321     if ( !(validation_mode_args >> percentage_split) )
00322     {
00323       MRSERR(v_mode_error_msg << " Can not parse percentage split.");
00324       return;
00325     }
00326 
00327     if ( !(percentage_split > 0 && percentage_split < 100) )
00328     {
00329       MRSERR(v_mode_error_msg << " Invalid percentage split: " << percentage_split);
00330       return;
00331     }
00332 
00333     cout << "=== Evaluation on percentage split " << percentage_split << "% ===" << endl;
00334 
00335     data_.Shuffle();
00336     data_.Dump("shuffle.txt", classesFound_);
00337 
00338     percentageIndex_ = ((mrs_natural)data_.size() * percentage_split) / 100;
00339     percentageIndex_--; //adjust to count from 0
00340     if(percentageIndex_ < 1) percentageIndex_ = 1;
00341 
00342     validationModeEnum_ = PercentageSplit;
00343     currentIndex_ = 0;
00344   } //else if "PercentageSplit"
00345   else
00346   {
00347     MRSERR("Invalid validation mode: " << validation_mode);
00348     return;
00349   }
00350 }//myUpdate
00351 
00352 void WekaSource::myProcess(realvec& in,realvec &out)
00353 {
00354   (void) in;
00355 
00356   if(getctrl("mrs_bool/done")->to<mrs_bool>()) return;
00357   bool trainMode = (strcmp(getctrl("mrs_string/mode")->to<mrs_string>().c_str(), "train") == 0);
00358   switch(validationModeEnum_)
00359   {
00360   case kFoldNonStratified:
00361     handleFoldingNonStratifiedValidation(trainMode, out);
00362     break;
00363   case kFoldStratified:
00364     handleFoldingStratifiedValidation(trainMode, out);
00365     break;
00366   case UseTestSet:
00367     handleUseTestSet(trainMode, out);
00368     break;
00369   case PercentageSplit:
00370     handlePercentageSplit(trainMode, out);
00371     break;
00372   case OutputInstancePair:
00373     handleInstancePair(out);
00374     break;
00375   default:
00376     handleDefault(trainMode, out);
00377   }//switch
00378 }//myProcess
00379 
00380 
00381 void
00382 WekaSource::handleDefault(bool trainMode, realvec &out)
00383 {
00384   //FIXME: Unused parameter
00385   (void) trainMode;
00386 
00387   if(currentIndex_ >= (mrs_natural)data_.size())
00388   {
00389     this->updControl("mrs_bool/done", true);
00390     return;
00391   }
00392 
00393   vector<mrs_real> *row = NULL;
00394   mrs_string fname = data_.GetFilename(currentIndex_);
00395   row = data_.at(currentIndex_++);
00396 
00397   for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii)
00398   {
00399     out(ii, 0) = row->at(ii);
00400     this->updControl("mrs_string/currentFilename", fname); //???: why are we always updating this control to fname?? (which does not change inside the for loop...)
00401   }
00402 }
00403 
00404 void
00405 WekaSource::handleInstancePair(realvec& out)
00406 {
00407   const realvec& instIdxs = getctrl("mrs_realvec/instanceIndexes")->to<mrs_realvec>();
00408 
00409   mrs_natural i = (mrs_natural)instIdxs(0);
00410   mrs_natural j = (mrs_natural)instIdxs(1);
00411 
00412   if(i >= (mrs_natural)data_.size() || j >= (mrs_natural)data_.size())
00413   {
00414     //this->updControl("mrs_bool/done", true); //!!!: done?
00415     MRSWARN("WekaSource::handlePair - out of bound file indexes!");
00416     return;
00417   }
00418 
00419   vector<mrs_real> *rowi = NULL;
00420   vector<mrs_real> *rowj = NULL;
00421 
00422   mrs_string fnamei = data_.GetFilename(i);
00423   mrs_string fnamej = data_.GetFilename(j);
00424 
00425   rowi = data_.at(i);
00426   rowj = data_.at(j);
00427 
00428   for(mrs_natural ii=0; ii<(mrs_natural)rowi->size(); ++ii)
00429   {
00430     out(ii, 0) = rowi->at(ii);
00431     out(ii, 1) = rowj->at(ii);
00432   }
00433   this->updControl("mrs_string/currentFilename", fnamei+"_"+fnamej);
00434 
00435 }
00436 
00437 void WekaSource::handlePercentageSplit(bool trainMode, realvec &out)
00438 {
00439   vector<mrs_real> *row = NULL;
00440 
00441   if(trainMode)
00442   {
00443     if (currentIndex_ >= percentageIndex_)
00444     {
00445       this->updControl("mrs_string/mode", "predict");
00446       trainMode = false;
00447     }
00448     else
00449     {
00450       row = data_.at(currentIndex_++);
00451     }
00452   }
00453 
00454   if(!trainMode)
00455   {
00456     if(currentIndex_ >= (mrs_natural)data_.size())
00457     {
00458       this->updControl("mrs_bool/done", true);
00459       return;
00460     }
00461     else
00462     {
00463       row = data_.at(currentIndex_++);
00464     }
00465   }
00466 
00467   //  MRSASSERT(row->size()==out.getCols()); //[!]
00468   for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii)
00469   {
00470     out(ii, 0) = row->at(ii);
00471   }
00472 } //handlePercentageSplit
00473 
00474 void WekaSource::handleUseTestSet(bool trainMode, realvec &out)
00475 {
00476   vector<mrs_real> *row = NULL;
00477   if(trainMode)
00478   {
00479     if(currentIndex_ >= (mrs_natural)data_.size())
00480     {
00481       this->updControl("mrs_string/mode", "predict");
00482       trainMode = false;
00483       currentIndex_ = 0;
00484     }
00485     else
00486     {
00487       row = data_.at(currentIndex_++);
00488     }
00489   }
00490 
00491   if(!trainMode)
00492   {
00493     if(currentIndex_ >= (mrs_natural)useTestSetData_.size())
00494     {
00495       this->updControl("mrs_bool/done", true);
00496       currentIndex_ = 0;
00497       return;
00498     }
00499     else
00500     {
00501       row = useTestSetData_.at(currentIndex_++);
00502     }
00503   }
00504 
00505   MRSASSERT((mrs_natural)row->size() == out.getCols());
00506   for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii)
00507   {
00508     out(ii, 0 ) = row->at(ii);
00509   }
00510 }//handleUseTestSet
00511 
00512 void WekaSource::handleFoldingStratifiedValidation(bool trainMode, realvec &out)
00513 {
00514   (void) trainMode;
00515 
00516   if (foldCurrentMode_ != foldNextMode_)
00517   {
00518     foldClassDataIndex_++;
00519     if(foldClassDataIndex_ >= (mrs_natural)foldClassData_.size())
00520     {
00521       foldClassDataIndex_ = 0;
00522       foldCurrentMode_ = foldNextMode_;
00523       switch(foldCurrentMode_)
00524       {
00525       case WekaFoldData::Training:
00526         updControl("mrs_string/mode", "train");
00527         break;
00528       case WekaFoldData::Predict:
00529         updControl("mrs_string/mode", "predict");
00530         break;
00531       case WekaFoldData::None:
00532         updControl("mrs_bool/done", true);
00533         return;
00534       }
00535     }
00536   }
00537 
00538   vector<mrs_real> *row = foldClassData_[foldClassDataIndex_].Next(foldNextMode_);
00539 
00540   MRSASSERT((mrs_natural)row->size() == out.getRows());
00541   for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii)
00542   {
00543     out(ii, 0) = row->at(ii);
00544   }
00545 }
00546 
00547 void WekaSource::handleFoldingNonStratifiedValidation(bool trainMode, realvec &out)
00548 {
00549   (void) trainMode;
00550 
00551   if( foldCurrentMode_ != foldNextMode_ )
00552   {
00553     foldCurrentMode_ = foldNextMode_;
00554     switch (foldCurrentMode_)
00555     {
00556     case WekaFoldData::Training:
00557       updControl("mrs_string/mode", "train");
00558       break;
00559     case WekaFoldData::Predict:
00560       updControl("mrs_string/mode", "predict");
00561       break;
00562     case WekaFoldData::None:
00563       updControl("mrs_bool/done", true);
00564       return;
00565     }
00566   }
00567 
00568   vector<mrs_real> *row = foldData_.Next(foldNextMode_);
00569 
00570   MRSASSERT((mrs_natural) row->size() == out.getRows());
00571 
00572   for(mrs_natural ii=0; ii<(mrs_natural)row->size(); ++ii)
00573   {
00574     out(ii, 0) = row->at(ii);
00575   }
00576 }
00577 
00578 void WekaSource::loadFile(const std::string& filename, const std::string& attributesToExtract, WekaData& data)
00579 {
00580   ifstream mis;
00581 
00582   mis.open(filename.c_str());
00583 
00584   if (!mis.is_open()) {
00585     std::string msg = std::string("WekaSource: could not open file: ") + filename;
00586     MRSERR(msg);
00587     throw std::runtime_error(msg);
00588   }
00589 
00590   data_.Clear();
00591 
00592   parseHeader(mis, filename, attributesToExtract);
00593 
00594   parseData(mis, filename, data);
00595 
00596   mis.close();
00597 }//loadFile
00598 
00599 void WekaSource::parseHeader(ifstream& mis, const mrs_string& filename, const std::string& attributesToExtract)
00600 {
00601   (void) attributesToExtract; // FIXME: suspiciously not used!
00602   (void) filename; // only used for debug messages; see ifstream& mis
00603   // FIXME: This method does not parse all valid relation or attribute names.
00604   //        The ARFF spec allows for names that include spaces, iff those
00605   //        names are quoted.
00606   // FIXME: Parsing errors should probably be fatal.
00607 
00608   char str[1024];
00609   // skip comment lines
00610   while (mis.peek() == '%')
00611   {
00612     mis.getline(str, 1023);
00613   }
00614 
00615 
00616   mrs_string token1,token2,token3;
00617   mrs_string whitespace = " \t\v\f\r\n";
00618   mrs_string::size_type startpos;
00619 
00620   // Read in the relation line
00621   mis >> token1;
00622   getline(mis, token2);
00623 
00624   // Strip leading whitespace from the relation name
00625   startpos = token2.find_first_not_of(whitespace);
00626   if (startpos != mrs_string::npos)
00627   {
00628     token2 = token2.substr(startpos);
00629   }
00630 
00631   if ((token1 != "@relation")&&(token1 != "@RELATION"))
00632   {
00633     MRSERR("Badly formatted .arff file: file must begin with @relation." + token1);
00634     return;
00635   }
00636   if (token2.find("\t") != mrs_string::npos)
00637   {
00638     MRSERR("Badly formatted .arff file: Relation name cannot contain tab characters.");
00639     return;
00640   }
00641   if (token2.find_first_of(whitespace) != mrs_string::npos)
00642   {
00643     MRSERR("Badly formatted .arff file: Marsyas cannot handle relation names with whitespace.");
00644     return;
00645   }
00646   relation_ = token2;
00647 
00648   attributesFound_.clear();
00649   attributesIncluded_.clear();
00650   classesFound_.clear();
00651   attributesIncludedList_.clear();
00652 
00653   // Parse the attribute definitions and store their names...
00654   //ie: @attribute Mean_Mem40_Centroid real
00655   while( mis >> token1 && (token1 == "@attribute" || (token1 == "@ATTRIBUTE")))
00656   {
00657     mis >> token2;
00658     getline(mis, token3);
00659 
00660     // skip leading spaces of token3
00661     startpos = token3.find_first_not_of(" \t");
00662     if (mrs_string::npos != startpos)
00663       token3 = token3.substr(startpos);
00664 
00665     if ((token3 == "real") || (token3 == "REAL"))
00666     {
00667       attributesFound_.push_back(token2);
00668       attributesIncluded_.push_back(true);
00669     }
00670     else if (token3[0] == '{')
00671     {
00672       mrs_string token = token3.substr( 1, token3.length()-2 ); // Remove curly braces
00673 
00674       std::stringstream  tokenStream(token);
00675       std::string        cell;
00676       while(std::getline(tokenStream,cell,','))
00677       {
00678         classesFound_.push_back(cell);
00679       }
00680     }
00681     else
00682     {
00683       attributesFound_.push_back(token2);
00684       attributesIncluded_.push_back(false);
00685       MRSWARN("Incompatible datatype " + token3 + " found in file '" + filename + "'.  " +
00686               "attribute " + token2 + "will be ignored!");
00687     }//else
00688   }//while
00689 
00690   if ((token1 != "@data") && (token1 != "@DATA")) {
00691     MRSERR("Badly formatted .arff file: Finished parsing attributes but did not find @data section.");
00692   }
00693 
00694   //Now we parse the attributes to include string and decide which attributes
00695   //are to be extracted from the arff file. An empty include list means all
00696   //attributes.
00697 
00698   for(vector<mrs_string>::const_iterator citer = attributesFound_.begin(); citer!= attributesFound_.end(); citer++)
00699   {
00700   }
00701 
00702   if (classesFound_.size() == 0) {
00703     ctrl_regression_->setValue(true);
00704     // remove the final "output" attribute
00705     attributesFound_.pop_back();
00706     attributesIncluded_.pop_back();
00707   }
00708 
00709   parseAttributesToInclude(attributesToInclude_);
00710 }//parseHeader
00711 
00712 void WekaSource::parseData(ifstream& mis, const mrs_string& filename, WekaData& data)
00713 {
00714   // FIXME Unused parameter
00715   (void) filename;
00716   mrs_string currentFname;
00717 
00718   MRSASSERT(!mis.eof());
00719 
00720   data.Create((mrs_natural)attributesIncludedList_.size()+1);
00721 
00722   char str[1024];
00723 
00724   while (mis.peek() == '%')             // skip comment lines
00725   {
00726     mis.getline(str, 1023);
00727   }
00728 
00729 
00730   mrs_string token;
00731   // mis >> token;
00732 
00733   while (token == "")
00734     getline(mis, token);
00735 
00736   mrs_natural lineCount = 0;
00737   while(!mis.eof())
00738   {
00739     char *cp = (char *)token.c_str();
00740     if (cp[0] != '%')
00741     {
00742       cp = strtok(cp, ",");
00743 
00744       vector<mrs_real> *lineBuffer = new vector<mrs_real>(attributesIncludedList_.size()+1);
00745 
00746       mrs_natural index = 0;
00747       for(mrs_natural ii=0; ii < (mrs_natural)attributesFound_.size(); ++ii)
00748       {
00749         MRSASSERT( cp!=NULL );
00750         if(attributesIncluded_[ii])
00751         {
00752           lineBuffer->at(index++) = ::atof( cp );
00753         }
00754         cp = strtok(NULL, ",");
00755       }//for index
00756       MRSASSERT(index == (mrs_natural)lineBuffer->size()-1);
00757 
00758       if (ctrl_regression_->isTrue()) {
00759         // no change needed
00760         lineBuffer->at(index) =  ::atof( cp );
00761       } else {
00762         //now extract the class name for this record
00763         MRSASSERT( cp!=NULL );
00764 
00765         mrs_natural classIndex = findClass(cp);
00766         MRSASSERT(classIndex>=0);
00767         lineBuffer->at(index) = (mrs_real)classIndex;
00768       }
00769 
00770       data.Append(lineBuffer);
00771       data.AppendFilename(currentFname);
00772       lineCount++;
00773 
00774       // apparently not copied?
00775       //delete lineBuffer; // info was copied in Append()
00776     }
00777     else // skip comment line
00778     {
00779 //          mis.getline(str, 1023);
00780       // If the line starts with "% filename" set the current_filename
00781       if (strncmp(token.c_str(),"% filename",10) == 0) {
00782         currentFname = token.substr(11);
00783       }
00784 
00785       if (strncmp(token.c_str(),"% srate",7) == 0) {
00786         mrs_real currentSrate = atof(token.substr(8).c_str());
00787         this->updControl("mrs_real/currentSrate", currentSrate);
00788       }
00789     }
00790     getline(mis,token);
00791   }//while
00792 
00793 
00794 
00795 }
00796 
00797 //Given a string, check if it is an class found in the arff file header.
00798 //If it is, return its index, otherwise return -1
00799 mrs_natural WekaSource::findClass(const char *className) const
00800 {
00801   MRSASSERT(className!=NULL);
00802   mrs_natural index = 0;
00803   for(vector<mrs_string>::const_iterator citer = classesFound_.begin(); citer!= classesFound_.end(); citer++,index++)
00804   {
00805     if(*citer == className)
00806       return index;
00807   }//for citer
00808   return -1;
00809 }//FindClass
00810 
00811 //Given a string, check if it is an attribute found in the arff file header.
00812 //If it is, return its index, otherwise return -1
00813 mrs_natural WekaSource::findAttribute(const char *attribute) const
00814 {
00815   MRSASSERT(attribute!=NULL);
00816 
00817   mrs_natural index = 0;
00818   for(vector<mrs_string>::const_iterator citer = attributesFound_.begin(); citer!= attributesFound_.end(); citer++,index++)
00819   {
00820     if(*citer == attribute)
00821       return index;
00822   }//for citer
00823   return -1;
00824 }//FindAttribute
00825 
00826 //Given a string, determine if it is an attribute name or an integer
00827 //It is an attribute name if it is contained in the list of attributes
00828 //parsed from the header of the arff file.
00829 //If it is neither
00830 mrs_natural WekaSource::parseAttribute(const char *attribute) const
00831 {
00832   MRSASSERT(attribute!=NULL);
00833 
00834   //check for attribute in list found in header and if found,
00835   //return its index.
00836   mrs_natural ret = findAttribute(attribute);
00837   if(ret >= 0) return ret;
00838 
00839   //otherwise, check if the string is a valid integer. If not return -1
00840   for(mrs_natural ii=0; attribute[ii]!='\0'; ++ii)
00841     if(!isdigit(attribute[ii]))
00842       return -1;
00843 
00844   //otherwise return the index.
00845   return ::atoi(attribute);
00846 }//parseAttribute
00847 
00848 //Given an attribute string check if it is a single attribute name, or range of attributes.
00849 //Attributes can be expressed as a string or an integer index.
00850 //Some examples are:
00851 // 1-5                                      ; attributes at index 1 to 5 inclusive
00852 // Mean_Mem40_MFCC_1 - Mean_Mem40_MFCC_3    ; these attributes and all between inclusive
00853 // Mean_Mem40_MFCC_1 - 7
00854 // Mean_Mem40_Centroid
00855 // Note that the attribute index on the left must be <= attribute index on the right
00856 // Once the indexes are known, set those index values to true in the attributes to include array
00857 void WekaSource::parseAttributesToInclude(const std::string& attributesToInclude)
00858 {
00859   // FIXME Unused parameter.
00860   (void) attributesToInclude;
00861   //resize the included attribute bool array to the same size as the actual number of attributes
00862   //in the arff file.
00863   attributesIncluded_.resize(attributesFound_.size());
00864 
00865   //if null string specified, set all attributes to include to true
00866   //and set attributes to include list the same as attributes found
00867   if(attributesToInclude_.size()==0)
00868   {
00869 
00870     attributesIncludedList_.assign(attributesFound_.begin(), attributesFound_.end());
00871     for(mrs_natural ii=0; ii<(mrs_natural)attributesIncluded_.size(); ++ii)
00872     {
00873       attributesIncluded_[ii] = true;
00874 
00875     }
00876 
00877     return;
00878   }//if
00879 
00880   //Otherwise lets assume all attributes are out for now
00881   for(mrs_natural ii=0; ii<(mrs_natural)attributesIncluded_.size(); ++ii)
00882     attributesIncluded_[ii] = false;
00883 
00884   //get a copy of the attributes to include list and start parsing for the "," seperators
00885   mrs_string str = attributesToInclude_;
00886   char *cp = strtok((char *)str.c_str(), ",");
00887 
00888   //find each string seperated by a "," and parse it for attributes
00889   while(cp)
00890   {
00891     //check if this string has a "-" seperator
00892     char *mp = strstr(cp,"-");
00893     if(mp)
00894     {
00895       //yes it does, so lets parse each side of the "-"
00896       *mp++ = '\0';
00897 
00898       //check the left side. Check for valid
00899       mrs_natural left = parseAttribute(cp);
00900       MRSASSERT(left>=0&&left<(mrs_natural)attributesFound_.size());
00901 
00902       //check the right side. Check for valid
00903       mrs_natural right = parseAttribute(mp);
00904       MRSASSERT(right>=0&&right<(mrs_natural)attributesFound_.size());
00905 
00906       //make sure numbers are in the right order
00907       MRSASSERT(right>=left);
00908 
00909       //and set the attributes included flag for this range of attributes
00910       for(mrs_natural ii=left; ii<=right; ++ii)
00911         attributesIncluded_[ii] = true;
00912     }//if
00913     //No "-" seperator, just parse this one attribute or index
00914     else
00915     {
00916       mrs_natural index = parseAttribute(cp);
00917       MRSASSERT(index>=0&&index<(mrs_natural)attributesFound_.size());
00918       attributesIncluded_[index] = true;
00919     }//else
00920 
00921     //next token
00922     cp = strtok(NULL, ",");
00923   }//while
00924 
00925   //Now build the attributes included list from the original attributes found list.
00926   //Use the included flags array to determine which attributes to copy
00927   attributesIncludedList_.clear();
00928   for(mrs_natural ii=0; ii<(mrs_natural)attributesIncluded_.size(); ++ii)
00929   {
00930     if(attributesIncluded_[ii])
00931       attributesIncludedList_.push_back(attributesFound_[ii]);
00932   }//for ii
00933 }//parseAttributesToExtract