Marsyas
0.6.0-alpha
|
00001 #include <marsyas/WekaData.h> 00002 00003 using namespace std; 00004 using namespace Marsyas; 00005 00006 //This class represents a collection of row data read from a weka arff file. 00007 //It is organized as a vector collection of vector pointers. 00008 //It is optimized for fast sorting and shuffling of the data. It is not intended 00009 //that the data change once it is loaded. 00010 // 00011 //It is also assumed that the last column of each row is the class attribute. 00012 //All data items are mrs_real, including the class attribute, however the class 00013 //attribute should be interpreted as an mrs_natural. 00014 WekaData::WekaData():cols_(0),rows_(0), isFold_(false) 00015 { 00016 } 00017 00018 WekaData::~WekaData() 00019 { 00020 00021 // if it is a fold then the pointers refers 00022 // to rows in the original data so the data 00023 // they point to doesn't need to be deallocated 00024 // The "original" WekaData for which the folds 00025 // where computed takes care of it 00026 if (!isFold_) 00027 Clear(); 00028 } 00029 00030 void 00031 WekaData::setFold(bool isFold) 00032 { 00033 isFold_ = isFold; 00034 } 00035 00036 00037 //create the table. Will clear contents first and fix the number of columns. 00038 void WekaData::Create(mrs_natural cols) 00039 { 00040 MRSASSERT(cols>=0); 00041 this->Clear(); 00042 cols_ = cols; 00043 rows_ = 0; 00044 } 00045 00046 //clear all data from the table 00047 //Requires that the vector rows be freed 00048 void WekaData::Clear() 00049 { 00050 if (rows_ > 0) { 00051 vector<vector<mrs_real>*>::iterator iter = this->begin(); 00052 while (iter != this->end()) { 00053 delete (*iter); 00054 ++iter; 00055 } 00056 } 00057 this->clear(); 00058 filenames_.clear(); 00059 00060 }//Clear 00061 00062 00063 void 00064 WekaData::NormMaxMinRow(realvec& in) 00065 { 00066 int ii; 00067 for(ii=0; ii<(int)in.getSize()-1; ++ii) 00068 { 00069 in(ii) = (in(ii) - minimums_(ii)) / (maximums_(ii) - minimums_(ii)); 00070 } 00071 } 00072 00073 void 00074 WekaData::NormMaxMin() 00075 { 00076 minimums_.create(cols_-1); 00077 maximums_.create(cols_-1); 00078 maximums_.setval(DBL_MIN); 00079 minimums_.setval(DBL_MAX); 00080 00081 // find minimums_ and maximums_ 00082 for(vector<vector<mrs_real>*>::const_iterator citer = this->begin(); citer!=this->end(); citer++) 00083 { 00084 const vector<mrs_real> *row = (*citer); 00085 int ii; 00086 for(ii=0; ii<(int)row->size()-1; ++ii) 00087 { 00088 if (row->at(ii) > maximums_(ii)) 00089 maximums_(ii) = row->at(ii); 00090 if (row->at(ii) < minimums_(ii)) 00091 minimums_(ii) = row->at(ii); 00092 } 00093 } 00094 00095 00096 // normalize 00097 for(vector<vector<mrs_real>*>::const_iterator citer = this->begin(); citer!=this->end(); citer++) 00098 { 00099 vector<mrs_real> *row = (*citer); 00100 int ii; 00101 for(ii=0; ii<(int)row->size()-1; ++ii) 00102 { 00103 // don't divide by zero 00104 if (maximums_(ii) - minimums_(ii) == 0) 00105 row->at(ii) = 0; 00106 else 00107 row->at(ii) = ((row->at(ii) - minimums_(ii)) / (maximums_(ii) - minimums_(ii))); 00108 } 00109 } 00110 00111 00112 00113 00114 } 00115 00116 mrs_realvec WekaData::GetMinimums() const 00117 { 00118 return minimums_; 00119 } 00120 00121 mrs_realvec WekaData::GetMaximums() const 00122 { 00123 return maximums_; 00124 } 00125 00126 00127 //randomly shuffle the data in the table 00128 //Need only to swap the pointers to row data, nice and fast! 00129 void WekaData::Shuffle() 00130 { 00131 srand(0); 00132 00133 size_t size = this->size()-1; 00134 for (size_t ii=0; ii<size; ++ii) 00135 { 00136 mrs_natural rind = (mrs_natural)(((mrs_real)rand() / (mrs_real)(RAND_MAX))*size); 00137 //swap row ii with row rind 00138 swapRows((mrs_natural)ii, rind); 00139 }//for ii 00140 }//Shuffle 00141 00142 //SwapRows will exchange one row for another. 00143 //Just need to swap the 2 vector pointers. 00144 void WekaData::swapRows(mrs_natural l, mrs_natural r) 00145 { 00146 vector<mrs_real> *temp = this->at(l); 00147 this->at(l) = this->at(r); 00148 this->at(r) = temp; 00149 } 00150 00151 mrs_natural WekaData::partition(mrs_natural attIndex, mrs_natural l, mrs_natural r) 00152 { 00153 mrs_real pivot = this->at((l+r)/2)->at(attIndex); 00154 while (l < r) 00155 { 00156 while ((this->at(l)->at(attIndex) < pivot) && (l < r)) 00157 { 00158 l++; 00159 }//while 00160 00161 while ((this->at(r)->at(attIndex) > pivot) && (l < r)) 00162 { 00163 r--; 00164 }//while 00165 00166 if (l < r) 00167 { 00168 swapRows(l, r); 00169 l++; 00170 r--; 00171 }//if 00172 } 00173 if ((l == r) && (this->at(r)->at(attIndex) > pivot)) 00174 { 00175 r--; 00176 } //if 00177 00178 return r; 00179 }//partition 00180 00189 //@ requires 0 <= attIndex && attIndex < numAttributes(); 00190 //@ requires 0 <= first && first <= right && right < numInstances(); 00191 //Shamelessly ripped off from the weka library of code. - dale 00192 void WekaData::quickSort(mrs_natural attIndex, mrs_natural left, mrs_natural right) 00193 { 00194 if (left < right) 00195 { 00196 int middle = partition(attIndex, left, right); 00197 quickSort(attIndex, left, middle); 00198 quickSort(attIndex, middle + 1, right); 00199 }//if 00200 }//quicksort 00201 00202 //Sort the instances dataset based on the column attr 00203 //Note that the entire table must be sorted on the attribute, 00204 //not just the attribute itself. 00205 void WekaData::Sort(mrs_natural attr) 00206 { 00207 MRSASSERT(attr>=0&&attr<cols_); 00208 quickSort(attr, 0, (mrs_natural) this->size()-1); 00209 } 00210 00211 //add rows of data to the table 00212 void WekaData::Append(const realvec& in) 00213 { 00214 MRSASSERT(in.getRows()==cols_); 00215 // skip feature vectors labeled with negative labels 00216 00217 if (in(in.getRows()-1, 0) >=0) 00218 { 00219 data_ = new vector<mrs_real>(cols_); 00220 for(mrs_natural ii=0; ii<in.getRows(); ++ii) 00221 { 00222 data_->at(ii) = in(ii, 0); 00223 } 00224 Append(data_); 00225 } 00226 00227 } 00228 00229 00230 00231 00232 //add rows of data to the table 00233 void WekaData::Append(vector<mrs_real> *data) 00234 { 00235 MRSASSERT(data!=NULL && (int)data->size()==cols_); 00236 rows_++; 00237 00238 this->push_back(data); 00239 }//Append 00240 00241 00242 //add rows of data to the table 00243 void WekaData::AppendFilename(mrs_string fname) 00244 { 00245 filenames_.push_back(fname); 00246 }//AppendFilename 00247 00248 mrs_string WekaData::GetFilename(mrs_natural row) const 00249 { 00250 return (mrs_string)filenames_.at(row); 00251 } 00252 00253 //get the class attribute for a row and convert to a int 00254 //class attribute is last column of row 00255 mrs_natural WekaData::GetClass(mrs_natural row) const 00256 { 00257 return (mrs_natural)this->at(row)->at(cols_-1); 00258 } 00259 00260 //debug helper funtion to dump table to an ascii file 00261 void WekaData::Dump(const mrs_string& filename, const vector<mrs_string>& classNames) const 00262 { 00263 char buffer[32]; 00264 00265 ofstream *mis = new ofstream; 00266 00267 mis->open(filename.c_str(), ios_base::out | ios_base::trunc ); 00268 MRSASSERT( mis->is_open() ); 00269 00270 for(vector<vector<mrs_real>*>::const_iterator citer = this->begin(); citer!=this->end(); citer++) 00271 { 00272 bool first = true; 00273 const vector<mrs_real> *row = (*citer); 00274 int ii; 00275 for(ii=0; ii<(int)row->size()-1; ++ii) 00276 { 00277 if(!first) 00278 mis->write(", ", 2); 00279 first = false; 00280 00281 sprintf(buffer, "%09.4f", row->at(ii)); 00282 mis->write(buffer, strlen(buffer)); 00283 } 00284 mis->write(", ", 2); 00285 mrs_natural classIndex = (mrs_natural)row->at(ii); 00286 mis->write(classNames[classIndex].c_str(), strlen(classNames[classIndex].c_str())); 00287 mis->write("\n", 1); 00288 } 00289 00290 mis->close(); 00291 delete mis; 00292 }//Dump