svcore  1.9
CSVFormat.cpp
Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
00002 
00003 /*
00004     Sonic Visualiser
00005     An audio file viewer and annotation editor.
00006     Centre for Digital Music, Queen Mary, University of London.
00007     This file copyright 2006 Chris Cannam.
00008     
00009     This program is free software; you can redistribute it and/or
00010     modify it under the terms of the GNU General Public License as
00011     published by the Free Software Foundation; either version 2 of the
00012     License, or (at your option) any later version.  See the file
00013     COPYING included with this distribution for more information.
00014 */
00015 
00016 #include "CSVFormat.h"
00017 
00018 #include "base/StringBits.h"
00019 
00020 #include <QFile>
00021 #include <QString>
00022 #include <QRegExp>
00023 #include <QStringList>
00024 #include <QTextStream>
00025 
00026 #include <iostream>
00027 
00028 CSVFormat::CSVFormat(QString path) :
00029     m_separator(""),
00030     m_sampleRate(44100),
00031     m_windowSize(1024),
00032     m_allowQuoting(true)
00033 {
00034     guessFormatFor(path);
00035 }
00036 
00037 void
00038 CSVFormat::guessFormatFor(QString path)
00039 {
00040     m_modelType = TwoDimensionalModel;
00041     m_timingType = ExplicitTiming;
00042     m_timeUnits = TimeSeconds;
00043 
00044     m_maxExampleCols = 0;
00045     m_columnCount = 0;
00046     m_variableColumnCount = false;
00047 
00048     m_example.clear();
00049     m_columnQualities.clear();
00050     m_columnPurposes.clear();
00051     m_prevValues.clear();
00052 
00053     QFile file(path);
00054     if (!file.exists()) return;
00055     if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) return;
00056 
00057     QTextStream in(&file);
00058     in.seek(0);
00059 
00060     int lineno = 0;
00061 
00062     while (!in.atEnd()) {
00063 
00064         // See comment about line endings in CSVFileReader::load() 
00065 
00066         QString chunk = in.readLine();
00067         QStringList lines = chunk.split('\r', QString::SkipEmptyParts);
00068 
00069         for (int li = 0; li < lines.size(); ++li) {
00070 
00071             QString line = lines[li];
00072             if (line.startsWith("#") || line == "") continue;
00073 
00074             guessQualities(line, lineno);
00075 
00076             ++lineno;
00077         }
00078 
00079         if (lineno >= 50) break;
00080     }
00081 
00082     guessPurposes();
00083 }
00084 
00085 void
00086 CSVFormat::guessSeparator(QString line)
00087 {
00088     char candidates[] = { ',', '\t', ' ', '|', '/', ':' };
00089     for (int i = 0; i < int(sizeof(candidates)/sizeof(candidates[0])); ++i) {
00090         if (StringBits::split(line, candidates[i], m_allowQuoting).size() >= 2) {
00091             m_separator = candidates[i];
00092             return;
00093         }
00094     }
00095     m_separator = " ";
00096 }
00097 
00098 void
00099 CSVFormat::guessQualities(QString line, int lineno)
00100 {
00101     if (m_separator == "") guessSeparator(line);
00102 
00103     QStringList list = StringBits::split(line, m_separator[0], m_allowQuoting);
00104 
00105     int cols = list.size();
00106     if (lineno == 0 || (cols < m_columnCount)) m_columnCount = cols;
00107     if (cols != m_columnCount) m_variableColumnCount = true;
00108 
00109     // All columns are regarded as having these qualities until we see
00110     // something that indicates otherwise:
00111 
00112     ColumnQualities defaultQualities =
00113         ColumnNumeric | ColumnIntegral | ColumnIncreasing;
00114     
00115     for (int i = 0; i < cols; ++i) {
00116             
00117         while (m_columnQualities.size() <= i) {
00118             m_columnQualities.push_back(defaultQualities);
00119             m_prevValues.push_back(0.f);
00120         }
00121 
00122         QString s(list[i]);
00123         bool ok = false;
00124 
00125         ColumnQualities qualities = m_columnQualities[i];
00126 
00127         bool numeric    = (qualities & ColumnNumeric);
00128         bool integral   = (qualities & ColumnIntegral);
00129         bool increasing = (qualities & ColumnIncreasing);
00130         bool large      = (qualities & ColumnLarge); // this one defaults to off
00131 
00132         float value = 0.f;
00133 
00135 
00136         if (numeric) {
00137             value = s.toFloat(&ok);
00138             if (!ok) {
00139                 value = (float)StringBits::stringToDoubleLocaleFree(s, &ok);
00140             }
00141             if (ok) {
00142                 if (lineno < 2 && value > 1000.f) large = true;
00143             } else {
00144                 numeric = false;
00145             }
00146         }
00147 
00148         if (numeric) {
00149 
00150             if (integral) {
00151                 if (s.contains('.') || s.contains(',')) {
00152                     integral = false;
00153                 }
00154             }
00155 
00156             if (increasing) {
00157                 if (lineno > 0 && value <= m_prevValues[i]) {
00158                     increasing = false;
00159                 }
00160             }
00161 
00162             m_prevValues[i] = value;
00163         }
00164 
00165         m_columnQualities[i] =
00166             (numeric    ? ColumnNumeric : 0) |
00167             (integral   ? ColumnIntegral : 0) |
00168             (increasing ? ColumnIncreasing : 0) |
00169             (large      ? ColumnLarge : 0);
00170     }
00171 
00172     if (lineno < 10) {
00173         m_example.push_back(list);
00174         if (lineno == 0 || cols > m_maxExampleCols) {
00175             m_maxExampleCols = cols;
00176         }
00177     }
00178 
00179 //    cerr << "Estimated column qualities: ";
00180 //    for (int i = 0; i < m_columnCount; ++i) {
00181 //        cerr << int(m_columnQualities[i]) << " ";
00182 //    }
00183 //    cerr << endl;
00184 }
00185 
00186 void
00187 CSVFormat::guessPurposes()
00188 {
00189     m_timingType = CSVFormat::ImplicitTiming;
00190     m_timeUnits = CSVFormat::TimeWindows;
00191         
00192     int timingColumnCount = 0;
00193     
00194     for (int i = 0; i < m_columnCount; ++i) {
00195         
00196         ColumnPurpose purpose = ColumnUnknown;
00197         bool primary = (i == 0);
00198 
00199         ColumnQualities qualities = m_columnQualities[i];
00200 
00201         bool numeric    = (qualities & ColumnNumeric);
00202         bool integral   = (qualities & ColumnIntegral);
00203         bool increasing = (qualities & ColumnIncreasing);
00204         bool large      = (qualities & ColumnLarge);
00205 
00206         bool timingColumn = (numeric && increasing);
00207 
00208         if (timingColumn) {
00209 
00210             ++timingColumnCount;
00211                               
00212             if (primary) {
00213 
00214                 purpose = ColumnStartTime;
00215 
00216                 m_timingType = ExplicitTiming;
00217 
00218                 if (integral && large) {
00219                     m_timeUnits = TimeAudioFrames;
00220                 } else {
00221                     m_timeUnits = TimeSeconds;
00222                 }
00223 
00224             } else {
00225 
00226                 if (timingColumnCount == 2 && m_timingType == ExplicitTiming) {
00227                     purpose = ColumnEndTime;
00228                 }
00229             }
00230         }
00231 
00232         if (purpose == ColumnUnknown) {
00233             if (numeric) {
00234                 purpose = ColumnValue;
00235             } else {
00236                 purpose = ColumnLabel;
00237             }
00238         }
00239 
00240         setColumnPurpose(i, purpose);
00241     }            
00242 
00243     int valueCount = 0;
00244     for (int i = 0; i < m_columnCount; ++i) {
00245         if (m_columnPurposes[i] == ColumnValue) ++valueCount;
00246     }
00247 
00248     if (valueCount == 2 && timingColumnCount == 1) {
00249         // If we have exactly two apparent value columns and only one
00250         // timing column, but one value column is integral and the
00251         // other is not, guess that whichever one matches the integral
00252         // status of the time column is either duration or end time
00253         if (m_timingType == ExplicitTiming) {
00254             int a = -1, b = -1;
00255             for (int i = 0; i < m_columnCount; ++i) {
00256                 if (m_columnPurposes[i] == ColumnValue) {
00257                     if (a == -1) a = i;
00258                     else b = i;
00259                 }
00260             }
00261             if ((m_columnQualities[a] & ColumnIntegral) !=
00262                 (m_columnQualities[b] & ColumnIntegral)) {
00263                 int timecol = a;
00264                 if ((m_columnQualities[a] & ColumnIntegral) !=
00265                     (m_columnQualities[0] & ColumnIntegral)) {
00266                     timecol = b;
00267                 }
00268                 if (m_columnQualities[timecol] & ColumnIncreasing) {
00269                     // This shouldn't happen; should have been settled above
00270                     m_columnPurposes[timecol] = ColumnEndTime;
00271                 } else {
00272                     m_columnPurposes[timecol] = ColumnDuration;
00273                 }
00274                 --valueCount;
00275             }
00276         }
00277     }
00278 
00279     if (timingColumnCount > 1) {
00280         m_modelType = TwoDimensionalModelWithDuration;
00281     } else {
00282         if (valueCount == 0) {
00283             m_modelType = OneDimensionalModel;
00284         } else if (valueCount == 1) {
00285             m_modelType = TwoDimensionalModel;
00286         } else {
00287             m_modelType = ThreeDimensionalModel;
00288         }
00289     }
00290 
00291 //    cerr << "Estimated column purposes: ";
00292 //    for (int i = 0; i < m_columnCount; ++i) {
00293 //        cerr << int(m_columnPurposes[i]) << " ";
00294 //    }
00295 //    cerr << endl;
00296 
00297 //    cerr << "Estimated model type: " << m_modelType << endl;
00298 //    cerr << "Estimated timing type: " << m_timingType << endl;
00299 //    cerr << "Estimated units: " << m_timeUnits << endl;
00300 }
00301 
00302 CSVFormat::ColumnPurpose
00303 CSVFormat::getColumnPurpose(int i)
00304 {
00305     while (m_columnPurposes.size() <= i) {
00306         m_columnPurposes.push_back(ColumnUnknown);
00307     }
00308     return m_columnPurposes[i];
00309 }
00310 
00311 CSVFormat::ColumnPurpose
00312 CSVFormat::getColumnPurpose(int i) const
00313 {
00314     if (m_columnPurposes.size() <= i) {
00315         return ColumnUnknown;
00316     }
00317     return m_columnPurposes[i];
00318 }
00319 
00320 void
00321 CSVFormat::setColumnPurpose(int i, ColumnPurpose p)
00322 {
00323     while (m_columnPurposes.size() <= i) {
00324         m_columnPurposes.push_back(ColumnUnknown);
00325     }
00326     m_columnPurposes[i] = p;
00327 }
00328 
00329 
00330 
00331