Marsyas  0.6.0-alpha
/usr/src/RPM/BUILD/marsyas-0.6.0/src/marsyas/marsystems/PeakClusterSelect.cpp
Go to the documentation of this file.
00001 /*
00002 ** Copyright (C) 1998-2006 George Tzanetakis <gtzan@cs.uvic.ca>
00003 **
00004 ** This program is free software; you can redistribute it and/or modify
00005 ** it under the terms of the GNU General Public License as published by
00006 ** the Free Software Foundation; either version 2 of the License, or
00007 ** (at your option) any later version.
00008 **
00009 ** This program is distributed in the hope that it will be useful,
00010 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
00011 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012 ** GNU General Public License for more details.
00013 **
00014 ** You should have received a copy of the GNU General Public License
00015 ** along with this program; if not, write to the Free Software
00016 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
00017 */
00018 
00019 #include "PeakClusterSelect.h"
00020 #include "../common_source.h"
00021 #include <algorithm>
00022 
00023 using std::min;
00024 using std::max;
00025 using std::ostringstream;
00026 using namespace Marsyas;
00027 
00028 static std::ofstream outTextFile;
00029 //#define MTLB_DBG_LOG
00030 //#define DBG_FILE_OUT
00031 #ifdef DBG_FILE_OUT
00032 static const std::string outFileName("d:/temp/peakcluster.txt");
00033 #endif
00034 
00035 
00036 PeakClusterSelect::PeakClusterSelect(mrs_string name):MarSystem("PeakClusterSelect", name)
00037 {
00038   addControls();
00039 }
00040 
00041 PeakClusterSelect::PeakClusterSelect(const PeakClusterSelect& a) : MarSystem(a)
00042 {
00043   ctrl_numClustersToKeep_ = getctrl("mrs_natural/numClustersToKeep");
00044 }
00045 
00046 PeakClusterSelect::~PeakClusterSelect()
00047 {
00048 #ifdef DBG_FILE_OUT
00049   outTextFile.close ();
00050 #endif
00051 }
00052 
00053 MarSystem*
00054 PeakClusterSelect::clone() const
00055 {
00056   return new PeakClusterSelect(*this);
00057 }
00058 
00059 void
00060 PeakClusterSelect::addControls()
00061 {
00062   addctrl("mrs_natural/numClustersToKeep", 1, ctrl_numClustersToKeep_);
00063   ctrl_numClustersToKeep_->setState(false);
00064 }
00065 
00066 void
00067 PeakClusterSelect::myUpdate(MarControlPtr sender)
00068 {
00069   (void) sender;  //suppress warning of unused parameter(s)
00070   MRSDIAG("PeakClusterSelect.cpp - PeakClusterSelect:myUpdate");
00071 
00072   ctrl_onObservations_->setValue(1, NOUPDATE);
00073   ctrl_onSamples_->setValue(ctrl_inSamples_, NOUPDATE);
00074   ctrl_osrate_->setValue(ctrl_israte_, NOUPDATE);
00075   ctrl_onObsNames_->setValue("peakLabels", NOUPDATE);
00076 
00077 #ifdef DBG_FILE_OUT
00078   if (!outTextFile.good () || ! outTextFile.is_open ())
00079     outTextFile.open(outFileName.c_str ());
00080 #endif
00081 }
00082 
00083 void
00084 PeakClusterSelect::sort(realvec& rv, mrs_natural dimension, mrs_natural left, mrs_natural right, mrs_bool sortColumns)
00085 {
00086   if( left < right )
00087   {
00088     int part = partition(rv, dimension, left, right, sortColumns);
00089     sort(rv, dimension, left, part-1, sortColumns);
00090     sort(rv, dimension, part+1, right, sortColumns);
00091   }
00092 }
00093 
00094 int
00095 PeakClusterSelect::partition(realvec& rv, mrs_natural dimension, mrs_natural left, mrs_natural right, mrs_bool sortColumns)
00096 {
00097   // Not quite fair, but good enough random partitioning
00098   int pivot_i = rand()%(right-left+1) + left;
00099   // Place pivot val at the end of the series
00100   swap(rv, pivot_i, right, sortColumns);
00101 
00102   mrs_real pivot_val;
00103   if( sortColumns )
00104     pivot_val = rv(dimension,right);
00105   else
00106     pivot_val = rv(right,dimension);
00107 
00108   int i = left-1;
00109 
00110   if( sortColumns )
00111   {
00112     for( int j=left ; j<right ; j++ )
00113     {
00114       if( rv(dimension,j) <= pivot_val )
00115       {
00116         ++i;
00117         swap(rv, i, j, sortColumns);
00118       }
00119     }
00120     // re-insert pivot val
00121     swap(rv, i+1, right, sortColumns);
00122   }
00123   else
00124   {
00125     for( int j=left ; j<right ; j++ )
00126     {
00127       if( rv(j,dimension) <= pivot_val )
00128       {
00129         ++i;
00130         swap(rv, i, j, sortColumns);
00131       }
00132     }
00133     // re-insert pivot val
00134     swap(rv, i+1, right, sortColumns);
00135   }
00136 
00137   return i+1;
00138 }
00139 
00140 void
00141 PeakClusterSelect::swap(realvec& rv, mrs_natural sample1, mrs_natural sample2, mrs_bool swapColumns)
00142 {
00143   if( swapColumns ) // swap two columns
00144   {
00145     int rows = rv.getRows();
00146     mrs_real tmp;
00147 
00148     for( int i=0 ; i<rows ; ++i )
00149     {
00150       tmp = rv(i, sample1);
00151       rv(i,sample1) = rv(i,sample2);
00152       rv(i,sample2) = tmp;
00153     }
00154   }
00155   else // swap two rows
00156   {
00157     int cols = rv.getCols();
00158     mrs_real tmp;
00159 
00160     for( int i=0 ; i<cols ; ++i )
00161     {
00162       tmp = rv(sample1,i);
00163       rv(sample1,i) = rv(sample2,i);
00164       rv(sample2,i) = tmp;
00165     }
00166   }
00167 }
00168 
00169 void
00170 PeakClusterSelect::myProcess(realvec& in, realvec& out)
00171 {
00172   mrs_natural t;
00173   mrs_natural numClustersToKeep = ctrl_numClustersToKeep_->to<mrs_natural>();
00174   mrs_natural curNumClusters=-1, i, j, k, curClusterLabel;
00175   mrs_natural numPeaks = ctrl_inSamples_->to<mrs_natural>();
00176 
00177   // Determine number of clusters in input by finding maximum index plus 1
00178   for (t = 0; t < inSamples_; t++)
00179     if( in(0,t) > curNumClusters )
00180       curNumClusters = (mrs_natural)in(0,t);
00181   curNumClusters += 1;
00182 
00183   mrs_realvec intraClusterSimilarities(3, curNumClusters);
00184   mrs_realvec clusterSimilarity (curNumClusters, curNumClusters);
00185   mrs_realvec clusterNorm (curNumClusters, curNumClusters);
00186   mrs_realvec keepMe(curNumClusters);
00187   mrs_real  intraSimKeepThresh  = .5;
00188 
00189   clusterSimilarity.setval (0.);
00190   clusterNorm.setval (0.);
00191   keepMe.setval (1.);
00192 
00193   for( i=0 ; i<curNumClusters ; ++i )
00194   {
00195     // Store cluster index in first row
00196     // (so upon sorting of density values,
00197     // we have reference to the cluster)
00198     intraClusterSimilarities(0,i) = i;
00199     // Keep track of the number of peaks in each cluster
00200     intraClusterSimilarities(1,i) = 0;
00201     // Finally, keep track of the sum of intra-cluster similarities
00202     intraClusterSimilarities(2,i) = 0;
00203   }
00204 
00205 
00206   // compute the similarity of each peak to each each other
00207   // accumulate per cluster pair
00208   for( i=0 ; i<numPeaks ; ++i )
00209   {
00210     mrs_realvec numAcc(curNumClusters);
00211     mrs_realvec similarity(curNumClusters);
00212 
00213     numAcc.setval (0.);
00214     similarity.setval (0.);
00215     for( j=0 ; j<numPeaks ; j++ )
00216     {
00217       if (i==j)
00218         continue;
00219       similarity((mrs_natural)(in(0,j)+.1))                             += in (i+1, j);
00220       clusterNorm ((mrs_natural)(in(0,i)+.1),(mrs_natural)(in(0,j)+.1)) += 1;
00221     }
00222     for (k = 0; k < curNumClusters; k++)
00223       clusterSimilarity ((mrs_natural)(in(0,i)+.1), k)  += similarity(k);
00224   }
00225 
00226   for (i = 0; i < curNumClusters; i++)
00227     for (j = 0; j < curNumClusters; j++)
00228       clusterSimilarity(i,j)    /= (clusterNorm(i,j)>0)? clusterNorm(i,j) : 1.;
00229 
00230   // compute something similar to silhouette coefficients
00231   mrs_realvec silhouetteCoeffs (curNumClusters);
00232   silhouetteCoeffs.setval(0.);
00233 
00234   for (k = 0; k < curNumClusters; k++)
00235   {
00236     mrs_real selfSim, maximum, minSim = 0;//1e37;
00237     selfSim = clusterSimilarity(k,k);
00238     for (i = 0; i < curNumClusters; i++)
00239     {
00240       if (i == k)
00241         continue;
00242       //minSim  = (clusterSimilarity(k,i) < minSim)?clusterSimilarity(k,i) :  minSim;
00243       minSim    += clusterSimilarity(k,i);
00244     }
00245     minSim  /= (curNumClusters-1);
00246 
00247     if ((maximum = max(selfSim, minSim)) != 0)
00248       silhouetteCoeffs(k)   = (selfSim - minSim)/ maximum;
00249   }
00250 
00251   // update output values
00252   for (k = 0; k < curNumClusters; k++)
00253     intraClusterSimilarities(2,k)   = clusterSimilarity(k,k);
00254 
00255 #ifdef MARSYAS_MATLAB
00256 #ifdef MTLB_DBG_LOG
00257   //MATLAB_PUT(clusterSimilarity, "sim");
00258   //MATLAB_PUT(curNumClusters, "numClust");
00259   //MATLAB_EVAL ("figure(13),imagesc(0:(numClust-1),0:(numClust-1),sim,[0 1]),colorbar");
00260   //MATLAB_PUT(silhouetteCoeffs, "sil");
00261   //MATLAB_EVAL ("figure(14),plot(0:(length(sil)-1), sil),axis([0 length(sil)-1 0 1]), grid on");
00262 #endif
00263 #endif
00264 
00265 
00266 
00267   // (Quick) Sort by intra-cluster similarity density
00268   sort( intraClusterSimilarities, 2, 0, curNumClusters-1 );
00269 
00270 
00271 #ifdef DBG_FILE_OUT
00272   if (outTextFile.good ())
00273   {
00274     if (curNumClusters < 6)
00275     {
00276       for (k = 0; k < 6-curNumClusters; k++)
00277         outTextFile << 0 <<"\t";
00278     }
00279     for (k = 0; k < curNumClusters; k++)
00280       outTextFile << silhouetteCoeffs(k) <<"\t"; ;
00281     //outTextFile << intraClusterSimilarities(2,k) <<"\t"; ;
00282 
00283     outTextFile << std::endl;
00284   }
00285 #endif
00286 
00287   // ignore setting of numclusters2Keep and set it by threshold of the intraclustersimilarity
00288   if (numClustersToKeep == 0)
00289   {
00290     const mrs_real intraThreshBounds[2] = {.3,.6};
00291     const mrs_real silThreshBound       = 1./curNumClusters;
00292     intraSimKeepThresh  = max((mrs_real)intraThreshBounds[0],min((mrs_real)intraThreshBounds[1],(mrs_real).5*((mrs_real)intraClusterSimilarities(2,0) + (mrs_real)intraClusterSimilarities(2, curNumClusters-1))));
00293     for (k = 0; k < curNumClusters; k++)
00294     {
00295       // intra cluster similarity threshold
00296       if (intraClusterSimilarities(2,k) < intraSimKeepThresh)
00297         keepMe(k)   = 0;
00298 
00299       // silhouette coefficient threshold
00300       if (silhouetteCoeffs((mrs_natural)(intraClusterSimilarities(0,k)+.1)) < silThreshBound)
00301         keepMe(k)   = 0;
00302     }
00303     numClustersToKeep   = (mrs_natural)(keepMe.sum () +.1);
00304 
00305     if (numClustersToKeep == curNumClusters)
00306       keepMe(0) = 0; // always abandon one cluster!
00307     //std::cout << numClustersToKeep << " " << std::endl;
00308   }
00309   else
00310   {
00311     for (k = 0; k < (curNumClusters - numClustersToKeep); k++)
00312       keepMe(k) = 0;
00313   }
00314 
00315   for (t = 0; t < inSamples_; t++)
00316   {
00317     curClusterLabel = (mrs_natural)in(0,t);
00318     out(0,t) = curClusterLabel; //signals clusters to be synthesized
00319 
00320     for( k=0 ; k < curNumClusters; k++)
00321     {
00322       if( curClusterLabel == intraClusterSimilarities(0,k) )
00323       {
00324         if (keepMe(k) < .5)
00325           out(0,t) = (curClusterLabel)? -curClusterLabel : -1; //negative value signals clusters to be discarded, zero is mapped to -1
00326         break;
00327       }
00328     }
00329   }
00330 }