Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages

HierarchicalClustering.h (Maintainer: Chris Bielow)

Go to the documentation of this file.
00001 // -*- Mode: C++; tab-width: 2; -*-
00002 // vi: set ts=2:
00003 //
00004 // --------------------------------------------------------------------------
00005 //                   OpenMS Mass Spectrometry Framework
00006 // --------------------------------------------------------------------------
00007 //  Copyright (C) 2003-2008 -- Oliver Kohlbacher, Knut Reinert
00008 //
00009 //  This library is free software; you can redistribute it and/or
00010 //  modify it under the terms of the GNU Lesser General Public
00011 //  License as published by the Free Software Foundation; either
00012 //  version 2.1 of the License, or (at your option) any later version.
00013 //
00014 //  This library is distributed in the hope that it will be useful,
00015 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017 //  Lesser General Public License for more details.
00018 //
00019 //  You should have received a copy of the GNU Lesser General Public
00020 //  License along with this library; if not, write to the Free Software
00021 //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00022 //
00023 // --------------------------------------------------------------------------
00024 // $Maintainer: Chris Bielow $
00025 // --------------------------------------------------------------------------
00026 
00027 #ifndef OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H
00028 #define OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H
00029 
00030 // OpenMS
00031 #include <OpenMS/DATASTRUCTURES/DPosition.h>
00032 #include <OpenMS/DATASTRUCTURES/DefaultParamHandler.h>
00033 
00034 // STL
00035 #include <vector>
00036 #include <math.h>
00037 
00038 
00039 namespace OpenMS
00040 {
00054   template <typename ClusterPoint = DPosition<2> >
00055   class HierarchicalClustering : public DefaultParamHandler
00056   {
00057     public:
00058     
00059       typedef ClusterPoint ClusterPointType;
00060       typedef std::vector<uint> ClusterIdxType;
00061       typedef std::vector< ClusterIdxType > ClusterIdxVectorType;
00062       using DefaultParamHandler::param_;
00063       using DefaultParamHandler::defaults_;
00064       
00065       typedef std::vector <double> DistanceLineType;
00066       typedef std::vector < DistanceLineType > DistanceMatrixType;
00067       
00069       enum LINKAGE_TYPE {COMPLETE_LINKAGE=0, SINGLE_LINKAGE};    
00070           
00074 
00075       HierarchicalClustering()
00076       : DefaultParamHandler("HierarchicalClustering")
00077       {
00078         defaults_.setValue("cluster_cutoff", 40.0, "maximal distance allowed between two clusters before merging them", false);           // maximal distance allowed between two clusters before merging them
00079         defaults_.setValue("linkage_type", COMPLETE_LINKAGE,"clustering method: 0=complete linkage 1=single_linkage", true);
00080     
00081         defaultsToParam_();
00082       }
00083 
00085       inline HierarchicalClustering(const HierarchicalClustering& source)
00086           : DefaultParamHandler(source),
00087           clustermap_(source.clustermap_)
00088       {}
00089 
00091       inline HierarchicalClustering& operator=(const HierarchicalClustering& source)
00092       {
00093         if (&source==this)
00094         {
00095           return *this;
00096         }
00097 
00098         DefaultParamHandler::operator=(source);
00099         clustermap_ = source.clustermap_;
00100 
00101         return *this;          
00102       };
00103 
00104         
00105 
00107       virtual ~HierarchicalClustering() 
00108       {
00109       };
00111 
00115 
00117       const ClusterIdxVectorType & getClusters() const
00118       {
00119         return clustermap_; 
00120       }
00121 
00123       
00125       void compute(const std::vector<ClusterPointType>& points) throw (Exception::NotImplemented)
00126       {
00127         double cutoff = param_.getValue("cluster_cutoff");
00128         int linkage_method = param_.getValue("linkage_type");
00129 
00130                 
00131         // initial result: every point represents a cluster
00132         clustermap_.clear();
00133         ClusterIdxVectorType clustermap_tmp(points.size(), ClusterIdxType(1));
00134         clustermap_.assign(clustermap_tmp.begin(),clustermap_tmp.end());
00135         
00136         for (uint i=0; i<points.size(); ++i)
00137         {
00138           clustermap_[i][0] = i;
00139         }
00140         
00141         
00142         // initial distances between points
00143         DistanceMatrixType distanceMatrix(points.size(), DistanceLineType(1));
00144                 
00145         // compute distance matrix
00146         for (uint i=1; i<points.size(); ++i)
00147         {
00148           distanceMatrix[i].resize(i);
00149           for (uint j=0; j<i; ++j)
00150           {
00151             distanceMatrix[i][j] = getDistance_(points[i], points[j]);
00152             //std::cout << getDistance_(points[i], points[j]) << " ";
00153           }
00154           //std::cout << "\n";
00155         }
00156         
00157         for (uint i=0; i<points.size(); ++i)
00158         {
00159           //std::cout << "size: " <<  distanceMatrix[i].size();
00160         }
00161         
00162         
00163         double number_of_clusters = points.size()-1;
00164         
00165         double minDistance;
00166         uint minRow=0, minColumn=0;
00167         
00168         getMinDistance_(distanceMatrix, minDistance, minRow, minColumn);
00169         
00170         // iterate until cutoff reached
00171         while ((minDistance < cutoff) && (number_of_clusters>=2))
00172         {
00173            //std::cout << "starting ...";
00174 
00175            // find closest two clusters
00176            getMinDistance_(distanceMatrix, minDistance, minRow, minColumn);
00177            
00178            //std::cout << "combining clusters " << (minRow) << " and " << minColumn << "\n";
00179            
00180            // combine clusters
00181            // IDEA? append to larger cluster (and swap afterwards if neccessary)
00182            while (!clustermap_[minColumn].empty())
00183            {
00184              //std::cout << "putting " << clustermap[minColumn].back() << " from " << minColumn << "->" << minRow << "\n";
00185              clustermap_[minRow].push_back( clustermap_[minColumn].back() );
00186              clustermap_[minColumn].pop_back();
00187            }
00188            
00189            // partially recompute distance matrix
00190            if (linkage_method==COMPLETE_LINKAGE)
00191            {
00192              linkageComplete_(distanceMatrix, minRow, minColumn);
00193            } 
00194            else if (linkage_method==SINGLE_LINKAGE)
00195            {
00196              linkageSingle_(distanceMatrix, minRow, minColumn);
00197            } 
00198            else
00199            {
00200               std::cerr << "Linkage method not available" << linkage_method << "\n";
00201               throw Exception::NotImplemented(__FILE__,__LINE__,__FUNCTION__);
00202            }
00203            
00204            //delete 2nd clusters row from matrix (don´t bother about columns - they won´t save memory)
00205            distanceMatrix[minColumn].clear();
00206 
00207            --number_of_clusters;
00208         }
00209         
00210         //std::cout << "HC: remove dead clusters ... \n";
00211                 
00212         // remove empty indices 
00213         ClusterIdxVectorType cl_tmp;
00214         ClusterIdxVectorType::iterator iter = clustermap_.begin();
00215         while ( iter != clustermap_.end())
00216         {
00217           if (iter->empty())
00218           {
00219             iter = clustermap_.erase(iter, iter+1);
00220           } else
00221           {
00222             ++iter;
00223           }       
00224         }
00225        
00226         return;
00227       }
00228 
00229      
00230       void printStatistics (std::ostream& os)
00231       {
00232         if (clustermap_.empty())
00233         {
00234           os << "no clusters defined! call 'compute()' first!\n";
00235         }
00236         else
00237         {
00238           os << "\nCluster size statistics:\n";
00239           uint maxsize = 0;
00240           for (uint i=0; i<clustermap_.size(); ++i)
00241           {
00242             if (clustermap_[i].size() > maxsize) maxsize = clustermap_[i].size();
00243           }
00244 
00245           std::vector<uint> size_dist(maxsize+1);
00246           for (uint i=0; i<clustermap_.size(); ++i)
00247           {
00248             ++size_dist[clustermap_[i].size()];  
00249           }
00250           
00251           for (uint i=0; i<maxsize+1; ++i)
00252           {
00253             if (size_dist[i]>0)
00254               os << size_dist[i] <<  " of size " << i << "\n";  
00255           }
00256 
00257         }
00258         return;
00259       }
00260 
00261     protected:
00262     
00265       inline void getMinDistance_(const DistanceMatrixType& distanceMatrix, double& minDistance, uint& minRow, uint& minColumn)
00266       {
00267         minDistance = std::numeric_limits<double>::max();
00268 
00269         for (uint i=1; i<distanceMatrix.size(); ++i)
00270         {
00271           if (!distanceMatrix[i].empty())
00272           {
00273             //std::cout << "line length: " << distanceMatrix[i].size() << " -- ";
00274             for (uint j=0; j<i; ++j)
00275             {
00276               //std::cout << "i,j " << i << "," << j << "- " << distanceMatrix[i][j] << "#  ";
00277               if (!distanceMatrix[j].empty())
00278               {
00279                 if (minDistance > distanceMatrix[i][j])
00280                 {
00281                   minDistance = distanceMatrix[i][j];
00282                   minRow = i;
00283                   minColumn = j;
00284                 }
00285               }
00286             }  // for j
00287             //std::cout << "\n";
00288           } 
00289         } // for i 
00290         
00291         //std::cout << "Distance_min " << minDistance << " row, column " << minRow << "," << minColumn << "\n";
00292         
00293         return;        
00294       }
00295     
00297       inline double getDistance_(const ClusterPointType& a, const ClusterPointType& b)
00298       {
00299          // (insert new distance calculation here)
00300          double distance = 0;
00301          for (uint i=0; i<a.size(); ++i)
00302          {
00303             distance += pow(a[i]-b[i], 2);
00304          }
00305          
00306          return sqrt(distance);
00307       }
00308 
00311       inline void linkageComplete_(DistanceMatrixType& distanceMatrix, const uint&  minRow, const uint& minColumn)
00312       {
00313         // recompute data matrix entries
00314         for (uint i=0; i<distanceMatrix.size(); ++i)
00315         {
00316           // we only have a triangular matrix ... so be careful with indices
00317           if (i>minRow) // && (i>minColumn) <-- can be assured, because minRow>minColumn!
00318           {
00319             distanceMatrix[i][minRow] = std::max(distanceMatrix[i][minRow], distanceMatrix[i][minColumn]); 
00320           }
00321           else if (i<minColumn) // && (i<minRow) <-- can be assured, because minRow>minColumn!
00322           {
00323             distanceMatrix[minRow][i] = std::max(distanceMatrix[minRow][i], distanceMatrix[minColumn][i]);  
00324           }
00325           else if ((i<minRow) && (i>minColumn))
00326           {  
00327             distanceMatrix[minRow][i] = std::max(distanceMatrix[minRow][i], distanceMatrix[i][minColumn]);  
00328           }
00329           
00330           
00331         }
00332         
00333         return;
00334       };
00335 
00338       inline void linkageSingle_(DistanceMatrixType& distanceMatrix, const uint& minRow, const uint& minColumn)
00339       {
00340         // recompute data matrix entries
00341         for (uint i=0; i<distanceMatrix.size(); ++i)
00342         {
00343           // we only have a triangular matrix ... so be careful with indices
00344           if (i>minRow) // && (i>minColumn) <-- can be assured, because minRow>minColumn!
00345           {
00346             distanceMatrix[i][minRow] = std::min(distanceMatrix[i][minRow], distanceMatrix[i][minColumn]); 
00347           }
00348           else if (i<minColumn) // && (i<minRow) <-- can be assured, because minRow>minColumn!
00349           {
00350             distanceMatrix[minRow][i] = std::min(distanceMatrix[minRow][i], distanceMatrix[minColumn][i]);  
00351           }
00352           else if ((i<minRow) && (i>minColumn))
00353           {  
00354             distanceMatrix[minRow][i] = std::min(distanceMatrix[minRow][i], distanceMatrix[i][minColumn]);  
00355           }
00356           
00357         }
00358         
00359         return;
00360       };    
00361     
00363       ClusterIdxVectorType clustermap_;
00364   };
00365 } // namespace OpenMS
00366 
00367 #endif // OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H   

Generated Tue Apr 1 15:36:35 2008 -- using doxygen 1.5.4 OpenMS / TOPP 1.1