00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifndef OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H
00028 #define OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H
00029
00030
00031 #include <OpenMS/DATASTRUCTURES/DPosition.h>
00032 #include <OpenMS/DATASTRUCTURES/DefaultParamHandler.h>
00033
00034
00035 #include <vector>
00036 #include <math.h>
00037
00038
00039 namespace OpenMS
00040 {
00054 template <typename ClusterPoint = DPosition<2> >
00055 class HierarchicalClustering : public DefaultParamHandler
00056 {
00057 public:
00058
00059 typedef ClusterPoint ClusterPointType;
00060 typedef std::vector<uint> ClusterIdxType;
00061 typedef std::vector< ClusterIdxType > ClusterIdxVectorType;
00062 using DefaultParamHandler::param_;
00063 using DefaultParamHandler::defaults_;
00064
00065 typedef std::vector <double> DistanceLineType;
00066 typedef std::vector < DistanceLineType > DistanceMatrixType;
00067
00069 enum LINKAGE_TYPE {COMPLETE_LINKAGE=0, SINGLE_LINKAGE};
00070
00074
00075 HierarchicalClustering()
00076 : DefaultParamHandler("HierarchicalClustering")
00077 {
00078 defaults_.setValue("cluster_cutoff", 40.0, "maximal distance allowed between two clusters before merging them", false);
00079 defaults_.setValue("linkage_type", COMPLETE_LINKAGE,"clustering method: 0=complete linkage 1=single_linkage", true);
00080
00081 defaultsToParam_();
00082 }
00083
00085 inline HierarchicalClustering(const HierarchicalClustering& source)
00086 : DefaultParamHandler(source),
00087 clustermap_(source.clustermap_)
00088 {}
00089
00091 inline HierarchicalClustering& operator=(const HierarchicalClustering& source)
00092 {
00093 if (&source==this)
00094 {
00095 return *this;
00096 }
00097
00098 DefaultParamHandler::operator=(source);
00099 clustermap_ = source.clustermap_;
00100
00101 return *this;
00102 };
00103
00104
00105
00107 virtual ~HierarchicalClustering()
00108 {
00109 };
00111
00115
00117 const ClusterIdxVectorType & getClusters() const
00118 {
00119 return clustermap_;
00120 }
00121
00123
00125 void compute(const std::vector<ClusterPointType>& points) throw (Exception::NotImplemented)
00126 {
00127 double cutoff = param_.getValue("cluster_cutoff");
00128 int linkage_method = param_.getValue("linkage_type");
00129
00130
00131
00132 clustermap_.clear();
00133 ClusterIdxVectorType clustermap_tmp(points.size(), ClusterIdxType(1));
00134 clustermap_.assign(clustermap_tmp.begin(),clustermap_tmp.end());
00135
00136 for (uint i=0; i<points.size(); ++i)
00137 {
00138 clustermap_[i][0] = i;
00139 }
00140
00141
00142
00143 DistanceMatrixType distanceMatrix(points.size(), DistanceLineType(1));
00144
00145
00146 for (uint i=1; i<points.size(); ++i)
00147 {
00148 distanceMatrix[i].resize(i);
00149 for (uint j=0; j<i; ++j)
00150 {
00151 distanceMatrix[i][j] = getDistance_(points[i], points[j]);
00152
00153 }
00154
00155 }
00156
00157 for (uint i=0; i<points.size(); ++i)
00158 {
00159
00160 }
00161
00162
00163 double number_of_clusters = points.size()-1;
00164
00165 double minDistance;
00166 uint minRow=0, minColumn=0;
00167
00168 getMinDistance_(distanceMatrix, minDistance, minRow, minColumn);
00169
00170
00171 while ((minDistance < cutoff) && (number_of_clusters>=2))
00172 {
00173
00174
00175
00176 getMinDistance_(distanceMatrix, minDistance, minRow, minColumn);
00177
00178
00179
00180
00181
00182 while (!clustermap_[minColumn].empty())
00183 {
00184
00185 clustermap_[minRow].push_back( clustermap_[minColumn].back() );
00186 clustermap_[minColumn].pop_back();
00187 }
00188
00189
00190 if (linkage_method==COMPLETE_LINKAGE)
00191 {
00192 linkageComplete_(distanceMatrix, minRow, minColumn);
00193 }
00194 else if (linkage_method==SINGLE_LINKAGE)
00195 {
00196 linkageSingle_(distanceMatrix, minRow, minColumn);
00197 }
00198 else
00199 {
00200 std::cerr << "Linkage method not available" << linkage_method << "\n";
00201 throw Exception::NotImplemented(__FILE__,__LINE__,__FUNCTION__);
00202 }
00203
00204
00205 distanceMatrix[minColumn].clear();
00206
00207 --number_of_clusters;
00208 }
00209
00210
00211
00212
00213 ClusterIdxVectorType cl_tmp;
00214 ClusterIdxVectorType::iterator iter = clustermap_.begin();
00215 while ( iter != clustermap_.end())
00216 {
00217 if (iter->empty())
00218 {
00219 iter = clustermap_.erase(iter, iter+1);
00220 } else
00221 {
00222 ++iter;
00223 }
00224 }
00225
00226 return;
00227 }
00228
00229
00230 void printStatistics (std::ostream& os)
00231 {
00232 if (clustermap_.empty())
00233 {
00234 os << "no clusters defined! call 'compute()' first!\n";
00235 }
00236 else
00237 {
00238 os << "\nCluster size statistics:\n";
00239 uint maxsize = 0;
00240 for (uint i=0; i<clustermap_.size(); ++i)
00241 {
00242 if (clustermap_[i].size() > maxsize) maxsize = clustermap_[i].size();
00243 }
00244
00245 std::vector<uint> size_dist(maxsize+1);
00246 for (uint i=0; i<clustermap_.size(); ++i)
00247 {
00248 ++size_dist[clustermap_[i].size()];
00249 }
00250
00251 for (uint i=0; i<maxsize+1; ++i)
00252 {
00253 if (size_dist[i]>0)
00254 os << size_dist[i] << " of size " << i << "\n";
00255 }
00256
00257 }
00258 return;
00259 }
00260
00261 protected:
00262
00265 inline void getMinDistance_(const DistanceMatrixType& distanceMatrix, double& minDistance, uint& minRow, uint& minColumn)
00266 {
00267 minDistance = std::numeric_limits<double>::max();
00268
00269 for (uint i=1; i<distanceMatrix.size(); ++i)
00270 {
00271 if (!distanceMatrix[i].empty())
00272 {
00273
00274 for (uint j=0; j<i; ++j)
00275 {
00276
00277 if (!distanceMatrix[j].empty())
00278 {
00279 if (minDistance > distanceMatrix[i][j])
00280 {
00281 minDistance = distanceMatrix[i][j];
00282 minRow = i;
00283 minColumn = j;
00284 }
00285 }
00286 }
00287
00288 }
00289 }
00290
00291
00292
00293 return;
00294 }
00295
00297 inline double getDistance_(const ClusterPointType& a, const ClusterPointType& b)
00298 {
00299
00300 double distance = 0;
00301 for (uint i=0; i<a.size(); ++i)
00302 {
00303 distance += pow(a[i]-b[i], 2);
00304 }
00305
00306 return sqrt(distance);
00307 }
00308
00311 inline void linkageComplete_(DistanceMatrixType& distanceMatrix, const uint& minRow, const uint& minColumn)
00312 {
00313
00314 for (uint i=0; i<distanceMatrix.size(); ++i)
00315 {
00316
00317 if (i>minRow)
00318 {
00319 distanceMatrix[i][minRow] = std::max(distanceMatrix[i][minRow], distanceMatrix[i][minColumn]);
00320 }
00321 else if (i<minColumn)
00322 {
00323 distanceMatrix[minRow][i] = std::max(distanceMatrix[minRow][i], distanceMatrix[minColumn][i]);
00324 }
00325 else if ((i<minRow) && (i>minColumn))
00326 {
00327 distanceMatrix[minRow][i] = std::max(distanceMatrix[minRow][i], distanceMatrix[i][minColumn]);
00328 }
00329
00330
00331 }
00332
00333 return;
00334 };
00335
00338 inline void linkageSingle_(DistanceMatrixType& distanceMatrix, const uint& minRow, const uint& minColumn)
00339 {
00340
00341 for (uint i=0; i<distanceMatrix.size(); ++i)
00342 {
00343
00344 if (i>minRow)
00345 {
00346 distanceMatrix[i][minRow] = std::min(distanceMatrix[i][minRow], distanceMatrix[i][minColumn]);
00347 }
00348 else if (i<minColumn)
00349 {
00350 distanceMatrix[minRow][i] = std::min(distanceMatrix[minRow][i], distanceMatrix[minColumn][i]);
00351 }
00352 else if ((i<minRow) && (i>minColumn))
00353 {
00354 distanceMatrix[minRow][i] = std::min(distanceMatrix[minRow][i], distanceMatrix[i][minColumn]);
00355 }
00356
00357 }
00358
00359 return;
00360 };
00361
00363 ClusterIdxVectorType clustermap_;
00364 };
00365 }
00366
00367 #endif // OPENMS_COMPARISON_CLUSTERING_HIERARCHICALCLUSTERING_H