00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef OPENMS_MATH_STATISTICS_LINEARREGRESSION_H
00029 #define OPENMS_MATH_STATISTICS_LINEARREGRESSION_H
00030
00031 #include<OpenMS/CONCEPT/Types.h>
00032
00033 #include<iostream>
00034 #include<vector>
00035 #include<gsl/gsl_fit.h>
00036 #include<gsl/gsl_statistics.h>
00037 #include<gsl/gsl_cdf.h>
00038
00039 using namespace std;
00040
00041 namespace OpenMS
00042 {
00043 namespace Math
00044 {
00065 template <typename Iterator>
00066 class LinearRegression
00067 {
00068 public:
00069
00071 LinearRegression()
00072 : intercept_(0),
00073 slope_(0),
00074 x_intercept_(0),
00075 lower_(0),
00076 upper_(0),
00077 t_star_(0),
00078 r_squared_(0),
00079 stand_dev_residuals_(0),
00080 mean_residuals_(0),
00081 stand_error_slope_(0),
00082 chi_squared_(0),
00083 rsd_(0)
00084 {}
00085
00087 LinearRegression ( LinearRegression const & arg )
00088 : intercept_(arg.intercept_),
00089 slope_(arg.slope_),
00090 x_intercept_(arg.x_intercept_),
00091 lower_(arg.lower_),
00092 upper_(arg.upper_),
00093 t_star_(arg.t_star_),
00094 r_squared_(arg.r_squared_),
00095 stand_dev_residuals_(arg.stand_dev_residuals_),
00096 mean_residuals_(arg.mean_residuals_),
00097 stand_error_slope_(arg.stand_error_slope_),
00098 chi_squared_(arg.chi_squared_),
00099 rsd_(arg.rsd_)
00100 {}
00101
00103 LinearRegression & operator = ( LinearRegression const & arg )
00104 {
00105
00106 if (this == &arg)
00107 {
00108 return *this;
00109 }
00110
00111 intercept_ = arg.intercept_;
00112 slope_ = arg.slope_;
00113 x_intercept_ = arg.x_intercept_;
00114 lower_ = arg.lower_;
00115 upper_ = arg.upper_;
00116 t_star_ = arg.t_star_;
00117 r_squared_ = arg.r_squared_;
00118 stand_dev_residuals_ = arg.stand_dev_residuals_;
00119 mean_residuals_ = arg.mean_residuals_;
00120 stand_error_slope_ = arg.stand_error_slope_;
00121 chi_squared_ = arg.chi_squared_;
00122 rsd_ = arg.rsd_;
00123
00124 return *this;
00125 }
00126
00127
00129 virtual ~LinearRegression()
00130 {}
00131
00132
00145 int computeInterceptXAxis
00146 (double confidence_interval_P,
00147 Iterator x_begin,
00148 Iterator x_end,
00149 Iterator y_begin);
00150
00163 int computeInterceptXAxisWeighted
00164 (double confidence_interval_P,
00165 Iterator x_begin,
00166 Iterator x_end,
00167 Iterator y_begin,
00168 Iterator w_begin);
00169
00170
00172 inline DoubleReal getIntercept() const
00173 {
00174 return intercept_;
00175 }
00177 inline DoubleReal getSlope() const
00178 {
00179 return slope_;
00180 }
00182 inline DoubleReal getXIntercept() const
00183 {
00184 return x_intercept_;
00185 }
00187 inline DoubleReal getLower() const
00188 {
00189 return lower_;
00190 }
00192 inline DoubleReal getUpper() const
00193 {
00194 return upper_;
00195 }
00197 inline DoubleReal getTValue() const
00198 {
00199 return t_star_;
00200 }
00202 inline DoubleReal getRSquared() const
00203 {
00204 return r_squared_;
00205 }
00207 inline DoubleReal getStandDevRes() const
00208 {
00209 return stand_dev_residuals_;
00210 }
00211
00213 inline DoubleReal getMeanRes() const
00214 {
00215 return mean_residuals_;
00216 }
00217
00219 inline DoubleReal getStandErrSlope() const
00220 {
00221 return stand_error_slope_;
00222 }
00224 inline DoubleReal getChiSquared() const
00225 {
00226 return chi_squared_;
00227 }
00228
00230 inline DoubleReal getRSD() const
00231 {
00232 return rsd_;
00233 }
00234
00235
00236 protected:
00237
00239 double intercept_;
00241 double slope_;
00243 double x_intercept_;
00245 double lower_;
00247 double upper_;
00249 double t_star_;
00251 double r_squared_;
00253 double stand_dev_residuals_;
00255 double mean_residuals_;
00257 double stand_error_slope_;
00259 double chi_squared_;
00261 double rsd_;
00262
00263
00265 void computeGoodness_
00266 (double* X,
00267 double* Y,
00268 int N,
00269 double confidence_interval_P);
00270
00272 void iteratorRange2Arrays_
00273 (Iterator x_begin,
00274 Iterator x_end,
00275 Iterator y_begin,
00276 double* x_array,
00277 double* y_array);
00278
00283 void iteratorRange3Arrays_
00284 (Iterator x_begin,
00285 Iterator x_end,
00286 Iterator y_begin,
00287 Iterator w_begin,
00288 double* x_array,
00289 double* y_array,
00290 double* w_array);
00291
00292 };
00293
00294
00295 template <typename Iterator>
00296 int LinearRegression<Iterator>::computeInterceptXAxis
00297 (double confidence_interval_P,
00298 Iterator x_begin,
00299 Iterator x_end,
00300 Iterator y_begin
00301 )
00302 {
00303 int N=distance(x_begin,x_end);
00304
00305 double* X = new double[N];
00306 double* Y = new double[N];
00307 iteratorRange2Arrays_(x_begin,x_end,y_begin,X,Y);
00308
00309 double cov00, cov01, cov11;
00310
00311
00312
00313
00314 int error = gsl_fit_linear (X,1,Y,1,N,&intercept_, &slope_, &cov00, &cov01, &cov11, &chi_squared_);
00315
00316 if (!error)
00317 {
00318 computeGoodness_(X,Y,N,confidence_interval_P);
00319 }
00320
00321 delete [] X;
00322 delete [] Y;
00323
00324 return error;
00325 }
00326
00327
00328 template <typename Iterator>
00329 int LinearRegression<Iterator>::computeInterceptXAxisWeighted
00330 (double confidence_interval_P,
00331 Iterator x_begin,
00332 Iterator x_end,
00333 Iterator y_begin,
00334 Iterator w_begin
00335 )
00336 {
00337 int N=distance(x_begin,x_end);
00338
00339 double* X = new double[N];
00340 double* Y = new double[N];
00341 double* W = new double[N];
00342 iteratorRange3Arrays_(x_begin,x_end,y_begin,w_begin,X,Y,W);
00343
00344 double cov00, cov01, cov11;
00345
00346
00347
00348
00349 int error = gsl_fit_wlinear (X,1,W,1,Y,1,N,&intercept_, &slope_, &cov00, &cov01, &cov11, &chi_squared_);
00350
00351 if (!error)
00352 {
00353 computeGoodness_(X,Y,N,confidence_interval_P);
00354 }
00355
00356 delete [] X;
00357 delete [] Y;
00358 delete [] W;
00359
00360 return error;
00361 }
00362
00363
00364 template <typename Iterator>
00365 void LinearRegression<Iterator>::computeGoodness_(double* X, double* Y,int N, double confidence_interval_P)
00366 {
00367
00368 double var_X = gsl_stats_variance(X,1,N);
00369 double var_Y = gsl_stats_variance(Y,1,N);
00370 double cov_XY = gsl_stats_covariance(X,1,Y,1,N);
00371
00372
00373 double x_mean = gsl_stats_mean(X,1,N);
00374 double y_mean = gsl_stats_mean(Y,1,N);
00375
00376
00377 double s_XX = 0;
00378 for (int i=0; i<N; ++i)
00379 {
00380 double d=(X[i]-x_mean);
00381 s_XX +=d*d;
00382 }
00383
00384
00385 r_squared_=(cov_XY*cov_XY)/(var_X*var_Y);
00386
00387
00388 double sum = 0;
00389 for (Int i = 0; i < N; ++i)
00390 {
00391 double x_i = fabs(Y[i] - (intercept_ + slope_ * X[i]));
00392 sum += x_i;
00393 }
00394 mean_residuals_ = sum / N;
00395 stand_dev_residuals_ = sqrt((chi_squared_ - (sum*sum)/N)/(N-1));
00396
00397
00398 stand_error_slope_=stand_dev_residuals_/sqrt(s_XX);
00399
00400
00401 x_intercept_= -(intercept_/slope_);
00402
00403 double P=1-(1-confidence_interval_P)/2;
00404 t_star_=gsl_cdf_tdist_Pinv(P,N-2);
00405
00406
00407 double g =(t_star_/(slope_/stand_error_slope_));
00408 g *= g;
00409 double left = (x_intercept_-x_mean)*g;
00410 double bottom = 1-g;
00411 double d=(x_intercept_-x_mean);
00412 double right = t_star_*(stand_dev_residuals_/slope_)*sqrt((d*d)/s_XX + (bottom/N));
00413
00414
00415 lower_ = x_intercept_+(left+right)/bottom;
00416 upper_ = x_intercept_+(left-right)/bottom;
00417
00418 if (lower_ > upper_)
00419 {
00420 std::swap(lower_,upper_);
00421 }
00422
00423 double tmp = 0;
00424 for (Int i = 0; i<N;++i)
00425 {
00426 tmp += (X[i] - x_mean) * (X[i] - x_mean);
00427 }
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439 rsd_ = (100.0 / fabs( x_intercept_ )) * (stand_dev_residuals_ / slope_) * sqrt( (1.0/ (double) N) * (y_mean / (slope_ * slope_ * tmp ) ) ) ;
00440
00441 if (rsd_ < 0.0)
00442 {
00443 std::cout << "rsd < 0.0 " << std::endl;
00444 std::cout << "Intercept " << intercept_
00445 << "\nSlope " << slope_
00446 << "\nSquared pearson coefficient " << r_squared_
00447 << "\nValue of the t-distribution " << t_star_
00448 << "\nStandard deviation of the residuals " << stand_dev_residuals_
00449 << "\nStandard error of the slope " << stand_error_slope_
00450 << "\nThe X intercept " << x_intercept_
00451 << "\nThe lower border of confidence interval " << lower_
00452 << "\nThe higher border of confidence interval " << upper_
00453 << "\nChi squared value " << chi_squared_
00454 << "\nx mean " << x_mean
00455 << "\nstand_error_slope/slope_ " << (stand_dev_residuals_/slope_)
00456 << "\nCoefficient of Variation " << (stand_dev_residuals_/slope_)/x_mean*100 << std::endl
00457 << "========================================="
00458 << std::endl;
00459 }
00460
00461
00462 }
00463
00464
00465
00466 template <typename Iterator>
00467 void LinearRegression<Iterator>::iteratorRange2Arrays_
00468 (Iterator x_begin,
00469 Iterator x_end,
00470 Iterator y_begin,
00471 double* x_array,
00472 double* y_array)
00473 {
00474 int i=0;
00475 while (x_begin < x_end)
00476 {
00477 x_array[i]= *x_begin;
00478 y_array[i]= *y_begin;
00479 ++x_begin;
00480 ++y_begin;
00481 ++i;
00482 }
00483 }
00484
00485
00486 template <typename Iterator>
00487 void LinearRegression<Iterator>::iteratorRange3Arrays_
00488 (Iterator x_begin,
00489 Iterator x_end,
00490 Iterator y_begin,
00491 Iterator w_begin,
00492 double* x_array,
00493 double* y_array,
00494 double* w_array)
00495 {
00496 int i=0;
00497 while (x_begin < x_end)
00498 {
00499 x_array[i]=*x_begin;
00500 y_array[i]=*y_begin;
00501 w_array[i]=*w_begin;
00502 ++x_begin;
00503 ++y_begin;
00504 ++w_begin;
00505 ++i;
00506 }
00507 }
00508 }
00509 }
00510
00511
00512 #endif