OpenMS
StatisticFunctions.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Clemens Groepl, Johannes Junker, Mathias Walzer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 #pragma once
35 
36 #include <vector>
38 #include <OpenMS/CONCEPT/Types.h>
39 
40 #include <algorithm>
41 #include <cmath>
42 #include <iterator>
43 #include <numeric>
44 
45 namespace OpenMS
46 {
47 
48  namespace Math
49  {
50 
58  template <typename IteratorType>
59  static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
60  {
61  if (begin == end)
62  {
63  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
64  }
65  }
66 
74  template <typename IteratorType>
75  static void checkIteratorsEqual(IteratorType begin, IteratorType end)
76  {
77  if (begin != end)
78  {
79  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
80  }
81  }
82 
90  template <typename IteratorType1, typename IteratorType2>
92  IteratorType1 begin_b, IteratorType1 end_b,
93  IteratorType2 begin_a, IteratorType2 end_a)
94  {
95  if (begin_b != end_b && begin_a == end_a)
96  {
97  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
98  }
99  }
100 
106  template <typename IteratorType>
107  static double sum(IteratorType begin, IteratorType end)
108  {
109  return std::accumulate(begin, end, 0.0);
110  }
111 
119  template <typename IteratorType>
120  static double mean(IteratorType begin, IteratorType end)
121  {
122  checkIteratorsNotNULL(begin, end);
123  return sum(begin, end) / std::distance(begin, end);
124  }
125 
137  template <typename IteratorType>
138  static double median(IteratorType begin, IteratorType end,
139  bool sorted = false)
140  {
141  checkIteratorsNotNULL(begin, end);
142  if (!sorted)
143  {
144  std::sort(begin, end);
145  }
146 
147  Size size = std::distance(begin, end);
148  if (size % 2 == 0) // even size => average two middle values
149  {
150  IteratorType it1 = begin;
151  std::advance(it1, size / 2 - 1);
152  IteratorType it2 = it1;
153  std::advance(it2, 1);
154  return (*it1 + *it2) / 2.0;
155  }
156  else
157  {
158  IteratorType it = begin;
159  std::advance(it, (size - 1) / 2);
160  return *it;
161  }
162  }
163 
164 
184  template <typename IteratorType>
185  double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
186  {
187  std::vector<double> diffs;
188  diffs.reserve(std::distance(begin, end));
189  for (IteratorType it = begin; it != end; ++it)
190  {
191  diffs.push_back(fabs(*it - median_of_numbers));
192  }
193  return median(diffs.begin(), diffs.end(), false);
194  }
195 
214  template <typename IteratorType>
215  double MeanAbsoluteDeviation(IteratorType begin, IteratorType end, double mean_of_numbers)
216  {
217  double mean_value {0};
218  for (IteratorType it = begin; it != end; ++it)
219  {
220  mean_value += fabs(*it - mean_of_numbers);
221  }
222  return mean_value / std::distance(begin, end);
223  }
224 
238  template <typename IteratorType>
239  static double quantile1st(IteratorType begin, IteratorType end,
240  bool sorted = false)
241  {
242  checkIteratorsNotNULL(begin, end);
243 
244  if (!sorted)
245  {
246  std::sort(begin, end);
247  }
248 
249  Size size = std::distance(begin, end);
250  if (size % 2 == 0)
251  {
252  return median(begin, begin + (size/2)-1, true); //-1 to exclude median values
253  }
254  return median(begin, begin + (size/2), true);
255  }
256 
270  template <typename IteratorType>
271  static double quantile3rd(
272  IteratorType begin, IteratorType end, bool sorted = false)
273  {
274  checkIteratorsNotNULL(begin, end);
275  if (!sorted)
276  {
277  std::sort(begin, end);
278  }
279 
280  Size size = std::distance(begin, end);
281  return median(begin + (size/2)+1, end, true); //+1 to exclude median values
282  }
283 
293  template <typename IteratorType>
294  static double variance(IteratorType begin, IteratorType end,
295  double mean = std::numeric_limits<double>::max())
296  {
297  checkIteratorsNotNULL(begin, end);
298  double sum_value = 0.0;
299  if (mean == std::numeric_limits<double>::max())
300  {
301  mean = Math::mean(begin, end);
302  }
303  for (IteratorType iter=begin; iter!=end; ++iter)
304  {
305  double diff = *iter - mean;
306  sum_value += diff * diff;
307  }
308  return sum_value / (std::distance(begin, end)-1);
309  }
310 
320  template <typename IteratorType>
321  static double sd(IteratorType begin, IteratorType end,
322  double mean = std::numeric_limits<double>::max())
323  {
324  checkIteratorsNotNULL(begin, end);
325  return std::sqrt( variance(begin, end, mean) );
326  }
327 
335  template <typename IteratorType>
336  static double absdev(IteratorType begin, IteratorType end,
337  double mean = std::numeric_limits<double>::max())
338  {
339  checkIteratorsNotNULL(begin, end);
340  double sum_value = 0.0;
341  if (mean == std::numeric_limits<double>::max())
342  {
343  mean = Math::mean(begin, end);
344  }
345  for (IteratorType iter=begin; iter!=end; ++iter)
346  {
347  sum_value += *iter - mean;
348  }
349  return sum_value / std::distance(begin, end);
350  }
351 
361  template <typename IteratorType1, typename IteratorType2>
362  static double covariance(IteratorType1 begin_a, IteratorType1 end_a,
363  IteratorType2 begin_b, IteratorType2 end_b)
364  {
365  //no data or different lengths
366  checkIteratorsNotNULL(begin_a, end_a);
367 
368  double sum_value = 0.0;
369  double mean_a = Math::mean(begin_a, end_a);
370  double mean_b = Math::mean(begin_b, end_b);
371  IteratorType1 iter_a = begin_a;
372  IteratorType2 iter_b = begin_b;
373  for (; iter_a != end_a; ++iter_a, ++iter_b)
374  {
375  /* assure both ranges have the same number of elements */
376  checkIteratorsAreValid(begin_b, end_b, begin_a, end_a);
377  sum_value += (*iter_a - mean_a) * (*iter_b - mean_b);
378  }
379  /* assure both ranges have the same number of elements */
380  checkIteratorsEqual(iter_b, end_b);
381  Size n = std::distance(begin_a, end_a);
382  return sum_value / (n-1);
383  }
384 
394  template <typename IteratorType1, typename IteratorType2>
395  static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a,
396  IteratorType2 begin_b, IteratorType2 end_b)
397  {
398  //no data or different lengths
399  checkIteratorsNotNULL(begin_a, end_a);
400 
401  SignedSize dist = std::distance(begin_a, end_a);
402  double error = 0;
403  IteratorType1 iter_a = begin_a;
404  IteratorType2 iter_b = begin_b;
405  for (; iter_a != end_a; ++iter_a, ++iter_b)
406  {
407  /* assure both ranges have the same number of elements */
408  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
409 
410  double tmp(*iter_a - *iter_b);
411  error += tmp * tmp;
412  }
413  /* assure both ranges have the same number of elements */
414  checkIteratorsEqual(iter_b, end_b);
415 
416  return error / dist;
417  }
418 
428  template <typename IteratorType1, typename IteratorType2>
429  static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a,
430  IteratorType2 begin_b, IteratorType2 end_b)
431  {
432  //no data or different lengths
433  checkIteratorsNotNULL(begin_a, end_a);
434 
435  SignedSize dist = std::distance(begin_a, end_a);
436  SignedSize correct = dist;
437  IteratorType1 iter_a = begin_a;
438  IteratorType2 iter_b = begin_b;
439  for (; iter_a != end_a; ++iter_a, ++iter_b)
440  {
441  /* assure both ranges have the same number of elements */
442  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
443  if ((*iter_a < 0 && *iter_b >= 0) || (*iter_a >= 0 && *iter_b < 0))
444  {
445  --correct;
446  }
447 
448  }
449  /* assure both ranges have the same number of elements */
450  checkIteratorsEqual(iter_b, end_b);
451 
452  return double(correct) / dist;
453  }
454 
467  template <typename IteratorType1, typename IteratorType2>
469  IteratorType1 begin_a, IteratorType1 end_a,
470  IteratorType2 begin_b, IteratorType2 end_b)
471  {
472  //no data or different lengths
473  checkIteratorsNotNULL(begin_a, end_b);
474 
475  double tp = 0;
476  double fp = 0;
477  double tn = 0;
478  double fn = 0;
479  IteratorType1 iter_a = begin_a;
480  IteratorType2 iter_b = begin_b;
481  for (; iter_a != end_a; ++iter_a, ++iter_b)
482  {
483  /* assure both ranges have the same number of elements */
484  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
485 
486  if (*iter_a < 0 && *iter_b >= 0)
487  {
488  ++fn;
489  }
490  else if (*iter_a < 0 && *iter_b < 0)
491  {
492  ++tn;
493  }
494  else if (*iter_a >= 0 && *iter_b >= 0)
495  {
496  ++tp;
497  }
498  else if (*iter_a >= 0 && *iter_b < 0)
499  {
500  ++fp;
501  }
502  }
503  /* assure both ranges have the same number of elements */
504  checkIteratorsEqual(iter_b, end_b);
505 
506  return (tp * tn - fp * fn) / std::sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
507  }
508 
520  template <typename IteratorType1, typename IteratorType2>
522  IteratorType1 begin_a, IteratorType1 end_a,
523  IteratorType2 begin_b, IteratorType2 end_b)
524  {
525  //no data or different lengths
526  checkIteratorsNotNULL(begin_a, end_a);
527 
528  //calculate average
529  SignedSize dist = std::distance(begin_a, end_a);
530  double avg_a = std::accumulate(begin_a, end_a, 0.0) / dist;
531  double avg_b = std::accumulate(begin_b, end_b, 0.0) / dist;
532 
533  double numerator = 0;
534  double denominator_a = 0;
535  double denominator_b = 0;
536  IteratorType1 iter_a = begin_a;
537  IteratorType2 iter_b = begin_b;
538  for (; iter_a != end_a; ++iter_a, ++iter_b)
539  {
540  /* assure both ranges have the same number of elements */
541  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
542  double temp_a = *iter_a - avg_a;
543  double temp_b = *iter_b - avg_b;
544  numerator += (temp_a * temp_b);
545  denominator_a += (temp_a * temp_a);
546  denominator_b += (temp_b * temp_b);
547  }
548  /* assure both ranges have the same number of elements */
549  checkIteratorsEqual(iter_b, end_b);
550  return numerator / std::sqrt(denominator_a * denominator_b);
551  }
552 
554  template <typename Value>
555  static void computeRank(std::vector<Value> & w)
556  {
557  Size i = 0; // main index
558  Size z = 0; // "secondary" index
559  Value rank = 0;
560  Size n = (w.size() - 1);
561  //store original indices for later
562  std::vector<std::pair<Size, Value> > w_idx;
563  for (Size j = 0; j < w.size(); ++j)
564  {
565  w_idx.push_back(std::make_pair(j, w[j]));
566  }
567  //sort
568  std::sort(w_idx.begin(), w_idx.end(),
569  [](const auto& pair1, const auto& pair2) { return pair1.second < pair2.second; });
570  //replace pairs <orig_index, value> in w_idx by pairs <orig_index, rank>
571  while (i < n)
572  {
573  // test for equality with tolerance:
574  if (fabs(w_idx[i + 1].second - w_idx[i].second) > 0.0000001 * fabs(w_idx[i + 1].second)) // no tie
575  {
576  w_idx[i].second = Value(i + 1);
577  ++i;
578  }
579  else // tie, replace by mean rank
580  {
581  // count number of ties
582  for (z = i + 1; (z <= n) && fabs(w_idx[z].second - w_idx[i].second) <= 0.0000001 * fabs(w_idx[z].second); ++z)
583  {
584  }
585  // compute mean rank of tie
586  rank = 0.5 * (i + z + 1);
587  // replace intensities by rank
588  for (Size v = i; v <= z - 1; ++v)
589  {
590  w_idx[v].second = rank;
591  }
592  i = z;
593  }
594  }
595  if (i == n)
596  w_idx[n].second = Value(n + 1);
597  //restore original order and replace elements of w with their ranks
598  for (Size j = 0; j < w.size(); ++j)
599  {
600  w[w_idx[j].first] = w_idx[j].second;
601  }
602  }
603 
615  template <typename IteratorType1, typename IteratorType2>
617  IteratorType1 begin_a, IteratorType1 end_a,
618  IteratorType2 begin_b, IteratorType2 end_b)
619  {
620  //no data or different lengths
621  checkIteratorsNotNULL(begin_a, end_a);
622 
623  // store and sort intensities of model and data
624  SignedSize dist = std::distance(begin_a, end_a);
625  std::vector<double> ranks_data;
626  ranks_data.reserve(dist);
627  std::vector<double> ranks_model;
628  ranks_model.reserve(dist);
629  IteratorType1 iter_a = begin_a;
630  IteratorType2 iter_b = begin_b;
631  for (; iter_a != end_a; ++iter_a, ++iter_b)
632  {
633  /* assure both ranges have the same number of elements */
634  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
635 
636  ranks_model.push_back(*iter_a);
637  ranks_data.push_back(*iter_b);
638  }
639  /* assure both ranges have the same number of elements */
640  checkIteratorsEqual(iter_b, end_b);
641 
642  // replace entries by their ranks
643  computeRank(ranks_data);
644  computeRank(ranks_model);
645 
646  double mu = double(ranks_data.size() + 1) / 2.; // mean of ranks
647  // Was the following, but I think the above is more correct ... (Clemens)
648  // double mu = (ranks_data.size() + 1) / 2;
649 
650  double sum_model_data = 0;
651  double sqsum_data = 0;
652  double sqsum_model = 0;
653 
654  for (Int i = 0; i < dist; ++i)
655  {
656  sum_model_data += (ranks_data[i] - mu) * (ranks_model[i] - mu);
657  sqsum_data += (ranks_data[i] - mu) * (ranks_data[i] - mu);
658  sqsum_model += (ranks_model[i] - mu) * (ranks_model[i] - mu);
659  }
660 
661  // check for division by zero
662  if (!sqsum_data || !sqsum_model)
663  {
664  return 0;
665  }
666 
667  return sum_model_data / (std::sqrt(sqsum_data) * std::sqrt(sqsum_model));
668  }
669 
671  template<typename T>
673  {
674  SummaryStatistics() = default;
675 
676  // Ctor with data
678  {
679  count = data.size();
680  // Sanity check: avoid core dump if no data points present.
681  if (data.empty())
682  {
683  mean = variance = min = lowerq = median = upperq = max = 0.0;
684  }
685  else
686  {
687  sort(data.begin(), data.end());
688  mean = Math::mean(data.begin(), data.end());
689  variance = Math::variance(data.begin(), data.end(), mean);
690  min = data.front();
691  lowerq = Math::quantile1st(data.begin(), data.end(), true);
692  median = Math::median(data.begin(), data.end(), true);
693  upperq = Math::quantile3rd(data.begin(), data.end(), true);
694  max = data.back();
695  }
696  }
697 
698  double mean = 0, variance = 0 , lowerq = 0, median = 0, upperq = 0;
699  typename T::value_type min = 0, max = 0;
700  size_t count = 0;
701  };
702 
703  } // namespace Math
704 } // namespace OpenMS
705 
Invalid range exception.
Definition: Exception.h:278
int Int
Signed integer type.
Definition: Types.h:102
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the classification rate for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:429
static double median(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:138
static double mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:120
static double covariance(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the covariance of two ranges of values.
Definition: StatisticFunctions.h:362
static double quantile3rd(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the third quantile of a range of values.
Definition: StatisticFunctions.h:271
static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
Helper function checking if two iterators are not equal.
Definition: StatisticFunctions.h:59
static double matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Matthews correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:468
double MeanAbsoluteDeviation(IteratorType begin, IteratorType end, double mean_of_numbers)
mean absolute deviation (MeanAbsoluteDeviation)
Definition: StatisticFunctions.h:215
static double sum(IteratorType begin, IteratorType end)
Calculates the sum of a range of values.
Definition: StatisticFunctions.h:107
static double absdev(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the absolute deviation of a range of values.
Definition: StatisticFunctions.h:336
static double pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Pearson correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:521
static double sd(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the standard deviation of a range of values.
Definition: StatisticFunctions.h:321
double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
median absolute deviation (MAD)
Definition: StatisticFunctions.h:185
static double rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
calculates the rank correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:616
static void checkIteratorsAreValid(IteratorType1 begin_b, IteratorType1 end_b, IteratorType2 begin_a, IteratorType2 end_a)
Helper function checking if an iterator and a co-iterator both have a next element.
Definition: StatisticFunctions.h:91
static double quantile1st(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the first quantile of a range of values.
Definition: StatisticFunctions.h:239
static void checkIteratorsEqual(IteratorType begin, IteratorType end)
Helper function checking if two iterators are equal.
Definition: StatisticFunctions.h:75
static double variance(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the variance of a range of values.
Definition: StatisticFunctions.h:294
static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the mean square error for the values in [begin_a, end_a) and [begin_b, end_b)
Definition: StatisticFunctions.h:395
static void computeRank(std::vector< Value > &w)
Replaces the elements in vector w by their ranks.
Definition: StatisticFunctions.h:555
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48
Helper class to gather (and dump) some statistics from a e.g. vector<double>.
Definition: StatisticFunctions.h:673
double lowerq
Definition: StatisticFunctions.h:698
double variance
Definition: StatisticFunctions.h:698
T::value_type max
Definition: StatisticFunctions.h:699
SummaryStatistics(T &data)
Definition: StatisticFunctions.h:677
double median
Definition: StatisticFunctions.h:698
size_t count
Definition: StatisticFunctions.h:700
double mean
Definition: StatisticFunctions.h:698
double upperq
Definition: StatisticFunctions.h:698
T::value_type min
Definition: StatisticFunctions.h:699