OpenMS
DBSuitability.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Tom Waschischeck $
32 // $Authors: Tom Waschischeck $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 #include <OpenMS/CONCEPT/Types.h>
41 
42 #include <cfloat>
43 #include <vector>
44 
45 #include <boost/regex.hpp>
46 
47 namespace OpenMS
48 {
49  class ParamXMLFile;
50  class PeptideIdentification;
51  class PeptideHit;
52  class MSExperiment;
53 
71  class OPENMS_DLLAPI DBSuitability:
72  public DefaultParamHandler
73  {
    // This class holds the functionality of calculating the database suitability.
    // It derives publicly from DefaultParamHandler (the base class for classes
    // handling default parameters).
    // NOTE(review): gaps in the listing numbers (e.g. 154 -> 159 -> 161) mean some
    // original lines, including the constructor declaration, are not shown here.
74  public:
    // Struct to store the results of one suitability calculation.
    // All public counters and scores start at 0, except cut_off (see below).
76  struct OPENMS_DLLAPI SuitabilityData
77  {
    // presumably the number of top hits that are de novo hits -- TODO confirm
79  Size num_top_novo = 0;
80 
    // presumably the number of top hits that are database hits -- TODO confirm
82  Size num_top_db = 0;
83 
85  Size num_interest = 0;
86 
    // presumably the number of hits that were re-ranked -- TODO confirm
89  Size num_re_ranked = 0;
90 
    // Initialised to the largest finite double (DBL_MAX from <cfloat>).
93  double cut_off = DBL_MAX;
94 
104  double suitability = 0;
105 
    // NOTE(review): by name, the suitability value without re-ranking -- confirm.
108  double suitability_no_rerank = 0;
109 
111  double suitability_corr_no_rerank = 0;
112 
113  // resets all members to their defaults
114  void clear();
115 
    // Setter/getter pair for the correction factor.
    // NOTE(review): presumably backed by the private corr_factor member -- confirm.
118  void setCorrectionFactor(double factor);
119 
120  double getCorrectionFactor() const;
121 
    // NOTE(review): presumably return the private num_top_novo_corr and
    // suitability_corr members respectively -- confirm against the implementation.
122  double getCorrectedNovoHits() const;
123 
124  double getCorrectedSuitability() const;
125 
135 
136  private:
    // Starts at -1, unlike the other members.
    // NOTE(review): -1 presumably marks "no correction factor set yet" -- confirm.
142  double corr_factor = -1;
143 
145  double num_top_novo_corr = 0;
146 
152  double suitability_corr = 0;
153  };
154 
159 
    // Destructor (defaulted; overrides the base class destructor).
161  ~DBSuitability() override = default;
162 
    // Grants DBSuitability_friend access to the private member functions below,
    // so that they can be tested.
164  friend class DBSuitability_friend;
165 
    // Computes suitability of a database used to search a mzML.
    // pep_ids is taken by rvalue reference; all other arguments are read-only.
229  void compute(std::vector<PeptideIdentification>&& pep_ids, const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& original_fasta, const std::vector<FASTAFile::FASTAEntry>& novo_fasta, const ProteinIdentification::SearchParameters& search_params);
230 
    // Returns results calculated by this metric.
240  const std::vector<SuitabilityData>& getResults() const;
241 
242  private:
    // result vector
244  std::vector<SuitabilityData> results_;
245 
    // pattern for finding a decoy string
247  const boost::regex decoy_pattern_;
248 
    // Calculates the xcorr difference between the top two hits marked as decoy.
263  double getDecoyDiff_(const PeptideIdentification& pep_id) const;
264 
    // Calculates a xcorr cut-off based on decoy hits.
279  double getDecoyCutOff_(const std::vector<PeptideIdentification>& pep_ids, double reranking_cutoff_percentile) const;
280 
    // Tests if a PeptideHit is considered a deNovo hit.
294  bool isNovoHit_(const PeptideHit& hit) const;
295 
    // Tests if a PeptideHit has a score better than the given threshold.
    // higher_score_better selects the direction of "better".
304  bool checkScoreBetterThanThreshold_(const PeptideHit& hit, double threshold, bool higher_score_better) const;
305 
    // Looks through meta values of SearchParameters to find out which search adapter was used.
    // NOTE(review): the returned pair is presumably (adapter name, adapter parameters) -- confirm.
316  std::pair<String, Param> extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters& meta_values) const;
317 
    // Writes parameters into a given file.
325  void writeIniFile_(const Param& parameters, const String& filename) const;
326 
    // Executes the workflow from search adapter, followed by PeptideIndexer and finishes with FDR.
    // Note that parameters is passed by non-const reference.
349  std::vector<PeptideIdentification> runIdentificationSearch_(const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& fasta_data, const String& adapter_name, Param& parameters) const;
350 
    // Creates a subsampled fasta with the given subsampling rate.
361  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta_(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate) const;
362 
    // Calculates all suitability data from a combined deNovo+database search.
    // The outcome is written into data (passed by non-const reference).
379  void calculateSuitability_(const std::vector<PeptideIdentification>& pep_ids, SuitabilityData& data) const;
380 
    // Calculates and appends decoys to a given vector of FASTAEntry (modified in place).
390  void appendDecoys_(std::vector<FASTAFile::FASTAEntry>& fasta) const;
391 
    // Returns the cross correlation score normalized by MW (if existing).
    // NOTE(review): the fallback behaviour when that score is missing is not visible here -- confirm.
399  double extractScore_(const PeptideHit& pep_hit) const;
400 
    // Calculates the correction factor from two suitability calculations.
414  double calculateCorrectionFactor_(const SuitabilityData& data, const SuitabilityData& data_sampled, double sampling_rate) const;
415 
    // Determines the number of unique proteins found in the protein accessions of PeptideIdentifications.
    // number_of_hits defaults to 1.
424  UInt numberOfUniqueProteins_(const std::vector<PeptideIdentification>& peps, UInt number_of_hits = 1) const;
425 
    // Finds the SuitabilityData object with the median number of de novo hits
    // and returns its index.
434  Size getIndexWithMedianNovoHits_(const std::vector<SuitabilityData>& data) const;
435 
    // Extracts the worst score that still passes a FDR (q-value) threshold.
452  double getScoreMatchingFDR_(const std::vector<PeptideIdentification>& pep_ids, double FDR, const String& score_name, bool higher_score_better) const;
453  };
454 
455  // friend class to test private member functions
    // Each public method below forwards its arguments unchanged to the private
    // DBSuitability member function of the same name plus a trailing underscore,
    // called on the suit_ member, and returns that function's result.
    // NOTE(review): the class-head line (listing line 456) and the declaration of
    // the DBSuitability member suit_ (listing line 505) are missing from this listing.
457  {
458  public:
459  DBSuitability_friend() = default;
460 
462 
    // Forwards to DBSuitability::getSubsampledFasta_.
463  std::vector<FASTAFile::FASTAEntry> getSubsampledFasta(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate)
464  {
465  return suit_.getSubsampledFasta_(fasta_data, subsampling_rate);
466  }
467 
    // Forwards to DBSuitability::appendDecoys_; fasta is modified in place.
468  void appendDecoys(std::vector<FASTAFile::FASTAEntry>& fasta)
469  {
470  suit_.appendDecoys_(fasta);
471  }
472 
    // Forwards to DBSuitability::calculateCorrectionFactor_.
473  double calculateCorrectionFactor(const DBSuitability::SuitabilityData& data, const DBSuitability::SuitabilityData& data_sampled, double sampling_rate)
474  {
475  return suit_.calculateCorrectionFactor_(data, data_sampled, sampling_rate);
476  }
477 
    // Forwards to DBSuitability::numberOfUniqueProteins_; number_of_hits defaults to 1.
478  UInt numberOfUniqueProteins(const std::vector<PeptideIdentification>& peps, UInt number_of_hits = 1)
479  {
480  return suit_.numberOfUniqueProteins_(peps, number_of_hits);
481  }
482 
    // Forwards to DBSuitability::getIndexWithMedianNovoHits_.
483  Size getIndexWithMedianNovoHits(const std::vector<DBSuitability::SuitabilityData>& data)
484  {
485  return suit_.getIndexWithMedianNovoHits_(data);
486  }
487 
    // Forwards to DBSuitability::getScoreMatchingFDR_.
    // NOTE(review): score_name is taken by value here, whereas the wrapped
    // function takes it as const String&.
488  double getScoreMatchingFDR(const std::vector<PeptideIdentification>& pep_ids, double FDR, String score_name, bool higher_score_better)
489  {
490  return suit_.getScoreMatchingFDR_(pep_ids, FDR, score_name, higher_score_better);
491  }
492 
493  /* Not tested:
494  getDecoyDiff_, getDecoyCutOff_, isNovoHit_, checkScoreBetterThanThreshold_
495  Reason: These functions are essential to the normal suitability calculation, and if something did not work, the test for 'compute' would fail.
496 
497  extractSearchAdapterInfoFromMetaValues_, writeIniFile_, extractScore_
498  Reason: These functions are very straightforward.
499 
500  runIdentificationSearch_
501  Reason: This function simulates a whole workflow and testing it would be too complicated.
502  */
503 
504  private:
506  };
507 }
508 
Definition: DBSuitability.h:457
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate)
Definition: DBSuitability.h:463
UInt numberOfUniqueProteins(const std::vector< PeptideIdentification > &peps, UInt number_of_hits=1)
Definition: DBSuitability.h:478
double calculateCorrectionFactor(const DBSuitability::SuitabilityData &data, const DBSuitability::SuitabilityData &data_sampled, double sampling_rate)
Definition: DBSuitability.h:473
DBSuitability suit_
Definition: DBSuitability.h:505
double getScoreMatchingFDR(const std::vector< PeptideIdentification > &pep_ids, double FDR, String score_name, bool higher_score_better)
Definition: DBSuitability.h:488
Size getIndexWithMedianNovoHits(const std::vector< DBSuitability::SuitabilityData > &data)
Definition: DBSuitability.h:483
void appendDecoys(std::vector< FASTAFile::FASTAEntry > &fasta)
Definition: DBSuitability.h:468
This class holds the functionality of calculating the database suitability.
Definition: DBSuitability.h:73
void compute(std::vector< PeptideIdentification > &&pep_ids, const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &original_fasta, const std::vector< FASTAFile::FASTAEntry > &novo_fasta, const ProteinIdentification::SearchParameters &search_params)
Computes suitability of a database used to search a mzML.
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta_(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate) const
Creates a subsampled fasta with the given subsampling rate.
bool checkScoreBetterThanThreshold_(const PeptideHit &hit, double threshold, bool higher_score_better) const
Tests if a PeptideHit has a score better than the given threshold.
Size getIndexWithMedianNovoHits_(const std::vector< SuitabilityData > &data) const
Finds the SuitabilityData object with the median number of de novo hits.
std::vector< PeptideIdentification > runIdentificationSearch_(const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &fasta_data, const String &adapter_name, Param &parameters) const
Executes the workflow from search adapter, followed by PeptideIndexer and finishes with FDR.
void writeIniFile_(const Param &parameters, const String &filename) const
Writes parameters into a given file.
std::pair< String, Param > extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters &meta_values) const
Looks through meta values of SearchParameters to find out which search adapter was used.
UInt numberOfUniqueProteins_(const std::vector< PeptideIdentification > &peps, UInt number_of_hits=1) const
Determines the number of unique proteins found in the protein accessions of PeptideIdentifications.
~DBSuitability() override=default
Destructor.
double getDecoyCutOff_(const std::vector< PeptideIdentification > &pep_ids, double reranking_cutoff_percentile) const
Calculates a xcorr cut-off based on decoy hits.
double calculateCorrectionFactor_(const SuitabilityData &data, const SuitabilityData &data_sampled, double sampling_rate) const
Calculates the correction factor from two suitability calculations.
const std::vector< SuitabilityData > & getResults() const
Returns results calculated by this metric.
std::vector< SuitabilityData > results_
result vector
Definition: DBSuitability.h:244
void appendDecoys_(std::vector< FASTAFile::FASTAEntry > &fasta) const
Calculates and appends decoys to a given vector of FASTAEntry.
void calculateSuitability_(const std::vector< PeptideIdentification > &pep_ids, SuitabilityData &data) const
Calculates all suitability data from a combined deNovo+database search.
double getScoreMatchingFDR_(const std::vector< PeptideIdentification > &pep_ids, double FDR, const String &score_name, bool higher_score_better) const
Extracts the worst score that still passes a FDR (q-value) threshold.
bool isNovoHit_(const PeptideHit &hit) const
Tests if a PeptideHit is considered a deNovo hit.
const boost::regex decoy_pattern_
pattern for finding a decoy string
Definition: DBSuitability.h:247
double extractScore_(const PeptideHit &pep_hit) const
Returns the cross correlation score normalized by MW (if existing), else if the 'force' flag is set t...
double getDecoyDiff_(const PeptideIdentification &pep_id) const
Calculates the xcorr difference between the top two hits marked as decoy.
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:72
Management and storage of parameters / INI files.
Definition: Param.h:70
Representation of a peptide hit.
Definition: PeptideHit.h:57
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:65
A more convenient string class.
Definition: String.h:60
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48
struct to store results
Definition: DBSuitability.h:77
SuitabilityData simulateNoReRanking() const
Returns a SuitabilityData object containing the data if re-ranking didn't happen.
Search parameters of the DB search.
Definition: ProteinIdentification.h:273