OpenMS
IdentificationDataConverter.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Hendrik Weisser $
32 // $Authors: Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
39 #include <OpenMS/FORMAT/MzTab.h>
42 
43 namespace OpenMS
44 {
45  class FeatureMap;
46 
47  class OPENMS_DLLAPI IdentificationDataConverter
48  {
49  public:
50 
52  static void importIDs(IdentificationData& id_data,
53  const std::vector<ProteinIdentification>& proteins,
54  const std::vector<PeptideIdentification>& peptides);
55 
61  static void exportIDs(const IdentificationData& id_data,
62  std::vector<ProteinIdentification>& proteins,
63  std::vector<PeptideIdentification>& peptides,
64  bool export_ids_wo_scores = false);
65 
67  static MzTab exportMzTab(const IdentificationData& id_data);
68 
70  static void importSequences(IdentificationData& id_data,
71  const std::vector<FASTAFile::FASTAEntry>& fasta,
74  const String& decoy_pattern = "");
75 
77  static void exportParentMatches(
78  const IdentificationData::ParentMatches& parent_matches, PeptideHit& hit);
79 
86  static void importFeatureIDs(FeatureMap& features, bool clear_original = true);
87 
94  static void exportFeatureIDs(FeatureMap& features, bool clear_original = true);
95 
102  static void importConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
103 
110  static void exportConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
111 
112  protected:
113 
114  using StepOpt = std::optional<IdentificationData::ProcessingStepRef>;
115 
118  {
119  bool operator()(const StepOpt& left, const StepOpt& right) const
120  {
121  // @TODO: should runs without associated step go first or last?
122  if (!left) return bool(right);
123  if (!right) return false;
124  return **left < **right;
125  }
126  };
127 
130  {
132  const PeptideIdentification& right) const
133  {
134  // @TODO: should IDs without RT go first or last?
135  if (left.hasRT())
136  {
137  if (right.hasRT())
138  {
139  if (right.getRT() != left.getRT())
140  {
141  return left.getRT() < right.getRT();
142  } // else: compare by m/z (below)
143  }
144  else
145  {
146  return false;
147  }
148  }
149  else if (right.hasRT())
150  {
151  return true;
152  }
153  // no RTs or same RTs -> try to compare by m/z:
154  if (left.hasMZ())
155  {
156  if (right.hasMZ())
157  {
158  return left.getMZ() < right.getMZ();
159  }
160  else
161  {
162  return false;
163  }
164  }
165  // if both PI's have nothing, return false (to ensure 'x < x' is false for strict weak ordering)
166  return right.hasMZ();
167  }
168  };
169 
171  template <typename MzTabSectionRow>
174  std::vector<MzTabSectionRow>& output,
175  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
176  {
177  MzTabSectionRow row;
178  row.accession.set(parent.accession);
179  exportStepsAndScoresToMzTab_(parent.steps_and_scores, row.search_engine,
180  row.best_search_engine_score, score_map);
181  row.description.set(parent.description);
182  row.coverage.set(parent.coverage);
183  if (!parent.sequence.empty())
184  {
185  MzTabOptionalColumnEntry opt_seq;
186  opt_seq.first = "opt_sequence";
187  opt_seq.second.set(parent.sequence);
188  row.opt_.push_back(opt_seq);
189  }
190  output.push_back(row);
191  }
192 
194  template <typename MzTabSectionRow, typename IdentSeq>
196  const IdentSeq& identified, std::vector<MzTabSectionRow>& output,
197  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
198  {
199  MzTabSectionRow row;
200  // @TODO: handle modifications properly
201  row.sequence.set(identified.sequence.toString());
202  exportStepsAndScoresToMzTab_(identified.steps_and_scores,
203  row.search_engine,
204  row.best_search_engine_score, score_map);
205  if (identified.parent_matches.empty()) // no parent information given
206  {
207  // row.unique.set(false); // leave this unset?
208  output.push_back(row);
209  }
210  else // generate entries (with duplicated data) for every accession
211  {
212  // in mzTab, "unique" means "peptide is unique for this protein"
213  row.unique.set(identified.parent_matches.size() == 1);
214  for (const auto& match_pair : identified.parent_matches)
215  {
216  row.accession.set(match_pair.first->accession);
217  for (const IdentificationData::ParentMatch& match :
218  match_pair.second)
219  {
220  MzTabSectionRow copy = row;
221  addMzTabMoleculeParentContext_(match, copy);
222  output.push_back(copy);
223  }
224  }
225  }
226  }
227 
229  template <typename MzTabSectionRow>
231  const String& sequence,
232  const IdentificationData::ObservationMatch& match, double calc_mass,
233  std::vector<MzTabSectionRow>& output,
234  std::map<IdentificationData::ScoreTypeRef, Size>& score_map,
235  std::map<IdentificationData::InputFileRef, Size>& file_map)
236  {
237  MzTabSectionRow xsm; // PSM or OSM
238  // @TODO: handle modifications properly
239  xsm.sequence.set(sequence);
240  exportStepsAndScoresToMzTab_(match.steps_and_scores, xsm.search_engine,
241  xsm.search_engine_score, score_map);
242  const IdentificationData::Observation& query = *match.observation_ref;
243  std::vector<MzTabDouble> rts(1);
244  rts[0].set(query.rt);
245  xsm.retention_time.set(rts);
246  xsm.charge.set(match.charge);
247  xsm.exp_mass_to_charge.set(query.mz);
248  xsm.calc_mass_to_charge.set(calc_mass / abs(match.charge));
249  xsm.spectra_ref.setMSFile(file_map[query.input_file]);
250  xsm.spectra_ref.setSpecRef(query.data_id);
251  // optional column for adduct:
252  if (match.adduct_opt)
253  {
254  MzTabOptionalColumnEntry opt_adduct;
255  opt_adduct.first = "opt_adduct";
256  opt_adduct.second.set((*match.adduct_opt)->getName());
257  xsm.opt_.push_back(opt_adduct);
258  }
259  // optional columns for isotope offset:
260  // @TODO: find a way of passing in the names of relevant meta values
261  // (e.g. from NucleicAcidSearchEngine), instead of hard-coding them here
262  if (match.metaValueExists("isotope_offset"))
263  {
264  MzTabOptionalColumnEntry opt_meta;
265  opt_meta.first = "opt_isotope_offset";
266  opt_meta.second.set(match.getMetaValue("isotope_offset"));
267  xsm.opt_.push_back(opt_meta);
268  }
269  // don't repeat data from the peptide section (e.g. accessions)
270  // why are "pre"/"post"/"start"/"end" not in the peptide section?!
271  output.push_back(xsm);
272  }
273 
276  const IdentificationData::AppliedProcessingSteps& steps_and_scores,
277  MzTabParameterList& steps_out, std::map<Size, MzTabDouble>& scores_out,
278  std::map<IdentificationData::ScoreTypeRef, Size>& score_map);
279 
281  static void addMzTabSEScores_(
282  const std::map<IdentificationData::ScoreTypeRef, Size>& scores,
283  std::map<Size, MzTabParameter>& output);
284 
287  const IdentificationData::ParentMatch& match,
289 
292  const IdentificationData::ParentMatch& match,
294 
298  IdentificationData& id_data);
299 
303 
307  ProteinIdentification& protein);
308 
309  static void handleFeatureImport_(Feature& feature, const IntList& indexes,
310  std::vector<PeptideIdentification>& peptides,
311  Size& id_counter, bool clear_original);
312 
313  static void handleFeatureExport_(Feature& feature, const IntList& indexes,
314  IdentificationData& id_data, Size& id_counter);
315  };
316 }
A container for consensus elements.
Definition: ConsensusMap.h:92
A container for features.
Definition: FeatureMap.h:106
An LC-MS feature.
Definition: Feature.h:72
Definition: IdentificationDataConverter.h:48
static void addMzTabSEScores_(const std::map< IdentificationData::ScoreTypeRef, Size > &scores, std::map< Size, MzTabParameter > &output)
Helper function to add search engine score entries to MzTab's meta data section.
static void exportFeatureIDs(FeatureMap &features, bool clear_original=true)
Convert IDs in a feature map to legacy peptide/protein identifications.
static void importSequences(IdentificationData &id_data, const std::vector< FASTAFile::FASTAEntry > &fasta, IdentificationData::MoleculeType type=IdentificationData::MoleculeType::PROTEIN, const String &decoy_pattern="")
Import FASTA sequences as parent sequences.
static void exportObservationMatchToMzTab_(const String &sequence, const IdentificationData::ObservationMatch &match, double calc_mass, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map, std::map< IdentificationData::InputFileRef, Size > &file_map)
Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
Definition: IdentificationDataConverter.h:230
static void exportIDs(const IdentificationData &id_data, std::vector< ProteinIdentification > &proteins, std::vector< PeptideIdentification > &peptides, bool export_ids_wo_scores=false)
Export to legacy peptide/protein identifications.
static void importIDs(IdentificationData &id_data, const std::vector< ProteinIdentification > &proteins, const std::vector< PeptideIdentification > &peptides)
Import from legacy peptide/protein identifications.
static IdentificationData::SearchParamRef importDBSearchParameters_(const ProteinIdentification::SearchParameters &pisp, IdentificationData &id_data)
Helper function to import DB search parameters from legacy format.
static void addMzTabMoleculeParentContext_(const IdentificationData::ParentMatch &match, MzTabOligonucleotideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - oligonucleotide variant.
static ProteinIdentification::SearchParameters exportDBSearchParameters_(IdentificationData::SearchParamRef ref)
Helper function to export DB search parameters to legacy format.
static void exportPeptideOrOligoToMzTab_(const IdentSeq &identified, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export an identified sequence (peptide or oligonucleotide, but not small molecule/compound) to mzTab.
Definition: IdentificationDataConverter.h:195
static void handleFeatureImport_(Feature &feature, const IntList &indexes, std::vector< PeptideIdentification > &peptides, Size &id_counter, bool clear_original)
static MzTab exportMzTab(const IdentificationData &id_data)
Export to mzTab format.
static void handleFeatureExport_(Feature &feature, const IntList &indexes, IdentificationData &id_data, Size &id_counter)
std::optional< IdentificationData::ProcessingStepRef > StepOpt
Definition: IdentificationDataConverter.h:114
static void exportParentSequenceToMzTab_(const IdentificationData::ParentSequence &parent, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export a parent sequence (protein or nucleic acid) to mzTab.
Definition: IdentificationDataConverter.h:172
static void exportStepsAndScoresToMzTab_(const IdentificationData::AppliedProcessingSteps &steps_and_scores, MzTabParameterList &steps_out, std::map< Size, MzTabDouble > &scores_out, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Helper function to add processing steps (search engines) and their scores to MzTab.
static void exportMSRunInformation_(IdentificationData::ProcessingStepRef step_ref, ProteinIdentification &protein)
Helper function to export (primary) MS run information to legacy format.
static void importFeatureIDs(FeatureMap &features, bool clear_original=true)
Convert IDs from legacy peptide/protein identifications in a feature map.
static void exportParentMatches(const IdentificationData::ParentMatches &parent_matches, PeptideHit &hit)
Convert parent matches to peptide evidences.
static void exportConsensusIDs(ConsensusMap &consensus, bool clear_original=true)
Convert IDs in a consensus map to legacy peptide/protein identifications.
static void addMzTabMoleculeParentContext_(const IdentificationData::ParentMatch &match, MzTabPeptideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - peptide variant.
static void importConsensusIDs(ConsensusMap &consensus, bool clear_original=true)
Convert IDs from legacy peptide/protein identifications in a consensus map.
Definition: IdentificationData.h:113
IdentificationDataInternal::ParentMatches ParentMatches
Definition: IdentificationData.h:164
IdentificationDataInternal::AppliedProcessingSteps AppliedProcessingSteps
Definition: IdentificationData.h:153
bool metaValueExists(const String &name) const
Returns whether an entry with the given name exists.
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
Definition: MzTabBase.h:269
Data model of MzTab files. Please see the official MzTab specification at https://code....
Definition: MzTab.h:478
Representation of a peptide hit.
Definition: PeptideHit.h:57
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:65
double getRT() const
returns the RT of the MS2 spectrum where the identification occurred
bool hasMZ() const
shortcut for !isnan(getMZ())
bool hasRT() const
shortcut for !isnan(getRT())
double getMZ() const
returns the MZ of the MS2 spectrum
Representation of a protein identification run.
Definition: ProteinIdentification.h:76
A more convenient string class.
Definition: String.h:60
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
std::vector< Int > IntList
Vector of signed integers.
Definition: ListUtils.h:55
MoleculeType
Definition: MetaData.h:66
@ PROTEIN
Definition: MetaData.h:67
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48
std::pair< String, MzTabString > MzTabOptionalColumnEntry
Definition: MzTabBase.h:229
OLI - Oligonucleotide section (table-based)
Definition: MzTab.h:397
PEP - Peptide section (Table based)
Definition: MzTab.h:243
Functor for ordering peptide IDs by RT and m/z (if available)
Definition: IdentificationDataConverter.h:130
bool operator()(const PeptideIdentification &left, const PeptideIdentification &right) const
Definition: IdentificationDataConverter.h:131
Functor for ordering StepOpt (by date of the steps, if available)
Definition: IdentificationDataConverter.h:118
bool operator()(const StepOpt &left, const StepOpt &right) const
Definition: IdentificationDataConverter.h:119
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi...
Definition: MetaData.h:46
Representation of a search hit (e.g. peptide-spectrum match).
Definition: ObservationMatch.h:74
AdductOpt adduct_opt
optional reference to adduct
Definition: ObservationMatch.h:81
Int charge
Definition: ObservationMatch.h:79
ObservationRef observation_ref
Definition: ObservationMatch.h:77
Representation of an observation, e.g. a spectrum or feature, in an input data file.
Definition: Observation.h:54
double mz
Definition: Observation.h:61
InputFileRef input_file
Reference to the input file.
Definition: Observation.h:59
String data_id
Spectrum or feature ID (from the file referenced by input_file)
Definition: Observation.h:56
double rt
Definition: Observation.h:61
Meta data for the association between an identified molecule (e.g. peptide) and a parent sequence (e....
Definition: ParentMatch.h:46
Representation of a parent sequence that is identified only indirectly (e.g. a protein).
Definition: ParentSequence.h:50
String sequence
Definition: ParentSequence.h:57
String description
Definition: ParentSequence.h:59
double coverage
sequence coverage as a fraction between 0 and 1
Definition: ParentSequence.h:61
String accession
Definition: ParentSequence.h:51
AppliedProcessingSteps steps_and_scores
Definition: ScoredProcessingResult.h:46
Search parameters of the DB search.
Definition: ProteinIdentification.h:273