OpenMS
MzTabFile.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Timo Sachsenberg $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 #include <OpenMS/FORMAT/MzTab.h>
38 
43 
44 #include <vector>
45 
46 namespace OpenMS
47 {
48  class String; // forward declaration of the OpenMS string class
49  class SVOutStream; // forward declaration of the stream class for separated-values output (taken by the write*_ helpers below)
55  class OPENMS_DLLAPI MzTabFile
56  {
57  public:
62 
63  typedef std::map<std::pair<String, String>, std::vector<PeptideHit> > MapAccPepType; // maps a pair of Strings to a list of peptide hits; presumably keyed by (run identifier, protein accession), cf. the parameter name map_run_accession_to_pephits - TODO confirm
64 
65  // Store the MzTab data model 'mz_tab' in the file 'filename'.
66  void store(const String& filename, const MzTab& mz_tab) const;
67 
68  // Stream protein and peptide identifications (IDs) to the file 'filename'.
69  void store( // NOTE(review): not declared const, unlike the other two store() overloads - confirm this is intended
70  const String& filename, // file to write to
71  const std::vector<ProteinIdentification>& protein_identifications,
72  const std::vector<PeptideIdentification>& peptide_identifications,
73  bool first_run_inference_only, // no default: callers must always pass it
74  bool export_empty_pep_ids = false, // off by default; presumably also exports peptide IDs without hits - TODO confirm
75  bool export_all_psms = false, // off by default; presumably exports every PSM instead of a reduced set - TODO confirm
76  const String& title = "ID export from OpenMS"); // title text, defaults to "ID export from OpenMS"
77 
78  // Stream a ConsensusMap (container for consensus elements) to the file 'filename'.
79  void store(
80  const String& filename, // file to write to
81  const ConsensusMap& cmap, // the map to export
82  const bool first_run_inference_only, // the first four flags have no defaults: callers must always pass them
83  const bool export_unidentified_features,
84  const bool export_unassigned_ids,
85  const bool export_subfeatures,
86  const bool export_empty_pep_ids = false, // off by default
87  const bool export_all_psms = false) const; // off by default
88 
89  // Setters for the store behaviour of the optional "reliability" and "uri" columns (default=no)
92  void storePSMReliabilityColumn(bool store); // NOTE(review): listing numbers 90, 91 and 93 are absent here; the index also lists storeProteinReliabilityColumn, storePeptideReliabilityColumn and storeSmallMoleculeReliabilityColumn, which presumably belong in those gaps
94  void storeProteinUriColumn(bool store);
95  void storePeptideUriColumn(bool store);
96  void storePSMUriColumn(bool store);
97  void storeSmallMoleculeUriColumn(bool store);
98  void storeProteinGoTerms(bool store); // concerns GO terms rather than a reliability/uri column; presumably toggles their export for proteins - TODO confirm
99 
100  // Load the MzTab file 'filename' into 'mz_tab' (passed by non-const reference).
101  void load(const String& filename, MzTab& mz_tab);
102 
103  protected:
120 
122 
126  const Size n_best_search_engine_scores,
127  const std::vector<String>& optional_columns,
128  const MzTabMetaData& meta,
129  size_t& n_columns) const;
130 
131  String generateMzTabSectionRow_(const MzTabProteinSectionRow& row, const std::vector<String>& optional_columns, const MzTabMetaData& meta, size_t& n_columns) const; // returns one PRT (protein section) row as a String; generateMzTabSection_ compares the resulting n_columns with the header's column count
132 
133  String generateMzTabPeptideHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, Size assays, Size study_variables, const std::vector<String>& optional_columns, size_t& n_columns) const; // returns the header of the PEP (peptide) section; n_columns is a non-const reference, presumably set to the header's column count - TODO confirm. NOTE(review): parameter is spelled n_search_engine_score here but n_search_engine_scores in the PSM/OSM/NUC headers
134 
135  String generateMzTabSectionRow_(const MzTabPeptideSectionRow& row, const std::vector<String>& optional_columns, const MzTabMetaData& meta, size_t& n_columns) const; // returns one PEP row; generateMzTabSection_ compares the resulting n_columns with the header's column count
136 
137  String generateMzTabPSMHeader_(Size n_search_engine_scores, const std::vector<String>& optional_columns, size_t& n_columns) const; // returns the header of the PSM section; n_columns is a non-const reference, presumably set to the header's column count - TODO confirm
138 
139  String generateMzTabSectionRow_(const MzTabPSMSectionRow& row, const std::vector<String>& optional_columns, const MzTabMetaData& meta, size_t& n_columns) const; // returns one PSM row; generateMzTabSection_ compares the resulting n_columns with the header's column count
140 
141  String generateMzTabSmallMoleculeHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, Size assays, Size study_variables, const std::vector<String>& optional_columns, size_t& n_columns) const; // returns the header of the SML (small molecule) section; same parameter list as generateMzTabPeptideHeader_; n_columns presumably receives the header's column count - TODO confirm
142 
143  String generateMzTabSectionRow_(const MzTabSmallMoleculeSectionRow& row, const std::vector<String>& optional_columns, const MzTabMetaData& meta, size_t& n_columns) const; // returns one SML row; generateMzTabSection_ compares the resulting n_columns with the header's column count
144 
145  String generateMzTabNucleicAcidHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_scores, const std::vector<String>& optional_columns, size_t& n_columns) const; // returns the header of the NUC (nucleic acid) section; n_columns is a non-const reference, presumably set to the header's column count - TODO confirm
146 
147  String generateMzTabSectionRow_(const MzTabNucleicAcidSectionRow& row, const std::vector<String>& optional_columns, const MzTabMetaData& meta, size_t& n_columns) const; // returns one NUC row; generateMzTabSection_ compares the resulting n_columns with the header's column count
148 
149  String generateMzTabOligonucleotideHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, const std::vector<String>& optional_columns, size_t& n_columns) const; // returns the header of the OLI (oligonucleotide) section; n_columns presumably receives the header's column count - TODO confirm. NOTE(review): n_search_engine_score is singular here, plural in generateMzTabNucleicAcidHeader_
150 
151  String generateMzTabSectionRow_(const MzTabOligonucleotideSectionRow& row, const std::vector<String>& optional_columns, const MzTabMetaData& meta, size_t& n_columns) const; // returns one OLI row; generateMzTabSection_ compares the resulting n_columns with the header's column count
152 
153  String generateMzTabOSMHeader_(Size n_search_engine_scores, const std::vector<String>& optional_columns, size_t& n_columns) const; // returns the header of the OSM (oligonucleotide-spectrum match) section; n_columns presumably receives the header's column count - TODO confirm
154 
155  String generateMzTabSectionRow_(const MzTabOSMSectionRow& row, const std::vector<String>& optional_columns, const MzTabMetaData& meta, size_t& n_columns) const; // returns one OSM row; generateMzTabSection_ compares the resulting n_columns with the header's column count
156 
158  template <typename SectionRow> void generateMzTabSection_(const std::vector<SectionRow>& rows, const std::vector<String>& optional_columns, const MzTabMetaData& meta, StringList& output, size_t n_header_columns) const // Appends one generated line per row to 'output'; throws Exception::Postcondition as soon as a row's column count differs from n_header_columns.
159  {
160  output.reserve(rows.size() + output.size() + 1); // grow once up front: existing content, all rows, plus one spare entry
161  for (const SectionRow& current_row : rows)
162  {
163  size_t n_row_columns = 0; // handed to generateMzTabSectionRow_ by reference
164  output.push_back(generateMzTabSectionRow_(current_row, optional_columns, meta, n_row_columns)); // the row is appended before the check below
165  if (n_row_columns != n_header_columns) throw Exception::Postcondition(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Header and content differs in columns. Please report this bug to the OpenMS developers.");
166  }
167  }
168 
169  // auxiliary functions
170 
172  static void addOptionalColumnsToSectionRow_(const std::vector<String>& column_names, const std::vector<MzTabOptionalColumnEntry>& column_entries, StringList& output); // Helper function for the "generateMzTabSectionRow_" functions; 'output' is a non-const reference, presumably extended with the optional column values - TODO confirm
173 
174  // Extract the two bracketed integers from a string and return them as a pair (e.g. search_engine_score[1]_ms_run[2] -> 1,2)
175  static std::pair<int, int> extractIndexPairsFromBrackets_(const String& s);
176 
177  static void sortPSM_(std::vector<PeptideIdentification>::iterator begin, std::vector<PeptideIdentification>::iterator end); // operates on the PeptideIdentifications in [begin, end); presumably sorts the PSMs of each one - TODO confirm the ordering criterion
178 
179  static void keepFirstPSM_(std::vector<PeptideIdentification>::iterator begin, std::vector<PeptideIdentification>::iterator end); // operates on the PeptideIdentifications in [begin, end); presumably drops all but the first PSM of each one - TODO confirm
180 
182  static void partitionIntoRuns_(const std::vector<PeptideIdentification>& pep_ids, // Extract protein and peptide identifications for each run; both maps are assumed to be empty on entry.
183  const std::vector<ProteinIdentification>& pro_ids,
184  std::map<String, std::vector<PeptideIdentification> >& map_run_to_pepids, // output: peptide identifications per run (String key)
185  std::map<String, std::vector<ProteinIdentification> >& map_run_to_proids // output: protein identifications per run (String key)
186  );
187 
188 
190  static void createProteinToPeptideLinks_(const std::map<String, std::vector<PeptideIdentification> >& map_run_to_pepids, MapAccPepType& map_run_accession_to_pephits); // Create links from protein to peptides; the result goes into map_run_accession_to_pephits (non-const reference)
191 
193  static String extractProteinAccession_(const PeptideHit& peptide_hit); // Extracts, if possible, a unique protein accession for a peptide hit in mzTab format.
194 
196  static String extractPeptideModifications_(const PeptideHit& peptide_hit); // Extracts modifications and positions of a peptide hit in mzTab format.
197 
199  static String mapSearchEngineToCvParam_(const String& openms_search_engine_name); // Map search engine identifier to CV, param etc.
200 
201  static String mapSearchEngineScoreToCvParam_(const String& openms_search_engine_name, double score, String score_type); // presumably the score counterpart of mapSearchEngineToCvParam_ - TODO confirm. NOTE(review): score_type is taken by value (copied on every call) while the engine name is a const reference
202 
203  static String extractNumPeptides_(const String& common_identifier, const String& protein_accession, // presumably the number of peptides stored for (common_identifier, protein_accession), returned as a String - TODO confirm
204  const MapAccPepType& map_run_accession_to_peptides);
205 
206  // Number of distinct peptides, using the mzTab definition of "distinct"
207  static String extractNumPeptidesDistinct_(String common_identifier, String protein_accession, // NOTE(review): both Strings are passed by value here but by const reference in extractNumPeptides_
208  const MapAccPepType& map_run_accession_to_peptides);
209 
210  // Same as distinct, with the additional constraint of uniqueness (= the peptide maps to exactly one protein)
211  static String extractNumPeptidesUnambiguous_(String common_identifier, String protein_accession, // NOTE(review): both Strings are passed by value here but by const reference in extractNumPeptides_
212  const MapAccPepType& map_run_accession_to_peptides);
213 
214  static std::map<String, Size> extractNumberOfSubSamples_(const std::map<String, std::vector<ProteinIdentification> >& map_run_to_proids); // returns a String -> Size map; presumably the number of sub-samples per run - TODO confirm
215 
216  static void writePeptideHeader_(SVOutStream& output, std::map<String, Size> n_sub_samples); // presumably writes the peptide header to 'output' - TODO confirm. NOTE(review): the map is taken by value (copied on every call)
217 
218  static void writeProteinHeader_(SVOutStream& output, std::map<String, Size> n_sub_samples); // presumably writes the protein header to 'output' - TODO confirm. NOTE(review): the map is taken by value (copied on every call)
219 
220  static void writeProteinData_(SVOutStream& output, // presumably writes the protein rows of one identification run to 'output' - TODO confirm
221  const ProteinIdentification& prot_id, // the protein identification run to export
222  Size run_count,
223  String input_filename, // NOTE(review): String taken by value (copied on every call)
224  bool has_coverage,
225  const MapAccPepType& map_run_accession_to_peptides, // same map type as filled by createProteinToPeptideLinks_
226  const std::map<String, Size>& map_run_to_num_sub // same map type as returned by extractNumberOfSubSamples_
227  );
228 
229  private:
230  friend class MzTabMFile; // MzTabMFile (file adapter for MzTab-M files) may access the protected and private members of MzTabFile
231  };
232 
233 } // namespace OpenMS
234 
A container for consensus elements.
Definition: ConsensusMap.h:92
Postcondition failed exception.
Definition: Exception.h:173
File adapter for MzTab files.
Definition: MzTabFile.h:56
static void sortPSM_(std::vector< PeptideIdentification >::iterator begin, std::vector< PeptideIdentification >::iterator end)
String generateMzTabProteinHeader_(const MzTabProteinSectionRow &reference_row, const Size n_best_search_engine_scores, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
void storePSMUriColumn(bool store)
static void keepFirstPSM_(std::vector< PeptideIdentification >::iterator begin, std::vector< PeptideIdentification >::iterator end)
static std::pair< int, int > extractIndexPairsFromBrackets_(const String &s)
void store(const String &filename, const std::vector< ProteinIdentification > &protein_identifications, const std::vector< PeptideIdentification > &peptide_identifications, bool first_run_inference_only, bool export_empty_pep_ids=false, bool export_all_psms=false, const String &title="ID export from OpenMS")
String generateMzTabOSMHeader_(Size n_search_engine_scores, const std::vector< String > &optional_columns, size_t &n_columns) const
static String mapSearchEngineScoreToCvParam_(const String &openms_search_engine_name, double score, String score_type)
bool store_osm_uri_
Definition: MzTabFile.h:118
bool store_psm_uri_
Definition: MzTabFile.h:110
static void writePeptideHeader_(SVOutStream &output, std::map< String, Size > n_sub_samples)
bool store_smallmolecule_reliability_
Definition: MzTabFile.h:107
String generateMzTabSectionRow_(const MzTabPSMSectionRow &row, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
bool store_protein_goterms_
Definition: MzTabFile.h:112
void storeSmallMoleculeReliabilityColumn(bool store)
bool store_protein_uri_
Definition: MzTabFile.h:108
bool store_nucleic_acid_goterms_
Definition: MzTabFile.h:119
void storePSMReliabilityColumn(bool store)
bool store_protein_reliability_
Definition: MzTabFile.h:104
static String mapSearchEngineToCvParam_(const String &openms_search_engine_name)
Map search engine identifier to CV, param etc.
bool store_psm_reliability_
Definition: MzTabFile.h:106
String generateMzTabSectionRow_(const MzTabSmallMoleculeSectionRow &row, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
String generateMzTabSectionRow_(const MzTabProteinSectionRow &row, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
static void writeProteinData_(SVOutStream &output, const ProteinIdentification &prot_id, Size run_count, String input_filename, bool has_coverage, const MapAccPepType &map_run_accession_to_peptides, const std::map< String, Size > &map_run_to_num_sub)
bool store_oligonucleotide_uri_
Definition: MzTabFile.h:117
String generateMzTabSectionRow_(const MzTabNucleicAcidSectionRow &row, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
static void writeProteinHeader_(SVOutStream &output, std::map< String, Size > n_sub_samples)
static void addOptionalColumnsToSectionRow_(const std::vector< String > &column_names, const std::vector< MzTabOptionalColumnEntry > &column_entries, StringList &output)
Helper function for "generateMzTabSectionRow_" functions.
~MzTabFile()
Destructor.
String generateMzTabNucleicAcidHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_scores, const std::vector< String > &optional_columns, size_t &n_columns) const
static String extractProteinAccession_(const PeptideHit &peptide_hit)
Extracts, if possible, a unique protein accession for a peptide hit in mzTab format....
static String extractPeptideModifications_(const PeptideHit &peptide_hit)
Extracts modifications and positions of a peptide hit in mzTab format.
static String extractNumPeptidesDistinct_(String common_identifier, String protein_accession, const MapAccPepType &map_run_accession_to_peptides)
bool store_smallmolecule_uri_
Definition: MzTabFile.h:111
static void createProteinToPeptideLinks_(const std::map< String, std::vector< PeptideIdentification > > &map_run_to_pepids, MapAccPepType &map_run_accession_to_pephits)
Create links from protein to peptides.
bool store_oligonucleotide_reliability_
Definition: MzTabFile.h:114
String generateMzTabSmallMoleculeHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, Size assays, Size study_variables, const std::vector< String > &optional_columns, size_t &n_columns) const
String generateMzTabOligonucleotideHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, const std::vector< String > &optional_columns, size_t &n_columns) const
String generateMzTabPeptideHeader_(Size search_ms_runs, Size n_best_search_engine_scores, Size n_search_engine_score, Size assays, Size study_variables, const std::vector< String > &optional_columns, size_t &n_columns) const
bool store_osm_reliability_
Definition: MzTabFile.h:115
void store(const String &filename, const MzTab &mz_tab) const
std::map< std::pair< String, String >, std::vector< PeptideHit > > MapAccPepType
Definition: MzTabFile.h:63
void store(const String &filename, const ConsensusMap &cmap, const bool first_run_inference_only, const bool export_unidentified_features, const bool export_unassigned_ids, const bool export_subfeatures, const bool export_empty_pep_ids=false, const bool export_all_psms=false) const
void storeProteinReliabilityColumn(bool store)
void storeProteinGoTerms(bool store)
void generateMzTabMetaDataSection_(const MzTabMetaData &map, StringList &sl) const
String generateMzTabSectionRow_(const MzTabPeptideSectionRow &row, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
static String extractNumPeptidesUnambiguous_(String common_identifier, String protein_accession, const MapAccPepType &map_run_accession_to_peptides)
bool store_nucleic_acid_reliability_
Definition: MzTabFile.h:113
void storeSmallMoleculeUriColumn(bool store)
void load(const String &filename, MzTab &mz_tab)
MzTabFile()
Default constructor.
bool store_peptide_uri_
Definition: MzTabFile.h:109
String generateMzTabSectionRow_(const MzTabOSMSectionRow &row, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
static String extractNumPeptides_(const String &common_identifier, const String &protein_accession, const MapAccPepType &map_run_accession_to_peptides)
String generateMzTabPSMHeader_(Size n_search_engine_scores, const std::vector< String > &optional_columns, size_t &n_columns) const
String generateMzTabSectionRow_(const MzTabOligonucleotideSectionRow &row, const std::vector< String > &optional_columns, const MzTabMetaData &meta, size_t &n_columns) const
static std::map< String, Size > extractNumberOfSubSamples_(const std::map< String, std::vector< ProteinIdentification > > &map_run_to_proids)
void storePeptideUriColumn(bool store)
bool store_peptide_reliability_
Definition: MzTabFile.h:105
void storeProteinUriColumn(bool store)
void generateMzTabSection_(const std::vector< SectionRow > &rows, const std::vector< String > &optional_columns, const MzTabMetaData &meta, StringList &output, size_t n_header_columns) const
Generate an mzTab section comprising multiple rows of the same type and perform sanity check.
Definition: MzTabFile.h:158
void storePeptideReliabilityColumn(bool store)
static void partitionIntoRuns_(const std::vector< PeptideIdentification > &pep_ids, const std::vector< ProteinIdentification > &pro_ids, std::map< String, std::vector< PeptideIdentification > > &map_run_to_pepids, std::map< String, std::vector< ProteinIdentification > > &map_run_to_proids)
Extract protein and peptide identifications for each run. The maps are assumed to be empty.
bool store_nucleic_acid_uri_
Definition: MzTabFile.h:116
File adapter for MzTab-M files.
Definition: MzTabMFile.h:51
All meta data of an mzTab file. Please refer to the specification for documentation.
Definition: MzTab.h:141
Data model of MzTab files. Please see the official MzTab specification at https://code....
Definition: MzTab.h:478
Representation of a peptide hit.
Definition: PeptideHit.h:57
Representation of a protein identification run.
Definition: ProteinIdentification.h:76
Stream class for writing to comma/tab/...-separated values files.
Definition: SVOutStream.h:58
A more convenient string class.
Definition: String.h:60
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48
NUC - Nucleic acid section (table-based)
Definition: MzTab.h:362
OSM - OSM (oligonucleotide-spectrum match) section (table-based)
Definition: MzTab.h:432
OLI - Oligonucleotide section (table-based)
Definition: MzTab.h:397
PEP - Peptide section (Table based)
Definition: MzTab.h:243
SML - Small molecule section (table based)
Definition: MzTab.h:332
PSM - PSM section (Table based)
Definition: MzTab.h:281
PRT - Protein section (Table based)
Definition: MzTab.h:204