OpenMS  2.7.0
PeptideIndexing.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Andreas Bertsch, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 
54 #include <OpenMS/SYSTEM/SysInfo.h>
55 
56 #include <atomic>
57 #include <algorithm>
58 #include <fstream>
59 
60 
61 namespace OpenMS
62 {
63 
126  class OPENMS_DLLAPI PeptideIndexing :
127  public DefaultParamHandler, public ProgressLogger
128  {
129 public:
130 
132  static char const* const AUTO_MODE; /* = 'auto' */
133 
136  {
141  UNEXPECTED_RESULT
142  };
143 
145  enum class Unmatched
146  {
147  IS_ERROR,
148  WARN,
149  REMOVE,
150  SIZE_OF_UNMATCHED
151  };
152  static const std::array<std::string, (Size)Unmatched::SIZE_OF_UNMATCHED> names_of_unmatched;
153 
154  enum class MissingDecoy
155  {
156  IS_ERROR,
157  WARN,
158  SILENT,
159  SIZE_OF_MISSING_DECOY
160  };
161  static const std::array<std::string, (Size)MissingDecoy::SIZE_OF_MISSING_DECOY> names_of_missing_decoy;
162 
165 
167  ~PeptideIndexing() override;
168 
169 
171  inline ExitCodes run(std::vector<FASTAFile::FASTAEntry>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
172  {
173  FASTAContainer<TFI_Vector> protein_container(proteins);
174  return run<TFI_Vector>(protein_container, prot_ids, pep_ids);
175  }
176 
212  template<typename T>
213  ExitCodes run(FASTAContainer<T>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
214  {
215  if ((enzyme_name_ == "Chymotrypsin" || enzyme_name_ == "Chymotrypsin/P" || enzyme_name_ == "TrypChymo")
216  && IL_equivalent_)
217  {
218  throw Exception::InvalidParameter(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
219  "The used enzyme " + enzyme_name_ + "differentiates between I and L, therefore the IL_equivalent option cannot be used.");
220  }
221  // no decoy string provided? try to deduce from data
222  if (decoy_string_.empty())
223  {
224  auto r = DecoyHelper::findDecoyString(proteins);
225  proteins.reset();
226  if (!r.success)
227  {
228  r.is_prefix = true;
229  r.name = "DECOY_";
230  OPENMS_LOG_WARN << "Unable to determine decoy string automatically (not enough decoys were detected)! Using default " << (r.is_prefix ? "prefix" : "suffix") << " decoy string '" << r.name << "'\n"
231  << "If you think that this is incorrect, please provide a decoy_string and its position manually!" << std::endl;
232  }
233  prefix_ = r.is_prefix;
234  decoy_string_ = r.name;
235  // decoy string and position was extracted successfully
236  OPENMS_LOG_INFO << "Using " << (prefix_ ? "prefix" : "suffix") << " decoy string '" << decoy_string_ << "'" << std::endl;
237  }
238 
239  //---------------------------------------------------------------
240  // parsing parameters, correcting xtandem and MSGFPlus parameters
241  //---------------------------------------------------------------
242  ProteaseDigestion enzyme;
243  if (!enzyme_name_.empty() && (enzyme_name_.compare(AUTO_MODE) != 0))
244  { // use param (not empty, not 'auto')
245  enzyme.setEnzyme(enzyme_name_);
246  }
247  else if (!prot_ids.empty() && prot_ids[0].getSearchParameters().digestion_enzyme.getName() != "unknown_enzyme")
248  { // take from meta (this assumes all runs used the same enzyme)
249  OPENMS_LOG_INFO << "Info: using '" << prot_ids[0].getSearchParameters().digestion_enzyme.getName() << "' as enzyme (obtained from idXML) for digestion." << std::endl;
250  enzyme.setEnzyme(&prot_ids[0].getSearchParameters().digestion_enzyme);
251  }
252  else
253  { // fallback
254  OPENMS_LOG_WARN << "Warning: Enzyme name neither given nor deduceable from input. Defaulting to Trypsin!" << std::endl;
255  enzyme.setEnzyme("Trypsin");
256  }
257 
258  bool xtandem_fix_parameters = false;
259  bool msgfplus_fix_parameters = false;
260 
261  // determine if at least one search engine was xtandem or MSGFPlus to enable special rules
262  for (const auto& prot_id : prot_ids)
263  {
264  String search_engine = prot_id.getOriginalSearchEngineName();
265  StringUtils::toUpper(search_engine);
266  OPENMS_LOG_INFO << "Peptide identification engine: " << search_engine << std::endl;
267  if (search_engine == "XTANDEM" || prot_id.getSearchParameters().metaValueExists("SE:XTandem")) { xtandem_fix_parameters = true; }
268  if (search_engine == "MS-GF+" || search_engine == "MSGFPLUS" || prot_id.getSearchParameters().metaValueExists("SE:MS-GF+")) { msgfplus_fix_parameters = true; }
269  }
270 
271  // including MSGFPlus -> Trypsin/P as enzyme
272  if (msgfplus_fix_parameters && enzyme.getEnzymeName() == "Trypsin")
273  {
274  OPENMS_LOG_WARN << "MSGFPlus detected but enzyme cutting rules were set to Trypsin. Correcting to Trypsin/P to cope with special cutting rule in MSGFPlus." << std::endl;
275  enzyme.setEnzyme("Trypsin/P");
276  }
277 
278  OPENMS_LOG_INFO << "Enzyme: " << enzyme.getEnzymeName() << std::endl;
279 
280  if (!enzyme_specificity_.empty() && (enzyme_specificity_.compare(AUTO_MODE) != 0))
281  { // use param (not empty and not 'auto')
282  enzyme.setSpecificity(ProteaseDigestion::getSpecificityByName(enzyme_specificity_));
283  }
284  else if (!prot_ids.empty() && prot_ids[0].getSearchParameters().enzyme_term_specificity != ProteaseDigestion::SPEC_UNKNOWN)
285  { // deduce from data ('auto')
286  enzyme.setSpecificity(prot_ids[0].getSearchParameters().enzyme_term_specificity);
287  OPENMS_LOG_INFO << "Info: using '" << EnzymaticDigestion::NamesOfSpecificity[prot_ids[0].getSearchParameters().enzyme_term_specificity] << "' as enzyme specificity (obtained from idXML) for digestion." << std::endl;
288  }
289  else
290  { // fallback
291  OPENMS_LOG_WARN << "Warning: Enzyme specificity neither given nor present in the input file. Defaulting to 'full'!" << std::endl;
293  }
294 
295  //-------------------------------------------------------------
296  // calculations
297  //-------------------------------------------------------------
298  // cache the first proteins
299  const size_t PROTEIN_CACHE_SIZE = 4e5; // 400k should be enough for most DB's and is not too hard on memory either (~200 MB FASTA)
300 
301  this->startProgress(0, 1, "Load first DB chunk");
302  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
303  this->endProgress();
304 
305  if (proteins.empty()) // we do not allow an empty database
306  {
307  OPENMS_LOG_ERROR << "Error: An empty database was provided. Mapping makes no sense. Aborting..." << std::endl;
308  return DATABASE_EMPTY;
309  }
310 
311  if (pep_ids.empty()) // Aho-Corasick requires non-empty input; but we allow this case, since the TOPP tool should not crash when encountering a bad raw file (with no PSMs)
312  {
313  OPENMS_LOG_WARN << "Warning: An empty set of peptide identifications was provided. Output will be empty as well." << std::endl;
314  if (!keep_unreferenced_proteins_)
315  {
316  // delete only protein hits, not whole ID runs incl. meta data:
317  for (std::vector<ProteinIdentification>::iterator it = prot_ids.begin();
318  it != prot_ids.end(); ++it)
319  {
320  it->getHits().clear();
321  }
322  }
323  return PEPTIDE_IDS_EMPTY;
324  }
325 
326  FoundProteinFunctor func(enzyme, xtandem_fix_parameters); // store the matches
327  Map<String, Size> acc_to_prot; // map: accessions --> FASTA protein index
328  std::vector<bool> protein_is_decoy; // protein index -> is decoy?
329  std::vector<std::string> protein_accessions; // protein index -> accession
330 
331  bool invalid_protein_sequence = false; // check for proteins with modifications, i.e. '[' or '(', and throw an exception
332 
333  { // new scope - forget data after search
334 
335  /*
336  BUILD Peptide DB
337  */
338  bool has_illegal_AAs(false);
340  for (std::vector<PeptideIdentification>::const_iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
341  {
342  //String run_id = it1->getIdentifier();
343  const std::vector<PeptideHit>& hits = it1->getHits();
344  for (std::vector<PeptideHit>::const_iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
345  {
346  //
347  // Warning:
348  // do not skip over peptides here, since the results are iterated in the same way
349  //
350  String seq = it2->getSequence().toUnmodifiedString().remove('*'); // make a copy, i.e. do NOT change the peptide sequence!
351  if (seqan::isAmbiguous(seqan::AAString(seq.c_str())))
352  { // do not quit here, to show the user all sequences .. only quit after loop
353  OPENMS_LOG_ERROR << "Peptide sequence '" << it2->getSequence() << "' contains one or more ambiguous amino acids (B|J|Z|X).\n";
354  has_illegal_AAs = true;
355  }
356  if (IL_equivalent_) // convert L to I;
357  {
358  seq.substitute('L', 'I');
359  }
360  appendValue(pep_DB, seq.c_str());
361  }
362  }
363  if (has_illegal_AAs)
364  {
365  OPENMS_LOG_ERROR << "One or more peptides contained illegal amino acids. This is not allowed!"
366  << "\nPlease either remove the peptide or replace it with one of the unambiguous ones (while allowing for ambiguous AA's to match the protein)." << std::endl;;
367  }
368 
369  OPENMS_LOG_INFO << "Mapping " << length(pep_DB) << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl;
370 
371  if (length(pep_DB) == 0)
372  { // Aho-Corasick will crash if given empty needles as input
373  OPENMS_LOG_WARN << "Warning: Peptide identifications have no hits inside! Output will be empty as well." << std::endl;
374  return PEPTIDE_IDS_EMPTY;
375  }
376 
377  /*
378  Aho Corasick (fast)
379  */
380  OPENMS_LOG_INFO << "Searching with up to " << aaa_max_ << " ambiguous amino acid(s) and " << mm_max_ << " mismatch(es)!" << std::endl;
382  OPENMS_LOG_INFO << "Building trie ...";
383  StopWatch s;
384  s.start();
386  AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern);
387  s.stop();
388  OPENMS_LOG_INFO << " done (" << int(s.getClockTime()) << "s)" << std::endl;
389  s.reset();
390 
391  uint16_t count_j_proteins(0);
392  bool has_active_data = true; // becomes false if end of FASTA file is reached
393  const std::string jumpX(aaa_max_ + mm_max_ + 1, 'X'); // jump over stretches of 'X' which cost a lot of time; +1 because AXXA is a valid hit for aaa_max == 2 (cannot split it)
394  // use very large target value for progress if DB size is unknown (did not fit into first chunk)
395  this->startProgress(0, proteins.size() == PROTEIN_CACHE_SIZE ? std::numeric_limits<SignedSize>::max() : proteins.size(), "Aho-Corasick");
396  std::atomic<int> progress_prots(0);
397 #ifdef _OPENMP
398 #pragma omp parallel
399 #endif
400  {
401  FoundProteinFunctor func_threads(enzyme, xtandem_fix_parameters);
402  Map<String, Size> acc_to_prot_thread; // map: accessions --> FASTA protein index
403  AhoCorasickAmbiguous fuzzyAC;
404  String prot;
405 
406  while (true)
407  {
408  #pragma omp barrier // all threads need to be here, since we are about to swap protein data
409  #pragma omp single
410  {
411  DEBUG_ONLY std::cerr << " activating cache ...\n";
412  has_active_data = proteins.activateCache(); // swap in last cache
413  protein_accessions.resize(proteins.getChunkOffset() + proteins.chunkSize());
414  } // implicit barrier here
415 
416  if (!has_active_data) break; // leave while-loop
417  SignedSize prot_count = (SignedSize)proteins.chunkSize();
418 
419  #pragma omp master
420  {
421  DEBUG_ONLY std::cerr << "Filling Protein Cache ...";
422  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
423  protein_is_decoy.resize(proteins.getChunkOffset() + prot_count);
424  for (SignedSize i = 0; i < prot_count; ++i)
425  { // do this in master only, to avoid false sharing
426  const String& seq = proteins.chunkAt(i).identifier;
427  protein_is_decoy[i + proteins.getChunkOffset()] = (prefix_ ? seq.hasPrefix(decoy_string_) : seq.hasSuffix(decoy_string_));
428  }
429  DEBUG_ONLY std::cerr << " done" << std::endl;
430  }
431  DEBUG_ONLY std::cerr << " starting for loop \n";
432  // search all peptides in each protein
433  #pragma omp for schedule(dynamic, 100) nowait
434  for (SignedSize i = 0; i < prot_count; ++i)
435  {
436  ++progress_prots; // atomic
437  if (omp_get_thread_num() == 0)
438  {
439  this->setProgress(progress_prots);
440  }
441 
442  prot = proteins.chunkAt(i).sequence;
443  prot.remove('*');
444 
445  // check for invalid sequences with modifications
446  if (prot.has('[') || prot.has('('))
447  {
448  invalid_protein_sequence = true; // not omp-critical because its write-only
449  // we cannot throw an exception here, since we'd need to catch it within the parallel region
450  }
451 
452  // convert L/J to I; also replace 'J' in proteins
453  if (IL_equivalent_)
454  {
455  prot.substitute('L', 'I');
456  prot.substitute('J', 'I');
457  }
458  else
459  { // warn if 'J' is found (it eats into aaa_max)
460  if (prot.has('J'))
461  {
462  #pragma omp atomic
463  ++count_j_proteins;
464  }
465  }
466 
467  Size prot_idx = i + proteins.getChunkOffset();
468 
469  // test if protein was a hit
470  Size hits_total = func_threads.filter_passed + func_threads.filter_rejected;
471 
472  // check if there are stretches of 'X'
473  if (prot.has('X'))
474  {
475  // create chunks of the protein (splitting it at stretches of 'X..X') and feed them to AC one by one
476  size_t offset = -1, start = 0;
477  while ((offset = prot.find(jumpX, offset + 1)) != std::string::npos)
478  {
479  //std::cout << "found X..X at " << offset << " in protein " << proteins[i].identifier << "\n";
480  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads);
481  // skip ahead while we encounter more X...
482  while (offset + jumpX.size() < prot.size() && prot[offset + jumpX.size()] == 'X') ++offset;
483  start = offset;
484  //std::cout << " new start: " << start << "\n";
485  }
486  // last chunk
487  if (start < prot.size())
488  {
489  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads);
490  }
491  }
492  else
493  {
494  addHits_(fuzzyAC, pattern, pep_DB, prot, prot, prot_idx, 0, func_threads);
495  }
496  // was protein found?
497  if (hits_total < func_threads.filter_passed + func_threads.filter_rejected)
498  {
499  protein_accessions[prot_idx] = proteins.chunkAt(i).identifier;
500  acc_to_prot_thread[protein_accessions[prot_idx]] = prot_idx;
501  }
502  } // end parallel FOR
503 
504  // join results again
505  DEBUG_ONLY std::cerr << " critical now \n";
506  #ifdef _OPENMP
507  #pragma omp critical(PeptideIndexer_joinAC)
508  #endif
509  {
510  s.start();
511  // hits
512  func.merge(func_threads);
513  // accession -> index
514  acc_to_prot.insert(acc_to_prot_thread.begin(), acc_to_prot_thread.end());
515  acc_to_prot_thread.clear();
516  s.stop();
517  } // OMP end critical
518  } // end readChunk
519  } // OMP end parallel
520  this->endProgress();
521  std::cout << "Merge took: " << s.toString() << "\n";
522  mu.after();
523  std::cout << mu.delta("Aho-Corasick") << "\n\n";
524 
525  OPENMS_LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << length(pep_DB) << " peptides.\n";
526 
527  // write some stats
528  OPENMS_LOG_INFO << "Peptide hits passing enzyme filter: " << func.filter_passed << "\n"
529  << " ... rejected by enzyme filter: " << func.filter_rejected << std::endl;
530 
531  if (count_j_proteins)
532  {
533  OPENMS_LOG_WARN << "PeptideIndexer found " << count_j_proteins << " protein sequences in your database containing the amino acid 'J'."
534  << "To match 'J' in a protein, an ambiguous amino acid placeholder for I/L will be used.\n"
535  << "This costs runtime and eats into the 'aaa_max' limit, leaving less opportunity for B/Z/X matches.\n"
536  << "If you want 'J' to be treated as unambiguous, enable '-IL_equivalent'!" << std::endl;
537  }
538 
539  } // end local scope
540 
541  //
542  // do mapping
543  //
544  // index existing proteins
545  Map<String, Size> runid_to_runidx; // identifier to index
546  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
547  {
548  runid_to_runidx[prot_ids[run_idx].getIdentifier()] = run_idx;
549  }
550 
551  // for peptides --> proteins
552  Size stats_matched_unique(0);
553  Size stats_matched_multi(0);
554  Size stats_unmatched(0); // no match to DB
555  Size stats_count_m_t(0); // match to Target DB
556  Size stats_count_m_d(0); // match to Decoy DB
557  Size stats_count_m_td(0); // match to T+D DB
558 
559  Map<Size, std::set<Size> > runidx_to_protidx; // in which protID do appear which proteins (according to mapped peptides)
560 
561  Size pep_idx(0);
562  for (std::vector<PeptideIdentification>::iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
563  {
564  // which ProteinIdentification does the peptide belong to?
565  Size run_idx = runid_to_runidx[it1->getIdentifier()];
566 
567  std::vector<PeptideHit>& hits = it1->getHits();
568 
569  for (std::vector<PeptideHit>::iterator it_hit = hits.begin(); it_hit != hits.end(); /* no increase here! we might need to skip it; see below */)
570  {
571  // clear protein accessions
572  it_hit->setPeptideEvidences(std::vector<PeptideEvidence>());
573 
574  //
575  // is this a decoy hit?
576  //
577  bool matches_target(false);
578  bool matches_decoy(false);
579 
580  std::set<Size> prot_indices;
581  // add new protein references
582  for (std::set<PeptideProteinMatchInformation>::const_iterator it_i = func.pep_to_prot[pep_idx].begin();
583  it_i != func.pep_to_prot[pep_idx].end(); ++it_i)
584  {
585  prot_indices.insert(it_i->protein_index);
586  const String& accession = protein_accessions[it_i->protein_index];
587  PeptideEvidence pe(accession, it_i->position, it_i->position + (int)it_hit->getSequence().size() - 1, it_i->AABefore, it_i->AAAfter);
588  it_hit->addPeptideEvidence(pe);
589 
590  runidx_to_protidx[run_idx].insert(it_i->protein_index); // fill protein hits
591 
592  if (protein_is_decoy[it_i->protein_index])
593  {
594  matches_decoy = true;
595  }
596  else
597  {
598  matches_target = true;
599  }
600  }
601  ++pep_idx; // next hit
602 
603  if (matches_decoy && matches_target)
604  {
605  it_hit->setMetaValue("target_decoy", "target+decoy");
606  ++stats_count_m_td;
607  }
608  else if (matches_target)
609  {
610  it_hit->setMetaValue("target_decoy", "target");
611  ++stats_count_m_t;
612  }
613  else if (matches_decoy)
614  {
615  it_hit->setMetaValue("target_decoy", "decoy");
616  ++stats_count_m_d;
617  } // else: could match to no protein (i.e. both are false)
618  //else ... // not required (handled below; see stats_unmatched);
619 
620  if (prot_indices.size() == 1)
621  {
622  it_hit->setMetaValue("protein_references", "unique");
623  ++stats_matched_unique;
624  }
625  else if (prot_indices.size() > 1)
626  {
627  it_hit->setMetaValue("protein_references", "non-unique");
628  ++stats_matched_multi;
629  }
630  else
631  {
632  ++stats_unmatched;
633  if (stats_unmatched < 15) OPENMS_LOG_INFO << "Unmatched peptide: " << it_hit->getSequence() << "\n";
634  else if (stats_unmatched == 15) OPENMS_LOG_INFO << "Unmatched peptide: ...\n";
635  if (unmatched_action_ == Unmatched::REMOVE)
636  {
637  it_hit = hits.erase(it_hit);
638  continue; // already points to the next hit
639  }
640  else
641  {
642  it_hit->setMetaValue("protein_references", "unmatched");
643  }
644  }
645 
646  ++it_hit; // next hit
647  } // all hits
648 
649  } // next PepID
650 
651  Size total_peptides = stats_count_m_t + stats_count_m_d + stats_count_m_td + stats_unmatched;
652  OPENMS_LOG_INFO << "-----------------------------------\n";
653  OPENMS_LOG_INFO << "Peptide statistics\n";
654  OPENMS_LOG_INFO << "\n";
655  OPENMS_LOG_INFO << " unmatched : " << stats_unmatched << " (" << stats_unmatched * 100 / total_peptides << " %)\n";
656  OPENMS_LOG_INFO << " target/decoy:\n";
657  OPENMS_LOG_INFO << " match to target DB only: " << stats_count_m_t << " (" << stats_count_m_t * 100 / total_peptides << " %)\n";
658  OPENMS_LOG_INFO << " match to decoy DB only : " << stats_count_m_d << " (" << stats_count_m_d * 100 / total_peptides << " %)\n";
659  OPENMS_LOG_INFO << " match to both : " << stats_count_m_td << " (" << stats_count_m_td * 100 / total_peptides << " %)\n";
660  OPENMS_LOG_INFO << "\n";
661  OPENMS_LOG_INFO << " mapping to proteins:\n";
662  OPENMS_LOG_INFO << " no match (to 0 protein) : " << stats_unmatched << "\n";
663  OPENMS_LOG_INFO << " unique match (to 1 protein) : " << stats_matched_unique << "\n";
664  OPENMS_LOG_INFO << " non-unique match (to >1 protein): " << stats_matched_multi << std::endl;
665 
667  Size stats_matched_proteins(0), stats_matched_new_proteins(0), stats_orphaned_proteins(0), stats_proteins_target(0), stats_proteins_decoy(0);
668 
669  // all peptides contain the correct protein hit references, now update the protein hits
670  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
671  {
672  std::set<Size> masterset = runidx_to_protidx[run_idx]; // all protein matches from above
673 
674  std::vector<ProteinHit>& phits = prot_ids[run_idx].getHits();
675  {
676  // go through existing protein hits and count orphaned proteins (with no peptide hits)
677  std::vector<ProteinHit> orphaned_hits;
678  for (std::vector<ProteinHit>::iterator p_hit = phits.begin(); p_hit != phits.end(); ++p_hit)
679  {
680  const String& acc = p_hit->getAccession();
681  if (!acc_to_prot.has(acc)) // acc_to_prot only contains found proteins from current run
682  { // old hit is orphaned
683  ++stats_orphaned_proteins;
684  if (keep_unreferenced_proteins_)
685  {
686  p_hit->setMetaValue("target_decoy", "");
687  orphaned_hits.push_back(*p_hit);
688  }
689  }
690  }
691  // only keep orphaned hits (if any)
692  phits = orphaned_hits;
693  }
694 
695  // add new protein hits
697  phits.reserve(phits.size() + masterset.size());
698  for (std::set<Size>::const_iterator it = masterset.begin(); it != masterset.end(); ++it)
699  {
700  ProteinHit hit;
701  hit.setAccession(protein_accessions[*it]);
702 
703  if (write_protein_sequence_ || write_protein_description_)
704  {
705  proteins.readAt(fe, *it);
706  if (write_protein_sequence_)
707  {
708  hit.setSequence(fe.sequence);
709  } // no else, since sequence is empty by default
710  if (write_protein_description_)
711  {
712  hit.setDescription(fe.description);
713  } // no else, since description is empty by default
714  }
715  if (protein_is_decoy[*it])
716  {
717  hit.setMetaValue("target_decoy", "decoy");
718  ++stats_proteins_decoy;
719  }
720  else
721  {
722  hit.setMetaValue("target_decoy", "target");
723  ++stats_proteins_target;
724  }
725  phits.push_back(hit);
726  ++stats_matched_new_proteins;
727  }
728  stats_matched_proteins += phits.size();
729  }
730 
731 
732  OPENMS_LOG_INFO << "-----------------------------------\n";
733  OPENMS_LOG_INFO << "Protein statistics\n";
734  OPENMS_LOG_INFO << "\n";
735  OPENMS_LOG_INFO << " total proteins searched: " << proteins.size() << "\n";
736  OPENMS_LOG_INFO << " matched proteins : " << stats_matched_proteins << " (" << stats_matched_new_proteins << " new)\n";
737  if (stats_matched_proteins)
738  { // prevent Division-by-0 Exception
739  OPENMS_LOG_INFO << " matched target proteins: " << stats_proteins_target << " (" << stats_proteins_target * 100 / stats_matched_proteins << " %)\n";
740  OPENMS_LOG_INFO << " matched decoy proteins : " << stats_proteins_decoy << " (" << stats_proteins_decoy * 100 / stats_matched_proteins << " %)\n";
741  }
742  OPENMS_LOG_INFO << " orphaned proteins : " << stats_orphaned_proteins << (keep_unreferenced_proteins_ ? " (all kept)" : " (all removed)\n");
743  OPENMS_LOG_INFO << "-----------------------------------" << std::endl;
744 
745 
747  bool has_error = false;
748 
749  if (invalid_protein_sequence)
750  {
751  OPENMS_LOG_ERROR << "Error: One or more protein sequences contained the characters '[' or '(', which are illegal in protein sequences."
752  << "\nPeptide hits might be masked by these characters (which usually indicate presence of modifications).\n";
753  has_error = true;
754  }
755 
756  if ((stats_count_m_d + stats_count_m_td) == 0)
757  {
758  String msg("No peptides were matched to the decoy portion of the database! Did you provide the correct concatenated database? Are your 'decoy_string' (=" + decoy_string_ + ") and 'decoy_string_position' (=" + std::string(param_.getValue("decoy_string_position")) + ") settings correct?");
759  if (missing_decoy_action_ == MissingDecoy::IS_ERROR)
760  {
761  OPENMS_LOG_ERROR << "Error: " << msg << "\nSet 'missing_decoy_action' to 'warn' if you are sure this is ok!\nAborting ..." << std::endl;
762  has_error = true;
763  }
764  else if (missing_decoy_action_ == MissingDecoy::WARN)
765  {
766  OPENMS_LOG_WARN << "Warn: " << msg << "\nSet 'missing_decoy_action' to 'error' if you want to elevate this to an error!" << std::endl;
767  }
768  else // silent
769  {
770  }
771  }
772 
773  if (stats_unmatched > 0)
774  {
775  OPENMS_LOG_ERROR << "PeptideIndexer found unmatched peptides, which could not be associated to a protein.\n";
776  if (unmatched_action_ == Unmatched::IS_ERROR)
777  {
779  << "Potential solutions:\n"
780  << " - check your FASTA database is identical to the search DB (or use 'auto')\n"
781  << " - set 'enzyme:specificity' and 'enzyme:name' to 'auto' to match the parameters of the search engine\n"
782  << " - increase 'aaa_max' to allow more ambiguous amino acids\n"
783  << " - as a last resort: use the 'unmatched_action' option to accept or even remove unmatched peptides\n"
784  << " (note that unmatched peptides cannot be used for FDR calculation or quantification)\n";
785  has_error = true;
786  }
787  else if (unmatched_action_ == Unmatched::WARN)
788  {
789  OPENMS_LOG_ERROR << " Warning: " << stats_unmatched << " unmatched hits have been found, but were not removed!\n"
790  << "These are not annotated with target/decoy information and might lead to issues with downstream tools (such as FDR).\n"
791  << "Switch to '" << names_of_unmatched[(Size)Unmatched::REMOVE] << "' if you want to avoid these problems.\n";
792  }
793  else if (unmatched_action_ == Unmatched::REMOVE)
794  {
795  OPENMS_LOG_ERROR << " Warning: " << stats_unmatched <<" unmatched hits have been removed!\n"
796  << "Make sure that these hits are actually a violation of the cutting rules by inspecting the database!\n";
797  if (xtandem_fix_parameters) OPENMS_LOG_ERROR << "Since the results are from X!Tandem, this is probably ok (check anyways).\n";
798  }
799  else
800  {
801  throw Exception::NotImplemented(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
802  }
803  }
804 
805 
806  if (has_error)
807  {
808  OPENMS_LOG_ERROR << "Result files will be written, but PeptideIndexer will exit with an error code." << std::endl;
809  return UNEXPECTED_RESULT;
810  }
811  return EXECUTION_OK;
812  }
813 
814  const String& getDecoyString() const;
815 
816  bool isPrefix() const;
817 
818  protected:
819 
821  {
822  OpenMS::Size protein_index; //< index of the protein the peptide is contained in
823  OpenMS::Int position; //< the position of the peptide in the protein
824  char AABefore; //< the amino acid after the peptide in the protein
825  char AAAfter; //< the amino acid before the peptide in the protein
826 
827  const std::tuple<const Size&, const Int&, const char&, const char&> tie() const
828  {
829  return std::tie(protein_index, position, AABefore, AAAfter);
830  }
831  bool operator<(const PeptideProteinMatchInformation& other) const
832  {
833  return tie() < other.tie();
834  }
836  {
837  return tie() == other.tie();
838  }
839  };
840 
842  {
843  public:
844  typedef std::map<OpenMS::Size, std::set<PeptideProteinMatchInformation> > MapType;
845  MapType pep_to_prot; //< peptide index --> protein indices
846  OpenMS::Size filter_passed; //< number of accepted hits (passing addHit() constraints)
847  OpenMS::Size filter_rejected; //< number of rejected hits (not passing addHit())
848 
849  private:
851  bool xtandem_; //< are we checking xtandem cleavage rules?
852 
853  public:
854  explicit FoundProteinFunctor(const ProteaseDigestion& enzyme, bool xtandem) :
855  pep_to_prot(), filter_passed(0), filter_rejected(0), enzyme_(enzyme), xtandem_(xtandem)
856  {
857  }
858 
860  {
861  if (pep_to_prot.empty())
862  { // first merge is easy
863  pep_to_prot.swap(other.pep_to_prot);
864  }
865  else
866  {
867  for (FoundProteinFunctor::MapType::const_iterator it = other.pep_to_prot.begin(); it != other.pep_to_prot.end(); ++it)
868  { // augment set
869  this->pep_to_prot[it->first].insert(other.pep_to_prot[it->first].begin(), other.pep_to_prot[it->first].end());
870  }
871  other.pep_to_prot.clear();
872  }
873  // cheap members
874  this->filter_passed += other.filter_passed;
875  other.filter_passed = 0;
876  this->filter_rejected += other.filter_rejected;
877  other.filter_rejected = 0;
878  }
879 
880  void addHit(const OpenMS::Size idx_pep,
881  const OpenMS::Size idx_prot,
882  const OpenMS::Size len_pep,
883  const OpenMS::String& seq_prot,
885  {
886  //TODO we could read and double-check missed cleavages as well
887  if (enzyme_.isValidProduct(seq_prot, position, len_pep, true, true, xtandem_))
888  {
890  {
891  idx_prot,
892  position,
893  (position == 0) ? PeptideEvidence::N_TERMINAL_AA : seq_prot[position - 1],
894  (position + len_pep >= seq_prot.size()) ?
896  seq_prot[position + len_pep]
897  };
898  pep_to_prot[idx_pep].insert(match);
899  ++filter_passed;
900  }
901  else
902  {
903  //std::cerr << "REJECTED Peptide " << seq_pep << " with hit to protein "
904  // << seq_prot << " at position " << position << std::endl;
905  ++filter_rejected;
906  }
907  }
908 
909  };
910 
911  inline void addHits_(AhoCorasickAmbiguous& fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern& pattern, const AhoCorasickAmbiguous::PeptideDB& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const
912  {
913  fuzzyAC.setProtein(prot);
914  while (fuzzyAC.findNext(pattern))
915  {
916  const seqan::Peptide& tmp_pep = pep_DB[fuzzyAC.getHitDBIndex()];
917  func_threads.addHit(fuzzyAC.getHitDBIndex(), idx_prot, length(tmp_pep), full_prot, fuzzyAC.getHitProteinPosition() + offset);
918  }
919  }
920 
921  void updateMembers_() override;
922 
923  String decoy_string_{};
924  bool prefix_{ false };
925  MissingDecoy missing_decoy_action_ = MissingDecoy::IS_ERROR;
926  String enzyme_name_{};
927  String enzyme_specificity_{};
928 
929  bool write_protein_sequence_{ false };
930  bool write_protein_description_{ false };
931  bool keep_unreferenced_proteins_{ false };
932  Unmatched unmatched_action_ = Unmatched::IS_ERROR;
933  bool IL_equivalent_{ false };
934 
935  Int aaa_max_{0};
936  Int mm_max_{0};
937  };
938 }
939 
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:460
#define OPENMS_LOG_ERROR
Macro to be used if non-fatal error are reported (processing continues)
Definition: LogStream.h:455
#define OPENMS_LOG_INFO
Macro if a information, e.g. a status should be reported.
Definition: LogStream.h:465
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i....
Definition: AhoCorasickAmbiguous.h:973
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:975
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1026
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1059
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:993
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1039
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1049
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:976
static Result findDecoyString(FASTAContainer< T > &proteins)
Heuristic to determine the decoy string given a set of protein names.
Definition: FASTAContainer.h:361
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:93
@ SPEC_FULL
fully enzyme specific, e.g., tryptic (ends with KR, AA-before is KR), or peptide is at protein termin...
Definition: EnzymaticDigestion.h:70
@ SPEC_UNKNOWN
Definition: EnzymaticDigestion.h:71
static Specificity getSpecificityByName(const String &name)
static const std::string NamesOfSpecificity[SIZE_OF_SPECIFICITY]
Names of the Specificity.
Definition: EnzymaticDigestion.h:77
String getEnzymeName() const
Returns the enzyme for the digestion.
void setSpecificity(Specificity spec)
Sets the specificity for the digestion (default is SPEC_FULL).
Exception indicating that an invalid parameter was handed over to an algorithm.
Definition: Exception.h:341
Not implemented exception.
Definition: Exception.h:430
FASTAContainer<TFI_Vector> simply takes an existing vector of FASTAEntries and provides the same inte...
Definition: FASTAContainer.h:246
Map class based on the STL map (containing several convenience functions)
Definition: Map.h:52
bool has(const Key &key) const
Test whether the map contains the given key.
Definition: Map.h:108
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
Representation of a peptide evidence.
Definition: PeptideEvidence.h:51
static const char C_TERMINAL_AA
Definition: PeptideEvidence.h:61
static const char N_TERMINAL_AA
Definition: PeptideEvidence.h:60
Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds ...
Definition: PeptideIndexing.h:128
const String & getDecoyString() const
Unmatched
Action to take when peptide hits could not be matched.
Definition: PeptideIndexing.h:146
MissingDecoy
Definition: PeptideIndexing.h:155
static char const *const AUTO_MODE
name of enzyme/specificity which signals that the enzyme/specificity should be taken from meta inform...
Definition: PeptideIndexing.h:132
ExitCodes run(std::vector< FASTAFile::FASTAEntry > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
forward for old interface and pyOpenMS; use run<T>() for more control
Definition: PeptideIndexing.h:171
ExitCodes run(FASTAContainer< T > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
Re-index peptide identifications honoring enzyme cutting rules, ambiguous amino acids and target/deco...
Definition: PeptideIndexing.h:213
~PeptideIndexing() override
Default destructor.
PeptideIndexing()
Default constructor.
void updateMembers_() override
This method is used to update extra member variables at the end of the setParameters() method.
ExitCodes
Exit codes.
Definition: PeptideIndexing.h:136
@ PEPTIDE_IDS_EMPTY
Definition: PeptideIndexing.h:139
@ ILLEGAL_PARAMETERS
Definition: PeptideIndexing.h:140
@ DATABASE_EMPTY
Definition: PeptideIndexing.h:138
@ EXECUTION_OK
Definition: PeptideIndexing.h:137
void addHits_(AhoCorasickAmbiguous &fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern &pattern, const AhoCorasickAmbiguous::PeptideDB &pep_DB, const String &prot, const String &full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor &func_threads) const
Definition: PeptideIndexing.h:911
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:55
Class for the enzymatic digestion of proteins.
Definition: ProteaseDigestion.h:61
bool isValidProduct(const String &protein, int pep_pos, int pep_length, bool ignore_missed_cleavages=true, bool allow_nterm_protein_cleavage=false, bool allow_random_asp_pro_cleavage=false) const
Variant of EnzymaticDigestion::isValidProduct() with support for n-term protein cleavage and random D...
void setEnzyme(const String &name)
Sets the enzyme for the digestion (by name)
Representation of a protein hit.
Definition: ProteinHit.h:60
void setSequence(const String &sequence)
sets the protein sequence
void setDescription(const String &description)
sets the description of the protein
void setAccession(const String &accession)
sets the accession of the protein
This class is used to determine the current process' CPU (user and/or kernel) and wall time.
Definition: StopWatch.h:66
String toString() const
get a compact representation of the current time status.
void start()
Start the stop watch.
void stop()
Stop the stop watch (can be resumed later). If the stop watch was not running an exception is thrown.
double getClockTime() const
void reset()
Clear the stop watch but keep running.
static String & toUpper(String &this_s)
Definition: StringUtils.h:874
A more convenient string class.
Definition: String.h:61
String substr(size_t pos=0, size_t n=npos) const
Wrapper for the STL substr() method. Returns a String object with its contents initialized to a subst...
bool hasPrefix(const String &string) const
true if String begins with string, false otherwise
String & remove(char what)
Remove all occurrences of the character what.
bool has(Byte byte) const
true if String contains the byte, false otherwise
String & substitute(char from, char to)
Replaces all occurrences of the character from by the character to.
bool hasSuffix(const String &string) const
true if String ends with string, false otherwise
int Int
Signed integer type.
Definition: Types.h:102
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:82
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:580
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:563
FASTA entry type (identifier, description and sequence) The first String corresponds to the identifie...
Definition: FASTAFile.h:72
String sequence
Definition: FASTAFile.h:75
String description
Definition: FASTAFile.h:74
Definition: PeptideIndexing.h:842
ProteaseDigestion enzyme_
Definition: PeptideIndexing.h:850
OpenMS::Size filter_rejected
Definition: PeptideIndexing.h:847
void addHit(const OpenMS::Size idx_pep, const OpenMS::Size idx_prot, const OpenMS::Size len_pep, const OpenMS::String &seq_prot, OpenMS::Int position)
Definition: PeptideIndexing.h:880
std::map< OpenMS::Size, std::set< PeptideProteinMatchInformation > > MapType
Definition: PeptideIndexing.h:844
FoundProteinFunctor(const ProteaseDigestion &enzyme, bool xtandem)
Definition: PeptideIndexing.h:854
MapType pep_to_prot
Definition: PeptideIndexing.h:845
bool xtandem_
Definition: PeptideIndexing.h:851
void merge(FoundProteinFunctor &other)
Definition: PeptideIndexing.h:859
OpenMS::Size filter_passed
Definition: PeptideIndexing.h:846
OpenMS::Size protein_index
Definition: PeptideIndexing.h:822
bool operator<(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:831
const std::tuple< const Size &, const Int &, const char &, const char & > tie() const
Definition: PeptideIndexing.h:827
OpenMS::Int position
Definition: PeptideIndexing.h:823
char AABefore
Definition: PeptideIndexing.h:824
bool operator==(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:835
char AAAfter
Definition: PeptideIndexing.h:825
A convenience class to report either absolute or delta (between two timepoints) RAM usage.
Definition: SysInfo.h:84
String delta(const String &event="delta")
void after()
record data for the second timepoint