OpenMS
FASTAContainer.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
43 
44 #include <functional>
45 #include <fstream>
46 #include <unordered_map>
47 #include <memory>
48 #include <utility>
49 #include <vector>
50 
51 #include <boost/regex.hpp>
52 
53 namespace OpenMS
54 {
55 
56  struct TFI_File;
57  struct TFI_Vector;
58 
81 template<typename TBackend>
82 class FASTAContainer; // prototype
83 
92 template<>
93 class FASTAContainer<TFI_File>
94 {
95 public:
96  FASTAContainer() = delete;
97 
99  FASTAContainer(const String& FASTA_file)
100  : f_(),
101  offsets_(),
102  data_fg_(),
103  data_bg_(),
104  chunk_offset_(0),
105  filename_(FASTA_file)
106  {
107  f_.readStart(FASTA_file);
108  }
109 
111  size_t getChunkOffset() const
112  {
113  return chunk_offset_;
114  }
115 
123  {
124  chunk_offset_ += data_fg_.size();
125  data_fg_.swap(data_bg_);
126  data_bg_.clear(); // just in case someone calls activateCache() multiple times...
127  return !data_fg_.empty();
128  }
129 
136  bool cacheChunk(int suggested_size)
137  {
138  data_bg_.clear();
139  data_bg_.reserve(suggested_size);
141  for (int i = 0; i < suggested_size; ++i)
142  {
143  std::streampos spos = f_.position();
144  if (!f_.readNext(p)) break;
145  data_bg_.push_back(std::move(p));
146  offsets_.push_back(spos);
147  }
148  return !data_bg_.empty();
149  }
150 
152  size_t chunkSize() const
153  {
154  return data_fg_.size();
155  }
156 
164  const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
165  {
166  return data_fg_[pos];
167  }
168 
182  bool readAt(FASTAFile::FASTAEntry& protein, size_t pos)
183  {
184  // check if position is currently cached...
185  if (chunk_offset_ <= pos && pos < chunk_offset_ + chunkSize())
186  {
187  protein = data_fg_[pos - chunk_offset_];
188  return true;
189  }
190  // read anew from disk...
191  if (pos >= offsets_.size())
192  {
193  throw Exception::IndexOverflow(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, pos, offsets_.size());
194  }
195  std::streampos spos = f_.position(); // save old position
196  if (!f_.setPosition(offsets_[pos])) return false;
197  bool r = f_.readNext(protein);
198  f_.setPosition(spos); // restore old position
199  return r;
200  }
201 
203  bool empty()
204  { // trusting the FASTA file can be read...
205  return f_.atEnd() && offsets_.empty();
206  }
207 
209  void reset()
210  {
211  offsets_.clear();
212  data_fg_.clear();
213  data_bg_.clear();
214  chunk_offset_ = 0;
215  f_.readStart(filename_);
216  }
217 
218 
224  size_t size() const
225  {
226  return offsets_.size();
227  }
228 
229 private:
231  std::vector<std::streampos> offsets_;
232  std::vector<FASTAFile::FASTAEntry> data_fg_;
233  std::vector<FASTAFile::FASTAEntry> data_bg_;
234  size_t chunk_offset_;
235  std::string filename_;
236 };
237 
244 template<>
245 class FASTAContainer<TFI_Vector>
246 {
247 public:
248  FASTAContainer() = delete;
249 
255  FASTAContainer(const std::vector<FASTAFile::FASTAEntry>& data)
256  : data_(data)
257  {
258  }
259 
261  size_t getChunkOffset() const
262  {
263  return 0;
264  }
265 
271  {
272  if (!activate_count_)
273  {
274  activate_count_ = 1;
275  return true;
276  }
277  return false;
278  }
279 
283  bool cacheChunk(int /*suggested_size*/)
284  {
285  if (!cache_count_)
286  {
287  cache_count_ = 1;
288  return true;
289  }
290  return false;
291  }
292 
297  size_t chunkSize() const
298  {
299  return data_.size();
300  }
301 
303  const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
304  {
305  return data_[pos];
306  }
307 
309  bool readAt(FASTAFile::FASTAEntry& protein, size_t pos) const
310  {
311  protein = data_[pos];
312  return true;
313  }
314 
316  bool empty() const
317  {
318  return data_.empty();
319  }
320 
322  size_t size() const
323  {
324  return data_.size();
325  }
326 
328  void reset()
329  {
330  activate_count_ = 0;
331  cache_count_ = 0;
332  }
333 
334 private:
335  const std::vector<FASTAFile::FASTAEntry>& data_;
336  int activate_count_ = 0;
337  int cache_count_ = 0;
338 };
339 
344 {
345 public:
346  struct Result
347  {
348  bool success;
350  bool is_prefix;
351 
352  bool operator==(const Result& rhs) const
353  {
354  return success == rhs.success
355  && name == rhs.name
356  && is_prefix == rhs.is_prefix;
357  }
358  };
359 
364  {
365  std::unordered_map<std::string, std::pair<Size, Size>> decoy_count;
366  std::unordered_map<std::string, std::string> decoy_case_sensitive;
370 
371  bool operator==(const DecoyStatistics& rhs) const
372  {
373  return decoy_count == rhs.decoy_count
378  }
379  };
380 
381  // decoy strings
382  inline static const std::vector<std::string> affixes = { "decoy", "dec", "reverse", "rev", "reversed", "__id_decoy", "xxx", "shuffled", "shuffle", "pseudo", "random" };
383 
384  // setup prefix- and suffix regex strings
385  inline static const std::string regexstr_prefix = std::string("^(") + ListUtils::concatenate<std::string>(affixes, "_*|") + "_*)";
386  inline static const std::string regexstr_suffix = std::string("(_") + ListUtils::concatenate<std::string>(affixes, "*|_") + ")$";
387 
395  template<typename T>
397  {
398  // calls function to search for decoys in input data
399  DecoyStatistics decoy_stats = countDecoys(proteins);
400 
401  // DEBUG ONLY: print counts of found decoys
402  for (const auto &a : decoy_stats.decoy_count)
403  {
404  OPENMS_LOG_DEBUG << a.first << "\t" << a.second.first << "\t" << a.second.second << std::endl;
405  }
406 
407  // less than 40% of proteins are decoys -> won't be able to determine a decoy string and its position
408  // return default values
409  if (static_cast<double>(decoy_stats.all_prefix_occur + decoy_stats.all_suffix_occur) < 0.4 * static_cast<double>(decoy_stats.all_proteins_count))
410  {
411  OPENMS_LOG_ERROR << "Unable to determine decoy string (not enough occurrences; <40%)!" << std::endl;
412  return {false, "?", true};
413  }
414 
415  if (decoy_stats.all_prefix_occur == decoy_stats.all_suffix_occur)
416  {
417  OPENMS_LOG_ERROR << "Unable to determine decoy string (prefix and suffix occur equally often)!" << std::endl;
418  return {false, "?", true};
419  }
420 
421  // Decoy prefix occurred at least 80% of all prefixes + observed in at least 40% of all proteins -> set it as prefix decoy
422  for (const auto& pair : decoy_stats.decoy_count)
423  {
424  const std::string & case_insensitive_decoy_string = pair.first;
425  const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
426  double freq_prefix = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(decoy_stats.all_prefix_occur);
427  double freq_prefix_in_proteins = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(decoy_stats.all_proteins_count);
428 
429  if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
430  {
431  if (prefix_suffix_counts.first != decoy_stats.all_prefix_occur)
432  {
433  OPENMS_LOG_WARN << "More than one decoy prefix observed!" << std::endl;
434  OPENMS_LOG_WARN << "Using most frequent decoy prefix (" << (int)(freq_prefix * 100) << "%)" << std::endl;
435  }
436 
437  return { true, decoy_stats.decoy_case_sensitive[case_insensitive_decoy_string], true};
438  }
439  }
440 
441  // Decoy suffix occurred at least 80% of all suffixes + observed in at least 40% of all proteins -> set it as suffix decoy
442  for (const auto& pair : decoy_stats.decoy_count)
443  {
444  const std::string& case_insensitive_decoy_string = pair.first;
445  const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
446  double freq_suffix = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(decoy_stats.all_suffix_occur);
447  double freq_suffix_in_proteins = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(decoy_stats.all_proteins_count);
448 
449  if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
450  {
451  if (prefix_suffix_counts.second != decoy_stats.all_suffix_occur)
452  {
453  OPENMS_LOG_WARN << "More than one decoy suffix observed!" << std::endl;
454  OPENMS_LOG_WARN << "Using most frequent decoy suffix (" << (int)(freq_suffix * 100) << "%)" << std::endl;
455  }
456 
457  return { true, decoy_stats.decoy_case_sensitive[case_insensitive_decoy_string], false};
458  }
459  }
460 
461  OPENMS_LOG_ERROR << "Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
462  return {false, "?", true};
463  }
464 
471  template<typename T>
473  {
474  // common decoy strings in FASTA files
475  // note: decoy prefixes/suffices must be provided in lower case
476 
477  DecoyStatistics ds;
478 
479  // setup regexes
480  const boost::regex pattern_prefix(regexstr_prefix);
481  const boost::regex pattern_suffix(regexstr_suffix);
482 
483  constexpr size_t PROTEIN_CACHE_SIZE = 4e5;
484 
485  while (true)
486  {
487  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
488  if (!proteins.activateCache()) break;
489 
490  auto prot_count = (SignedSize)proteins.chunkSize();
491  ds.all_proteins_count += prot_count;
492 
493  boost::smatch sm;
494  for (SignedSize i = 0; i < prot_count; ++i)
495  {
496  String seq = proteins.chunkAt(i).identifier;
497 
498  String seq_lower = seq;
499  seq_lower.toLower();
500 
501  // search for prefix
502  bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
503  if (found_prefix)
504  {
505  std::string match = sm[0];
506  ds.all_prefix_occur++;
507 
508  // increase count of observed prefix
509  ds.decoy_count[match].first++;
510 
511  // store observed (case sensitive and with special characters)
512  std::string seq_decoy = StringUtils::prefix(seq, match.length());
513  ds.decoy_case_sensitive[match] = seq_decoy;
514  }
515 
516  // search for suffix
517  bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
518  if (found_suffix)
519  {
520  std::string match = sm[0];
521  ds.all_suffix_occur++;
522 
523  // increase count of observed suffix
524  ds.decoy_count[match].second++;
525 
526  // store observed (case sensitive and with special characters)
527  std::string seq_decoy = StringUtils::suffix(seq, match.length());
528  ds.decoy_case_sensitive[match] = seq_decoy;
529  }
530  }
531  }
532  return ds;
533  }
534 
535 private:
536  using DecoyStringToAffixCount = std::unordered_map<std::string, std::pair<Size, Size>>;
537  using CaseInsensitiveToCaseSensitiveDecoy = std::unordered_map<std::string, std::string>;
538 };
539 
540 } // namespace OpenMS
541 
#define OPENMS_LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:480
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:470
#define OPENMS_LOG_ERROR
Macro to be used if non-fatal error are reported (processing continues)
Definition: LogStream.h:465
Helper class for calculations on decoy proteins.
Definition: FASTAContainer.h:344
std::unordered_map< std::string, std::pair< Size, Size > > DecoyStringToAffixCount
Definition: FASTAContainer.h:536
std::unordered_map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition: FASTAContainer.h:537
static Result findDecoyString(FASTAContainer< T > &proteins)
Heuristic to determine the decoy string given a set of protein names.
Definition: FASTAContainer.h:396
static const std::string regexstr_prefix
Definition: FASTAContainer.h:385
static DecoyStatistics countDecoys(FASTAContainer< T > &proteins)
Function to count the occurrences of decoy strings in a given set of protein names.
Definition: FASTAContainer.h:472
static const std::vector< std::string > affixes
Definition: FASTAContainer.h:382
static const std::string regexstr_suffix
Definition: FASTAContainer.h:386
Int overflow exception.
Definition: Exception.h:247
std::vector< FASTAFile::FASTAEntry > data_bg_
prefetched (background) data; will become the next active data
Definition: FASTAContainer.h:233
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos)
Retrieve a FASTA entry at global position pos (must not be behind the currently active chunk,...
Definition: FASTAContainer.h:182
std::string filename_
FASTA file name.
Definition: FASTAContainer.h:235
size_t size() const
NOT the number of entries in the FASTA file, but merely the number of already read entries (since we ...
Definition: FASTAContainer.h:224
bool empty()
is the FASTA file empty?
Definition: FASTAContainer.h:203
std::vector< std::streampos > offsets_
internal byte offsets into FASTA file for random access reading of previous entries.
Definition: FASTAContainer.h:231
bool activateCache()
Swaps in the background cache of entries, read previously via cacheChunk()
Definition: FASTAContainer.h:122
FASTAContainer(const String &FASTA_file)
C'tor with FASTA filename.
Definition: FASTAContainer.h:99
size_t chunk_offset_
number of entries before the current chunk
Definition: FASTAContainer.h:234
bool cacheChunk(int suggested_size)
Prefetch a new cache in the background, with up to suggested_size entries (or fewer upon reaching end...
Definition: FASTAContainer.h:136
FASTAFile f_
FASTA file connection.
Definition: FASTAContainer.h:230
std::vector< FASTAFile::FASTAEntry > data_fg_
active (foreground) data
Definition: FASTAContainer.h:232
size_t chunkSize() const
number of entries in active cache
Definition: FASTAContainer.h:152
void reset()
resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning
Definition: FASTAContainer.h:209
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
Retrieve a FASTA entry at cache position pos (fast)
Definition: FASTAContainer.h:164
size_t getChunkOffset() const
how many entries were read and got swapped out already
Definition: FASTAContainer.h:111
FASTAContainer(const std::vector< FASTAFile::FASTAEntry > &data)
C'tor for already existing data (by reference).
Definition: FASTAContainer.h:255
size_t size() const
calls size() on underlying vector
Definition: FASTAContainer.h:322
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos) const
fast access to an entry
Definition: FASTAContainer.h:309
bool empty() const
calls empty() on underlying vector
Definition: FASTAContainer.h:316
bool cacheChunk(int)
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:283
bool activateCache()
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:270
const std::vector< FASTAFile::FASTAEntry > & data_
reference to existing data
Definition: FASTAContainer.h:335
size_t chunkSize() const
active data spans the full range, i.e. size of container
Definition: FASTAContainer.h:297
void reset()
required for template parameters!
Definition: FASTAContainer.h:328
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
fast access to chunked (i.e. all) entries
Definition: FASTAContainer.h:303
size_t getChunkOffset() const
always 0, since this specialization requires/supports no chunking
Definition: FASTAContainer.h:261
This class serves for reading in and writing FASTA files If the protein/gene sequence contains unusua...
Definition: FASTAFile.h:61
A more convenient string class.
Definition: String.h:60
String & toLower()
Converts the string to lowercase.
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:156
static String prefix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:147
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:82
struct for intermediate results needed for calculations on decoy proteins
Definition: FASTAContainer.h:364
std::unordered_map< std::string, std::pair< Size, Size > > decoy_count
map decoys to counts of occurrences as prefix/suffix
Definition: FASTAContainer.h:365
Size all_proteins_count
number of all checked proteins
Definition: FASTAContainer.h:369
Size all_suffix_occur
number of proteins with found decoy_suffix
Definition: FASTAContainer.h:368
std::unordered_map< std::string, std::string > decoy_case_sensitive
map case insensitive strings back to original case (as used in fasta)
Definition: FASTAContainer.h:366
Size all_prefix_occur
number of proteins with found decoy_prefix
Definition: FASTAContainer.h:367
bool operator==(const DecoyStatistics &rhs) const
Definition: FASTAContainer.h:371
Definition: FASTAContainer.h:347
bool operator==(const Result &rhs) const
Definition: FASTAContainer.h:352
bool is_prefix
on success, was it a prefix or suffix
Definition: FASTAContainer.h:350
bool success
did more than 40% of proteins have the *same* prefix or suffix
Definition: FASTAContainer.h:348
String name
on success, what was the decoy string?
Definition: FASTAContainer.h:349
FASTA entry type (identifier, description and sequence) The first String corresponds to the identifie...
Definition: FASTAFile.h:72