OpenMS
EnzymaticDigestion.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow, Xiao Liang $
32 // $Authors: Marc Sturm, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
38 #include <OpenMS/CONCEPT/Types.h>
39 #include <boost/regex_fwd.hpp> // forward declaration of boost::regex
40 #include <functional> // for std::function
41 #include <memory> // unique_ptr
42 #include <string>
43 #include <vector>
44 
45 namespace OpenMS
46 {
47  class StringView;
48 
63  class OPENMS_DLLAPI EnzymaticDigestion // Class for the enzymatic digestion of sequences.
64  {
65  public: // NOTE(review): the `enum Specificity` header that opens the brace on the next line is not visible in this listing; the enumerators below belong to that enum (it governs how the specificity of the two peptide ends is treated when querying for valid digestion products).
68  { // note: the value of the first three items is important, since some engines just report the number of required termini (0, 1, 2)
69  SPEC_NONE = 0, // value doubles as "number of required termini" (0), see note above
70  SPEC_SEMI = 1, // value doubles as "number of required termini" (1), see note above
71  SPEC_FULL = 2, // value doubles as "number of required termini" (2), see note above
72  SPEC_UNKNOWN = 3,
73  SPEC_NOCTERM = 8, // NOTE(review): exact semantics not documented in this listing; do not infer from the name alone
74  SPEC_NONTERM = 9, // NOTE(review): exact semantics not documented in this listing; do not infer from the name alone
75  SIZE_OF_SPECIFICITY = 10 // not a specificity itself: used as the array bound of NamesOfSpecificity below
76  };
78  static const std::string NamesOfSpecificity[SIZE_OF_SPECIFICITY]; // 10 slots although the enumerator values are sparse (0-3, 8, 9); presumably indexed by Specificity value -- confirm against the definition
79 
81  static const std::string NoCleavage; // Name for no cleavage.
82 
84  static const std::string UnspecificCleavage; // Name for unspecific cleavage.
85 
88 // NOTE(review): the documentation index for this class also lists EnzymaticDigestion() (default constructor), the copy constructor, operator=, the virtual destructor, getMissedCleavages(), getEnzymeName(), getSpecificity(), setSpecificity() and getSpecificityByName(); their declarations are not visible in this listing.
91 
94 
97 
100 
102  void setMissedCleavages(Size missed_cleavages); // Sets the number of missed cleavages for the digestion (default is 0).
103 
106 
108  virtual void setEnzyme(const DigestionEnzyme* enzyme); // Sets the enzyme for the digestion. Virtual, so subclasses may override it.
109 
112 
115 
119 
131  Size digestUnmodified(const StringView& sequence, std::vector<StringView>& output, Size min_length = 1, Size max_length = 0) const; // Performs the enzymatic digestion of an unmodified sequence; products are written to `output` as StringViews. NOTE(review): meaning of the returned Size and of max_length == 0 is not shown here -- confirm.
132 
146  Size digestUnmodified(const StringView& sequence, std::vector<std::pair<Size, Size>>& output, Size min_length = 1, Size max_length = 0) const; // Same digestion, but products are written as pairs of Size instead of StringViews. NOTE(review): presumably positional information into `sequence`; the meaning of the two pair members is not shown here -- confirm.
147 
159  bool isValidProduct(const String& protein, int pep_pos, int pep_length, bool ignore_missed_cleavages = true) const; // Asks whether the peptide fragment starting at position pep_pos with length pep_length within `protein` is a valid product. Missed cleavages are ignored by default.
160 
166  Size countInternalCleavageSites(const String& sequence) const; // Counts the number of internal cleavage sites (missed cleavages) in a protein sequence.
167 
175  bool filterByMissedCleavages(const String& sequence, const std::function<bool(const Int)>& filter) const; // Filter based on the number of missed cleavages; `filter` is a caller-supplied predicate taking an Int. NOTE(review): presumably it receives the missed-cleavage count of `sequence` -- confirm.
176 
177  protected:
184  bool isValidProduct_(const String& sequence, // Shared implementation that also supports functionality for ProteaseDigestion (hence the two extra allow_* switches).
185  int pos,
186  int length,
187  bool ignore_missed_cleavages,
188  bool allow_nterm_protein_cleavage,
189  bool allow_random_asp_pro_cleavage) const;
205  std::vector<int> tokenize_(const String& sequence, int start = 0, int end = -1) const; // Digests the sequence using the enzyme's regular expression. NOTE(review): presumably returns fragment/cleavage positions and end == -1 means "to the end of the sequence" -- confirm.
206 
215  Size digestAfterTokenize_(const std::vector<int>& fragment_positions, const StringView& sequence, std::vector<StringView>& output, Size min_length = 0, Size max_length = -1) const; // Helper function for digestUnmodified() (StringView output). Size is size_t, so the default -1 converts to the largest Size value; note the defaults differ from digestUnmodified() (1 and 0).
216  Size digestAfterTokenize_(const std::vector<int>& fragment_positions, const StringView& sequence, std::vector<std::pair<Size, Size>>& output, Size min_length = 0, Size max_length = -1) const; // Helper function for digestUnmodified() (pair output); same default-argument caveat as the overload above.
217 
226  Size countMissedCleavages_(const std::vector<int>& cleavage_positions, Size seq_start, Size seq_end) const; // Counts the number of missed cleavages in a sequence fragment delimited by seq_start / seq_end, given the cleavage positions.
227 
230 // NOTE(review): data members not visible in this listing but referenced by the documentation index: `Size missed_cleavages_` (number of missed cleavages, listing line 229) and `const DigestionEnzyme* enzyme_` (used enzyme, listing line 232).
234  std::unique_ptr<boost::regex> re_; // Regex for tokenizing; kept as a member rather than a stack object in tokenize_() for speed. Held through a pointer (PImpl) with only boost/regex_fwd.hpp included, since #include cost is huge
235 // NOTE(review): `Specificity specificity_` (specificity of enzyme, listing line 237) is also not visible in this listing.
238  };
239 
240 } // namespace OpenMS
Base class for digestion enzymes.
Definition: DigestionEnzyme.h:53
Class for the enzymatic digestion of sequences.
Definition: EnzymaticDigestion.h:64
bool isValidProduct(const String &protein, int pep_pos, int pep_length, bool ignore_missed_cleavages=true) const
Is the peptide fragment starting at position pep_pos with length pep_length within the sequence protein generated by the current enzyme?
bool isValidProduct_(const String &sequence, int pos, int length, bool ignore_missed_cleavages, bool allow_nterm_protein_cleavage, bool allow_random_asp_pro_cleavage) const
Supports functionality for ProteaseDigestion as well (which is deeply woven into the function). To avoid code duplication, this is stored here and called by wrappers.
Specificity specificity_
specificity of enzyme
Definition: EnzymaticDigestion.h:237
EnzymaticDigestion & operator=(const EnzymaticDigestion &rhs)
Assignment operator.
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide ends is considered important
Definition: EnzymaticDigestion.h:68
Size digestAfterTokenize_(const std::vector< int > &fragment_positions, const StringView &sequence, std::vector< std::pair< Size, Size >> &output, Size min_length=0, Size max_length=-1) const
Specificity getSpecificity() const
Returns the specificity for the digestion.
static Specificity getSpecificityByName(const String &name)
Size digestUnmodified(const StringView &sequence, std::vector< std::pair< Size, Size >> &output, Size min_length=1, Size max_length=0) const
Performs the enzymatic digestion of an unmodified sequence.
Size missed_cleavages_
Number of missed cleavages.
Definition: EnzymaticDigestion.h:229
Size countInternalCleavageSites(const String &sequence) const
Counts the number of internal cleavage sites (missed cleavages) in a protein sequence.
Size countMissedCleavages_(const std::vector< int > &cleavage_positions, Size seq_start, Size seq_end) const
Counts the number of missed cleavages in a sequence fragment.
void setMissedCleavages(Size missed_cleavages)
Sets the number of missed cleavages for the digestion (default is 0). This setting is ignored when log model is used.
Size digestAfterTokenize_(const std::vector< int > &fragment_positions, const StringView &sequence, std::vector< StringView > &output, Size min_length=0, Size max_length=-1) const
Helper function for digestUnmodified()
virtual ~EnzymaticDigestion()
Destructor.
EnzymaticDigestion()
Default constructor.
static const std::string UnspecificCleavage
Name for unspecific cleavage.
Definition: EnzymaticDigestion.h:84
bool filterByMissedCleavages(const String &sequence, const std::function< bool(const Int)> &filter) const
Filter based on the number of missed cleavages.
String getEnzymeName() const
Returns the enzyme for the digestion.
std::unique_ptr< boost::regex > re_
Regex for tokenizing (huge speedup by making this a member instead of stack object in tokenize_())
Definition: EnzymaticDigestion.h:234
std::vector< int > tokenize_(const String &sequence, int start=0, int end=-1) const
Digests the sequence using the enzyme's regular expression.
const DigestionEnzyme * enzyme_
Used enzyme.
Definition: EnzymaticDigestion.h:232
static const std::string NoCleavage
Name for no cleavage.
Definition: EnzymaticDigestion.h:81
virtual void setEnzyme(const DigestionEnzyme *enzyme)
Sets the enzyme for the digestion.
void setSpecificity(Specificity spec)
Sets the specificity for the digestion (default is SPEC_FULL).
EnzymaticDigestion(const EnzymaticDigestion &rhs)
Copy constructor.
Size digestUnmodified(const StringView &sequence, std::vector< StringView > &output, Size min_length=1, Size max_length=0) const
Performs the enzymatic digestion of an unmodified sequence.
Size getMissedCleavages() const
Returns the number of missed cleavages for the digestion.
StringView provides a non-owning view on an existing string.
Definition: StringView.h:56
A more convenient string class.
Definition: String.h:60
int Int
Signed integer type.
Definition: Types.h:102
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48