NCBI C++ ToolKit
seq_match.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_match.hpp 33815 2007-05-04 17:18:18Z kazimird $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Josh Cherry
27  *
28  * File Description: Simple pattern matching for sequences
29  *
30  */
31 
32 
33 #ifndef GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
34 #define GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
35 
36 #include <corelib/ncbistd.hpp>
37 
39 
40 ///
41 /// This class provides functions for determining where
42 /// sequences, perhaps containing ambiguity codes,
43 /// must or can match patterns.
44 ///
45 /// The functions are highly templatized. The main
46 /// reason is to allow reuse of pattern-matching code with
47 /// different 'alphabets'. A second reason is to allow use
48 /// of different container classes for sequences and patterns,
49 /// e.g., the ncbi8na matching will work with both
50 /// string and vector<char>.
51 /// This would be much nicer with template template parameters,
52 /// but MSVC doesn't support them.
53 ///
54 
55 class CSeqMatch
56 {
57 public:
58  enum EMatch {
59  eNo,
61  eMaybe
62  };
63  /// determine whether ncbi8na base s is a match to q.
64  static EMatch CompareNcbi8na(char s, char q);
65  /// determine whether seq matches pattern pat starting at position pos.
66  ///
67  /// It is the caller's responsibility to ensure that there are
68  /// enough 'characters' in seq, i.e., that pos + pat.size() <= seq.size()
69  template<class Seq, class Pat, class Compare_fun>
70  static EMatch Match(const Seq& seq, const Pat& pat, TSeqPos pos,
71  const Compare_fun compare_fun)
72  {
73 
74  // for efficiency, no check is made that we're not looking
75  // past the end of seq; caller must assure this
76 
77  EMatch rv = eYes;
78  // check pattern positions in succession
79  for (unsigned int i = 0; i < pat.size(); i++) {
80  EMatch res = compare_fun(seq[pos+i], pat[i]);
81  if (res == eNo) {
82  return eNo;
83  }
84  if (res == eMaybe) {
85  rv = eMaybe;
86  }
87  }
88  // if we got here, everybody at least could have matched
89  return rv;
90  }
91 
92  template<class Seq, class Pat>
93  static EMatch MatchNcbi8na(const Seq& seq,
94  const Pat& pat, TSeqPos pos)
95  {
96  return Match(seq, pat, pos, CompareNcbi8na);
97  }
98 
99  template <class Seq, class Pat>
101  {
102  EMatch operator() (const Seq& seq,
103  const Pat& pat, TSeqPos pos) const
104  {
105  return CSeqMatch::Match(seq, pat, pos, CompareNcbi8na);
106  }
107  };
108 
109  /// find all places where seq must or might match pat
110  template<class Seq, class Pat, class Match_fun>
111  static void FindMatches(const Seq& seq,
112  const Pat& pat,
113  vector<TSeqPos>& definite_matches,
114  vector<TSeqPos>& possible_matches,
115  Match_fun match)
116  {
117  for (unsigned int i = 0; i < seq.size() - pat.size() + 1; i++) {
118  EMatch res = match(seq, pat, i);
119  if (res == eNo) {
120  continue;
121  }
122  if (res == eYes) {
123  definite_matches.push_back(i);
124  continue;
125  }
126  // otherwise must be eMaybe
127  possible_matches.push_back(i);
128  }
129  }
130 
131  template<class Seq, class Pat>
132  static void FindMatchesNcbi8na(const Seq& seq,
133  const Pat& pat,
134  vector<TSeqPos>& definite_matches,
135  vector<TSeqPos>& possible_matches)
136  {
137  FindMatches(seq, pat,
138  definite_matches, possible_matches,
140  }
141 
142  /// stuff for dealing with ncbi8na.
143  /// doesn't really belong here, but oh well
144 
145  /// convert a single base from IUPAC to ncbi8na
146  NCBI_XALGOSEQ_EXPORT static char IupacToNcbi8na(char in);
147  /// convert a whole string from IUPAC to ncbi8na
148  NCBI_XALGOSEQ_EXPORT static void IupacToNcbi8na(const string& in, string& out);
149  /// complement an ncbi8na sequence in place
150  NCBI_XALGOSEQ_EXPORT static void CompNcbi8na(string& seq8na);
151  /// complement a single ncbi8na base
152  NCBI_XALGOSEQ_EXPORT static char CompNcbi8na(char);
153 };
154 
155 
156 
157 
158 
159 // works on ncbi8na
160 // s can match q iff they have some set bits in common
161 // s must match q iff it represents a subset,
162 // i.e., if no bits set in s are unset in q
163 inline
165 {
166  if (!(s & q)) {
167  // nothing in common
168  return eNo;
169  }
170  if (s & ~q) {
171  return eMaybe;
172  }
173  return eYes;
174 }
175 
176 
178 
179 #endif // GUI_CORE_ALGO_BASIC___SEQ_MATCH__HPP
This class provides functions for determining where sequences, perhaps containing ambiguity codes,...
Definition: seq_match.hpp:56
static void FindMatchesNcbi8na(const Seq &seq, const Pat &pat, vector< TSeqPos > &definite_matches, vector< TSeqPos > &possible_matches)
Definition: seq_match.hpp:132
static char IupacToNcbi8na(char in)
stuff for dealing with ncbi8na.
Definition: seq_match.cpp:43
static EMatch MatchNcbi8na(const Seq &seq, const Pat &pat, TSeqPos pos)
Definition: seq_match.hpp:93
static void CompNcbi8na(string &seq8na)
complement an ncbi8na sequence in place
Definition: seq_match.cpp:121
static EMatch CompareNcbi8na(char s, char q)
determine whether ncbi8na base s is a match to q.
Definition: seq_match.hpp:164
static void FindMatches(const Seq &seq, const Pat &pat, vector< TSeqPos > &definite_matches, vector< TSeqPos > &possible_matches, Match_fun match)
find all places where seq must or might match pat
Definition: seq_match.hpp:111
static EMatch Match(const Seq &seq, const Pat &pat, TSeqPos pos, const Compare_fun compare_fun)
determine whether seq matches pattern pat starting at position pos.
Definition: seq_match.hpp:70
Include a standard set of the NCBI C++ Toolkit most basic headers.
std::ofstream out("events_result.xml")
main entry point for tests
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XALGOSEQ_EXPORT
Definition: ncbi_export.h:1017
int i
std::istream & in(std::istream &in_, double &x_)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
EMatch operator()(const Seq &seq, const Pat &pat, TSeqPos pos) const
Definition: seq_match.hpp:102
Modified on Mon May 20 05:03:02 2024 by modify_doxy.py rev. 669887