NCBI C++ ToolKit
seqport_util.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJECTS_SEQ___SEQPORT_UTIL__HPP
2 #define OBJECTS_SEQ___SEQPORT_UTIL__HPP
3 
4 /* $Id: seqport_util.hpp 57555 2013-03-21 12:46:54Z vasilche $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Clifford Clausen
30  * (also reviewed/fixed/groomed by Denis Vakatov and Aaron Ucko)
31  *
32  * File Description:
33  */
34 #include <corelib/ncbi_limits.hpp>
35 #include <objects/seq/Seq_data.hpp>
37 #include <memory>
38 #include <vector>
39 
40 
42 BEGIN_objects_SCOPE
43 
44 
45 // CSeqportUtil is a wrapper for a hidden object of class
46 // CSeqportUtil_implementation.
47 
49 
50 
52 {
53 public:
54 
55  // TypeDefs
56  typedef unsigned int TIndex;
57  typedef pair<TIndex, TIndex> TPair;
58 
59  // Classes thrown as errors
60  struct NCBI_SEQ_EXPORT CBadIndex : public runtime_error
61  {
62  CBadIndex(TIndex idx, string method)
63  : runtime_error("CSeqportUtil::" + method +
64  " -- bad index specified: " + NStr::UIntToString(idx)) {}
65  };
66  struct NCBI_SEQ_EXPORT CBadSymbol : public runtime_error
67  {
68  CBadSymbol(string code, string method)
69  : runtime_error("CSeqportUtil::" + method +
70  " -- bad symbol specified: " + code) {}
71  };
72  struct NCBI_SEQ_EXPORT CBadType : public runtime_error
73  {
74  CBadType(string method)
75  : runtime_error("CSeqportUtil::" + method +
76  " -- specified code or code combination not supported") {}
77  };
78 
79  // Alphabet conversion function. Function returns the
80  // number of converted codes.
81  static TSeqPos Convert(const CSeq_data& in_seq,
82  CSeq_data* out_seq,
83  CSeq_data::E_Choice to_code,
84  TSeqPos uBeginIdx = 0,
85  TSeqPos uLength = 0,
86  bool bAmbig = false,
87  Uint4 seed = 17734276);
88 
89  // Alphabet conversion function with ambiguities in BLAST form, for DNA only.
90  // Target encoding is ncbi2na. Ambiguities are returned in a separate vector.
91  // This function can be called in a loop, in which the returned data length
92  // and ambiguity vector are accumulated.
93  // @param in_seq Input sequence [in]
94  // @param out_seq Output sequence [out]
95  // @param uBeginIdx Starting offset in input sequence [in]
96  // @param uLength Ending offset in input sequence [in]
97  // @param total_length Total length of sequences to be converted. The BLAST
98  // ambiguity encoding depends on whether total length is >= 1<<24.
99  // @param out_seq_length Number of bytes in the output sequence data.
100  // Incremented if != 0 on input. [in|out]
101  // @param blast_ambig Ambiguities encoded in BLAST form. The vector is
102  // appended, so this function can be called in a loop. [in|out]
103  // @return Number of converted codes.
104  static TSeqPos
105  ConvertWithBlastAmbig(const CSeq_data& in_seq,
106  CSeq_data* out_seq,
107  TSeqPos uBeginIdx,
108  TSeqPos uLength,
109  TSeqPos total_length,
110  TSeqPos* out_seq_length,
111  vector<Uint4>* blast_ambig);
112 
113  // Function to provide maximum in-place packing of na
114  // sequences without loss of information. Iupacna
115  // can always be packed to ncbi4na without loss. Iupacna
116  // can sometimes be packed to ncbi2na. Ncbi4na can
117  // sometimes be packed to ncbi2na. Returns number of
118  // residues packed. If in_seq cannot be packed, the
119  // original in_seq is returned unchanged and the return value
120  // from Pack is 0
121  static TSeqPos Pack(CSeq_data* in_seq,
123 
124  // Performs fast validation of CSeq_data. If all data in the
125  // sequence represent valid elements of a biological sequence, then
126  // FastValidate returns true. Otherwise it returns false
127  static bool FastValidate(const CSeq_data& in_seq,
128  TSeqPos uBeginIdx = 0,
129  TSeqPos uLength = 0);
130 
131  // Performs validation of CSeq_data. Returns a list of indices
132  // corresponding to data that does not represent a valid element
133  // of a biological sequence.
134  static void Validate(const CSeq_data& in_seq,
135  vector<TSeqPos>* badIdx,
136  TSeqPos uBeginIdx = 0,
137  TSeqPos uLength = 0);
138 
139  // Get ambiguous bases. out_indices returns
140  // the indices relative to in_seq of ambiguous bases.
141  // out_seq returns the ambiguous bases. Note, there are
142  // only ambiguous bases for iupacna->ncib2na and
143  // ncib4na->ncbi2na coversions.
144  static TSeqPos GetAmbigs(const CSeq_data& in_seq,
145  CSeq_data* out_seq,
146  vector<TSeqPos>* out_indices,
148  TSeqPos uBeginIdx = 0,
149  TSeqPos uLength = 0);
150 
151  // Get a copy of CSeq_data. No conversion is done. uBeginIdx of the
152  // biological sequence in in_seq will be in position
153  // 0 of out_seq. Usually, uLength bases will be copied
154  // from in_seq to out_seq. If uLength goes beyond the end of
155  // in_seq, it will be shortened to go to the end of in_seq.
156  // For packed sequence formats (ncbi2na and ncbi4na),
157  // only uLength bases are valid copies. For example,
158  // in an ncbi4na encoded sequence, if uLength is odd, the last
159  // sequence returned will be uLength+1 because 2 bases are encoded
160  // per byte in ncbi4na. However, in this case, uLength will be returned
161  // unchanged (it will remain odd unless it goes beyond the end
162  // of in_seq). If uLength=0, then a copy from uBeginIdx to the end
163  // of in_seq is returned.
164  static TSeqPos GetCopy(const CSeq_data& in_seq,
165  CSeq_data* out_seq,
166  TSeqPos uBeginIdx = 0,
167  TSeqPos uLength = 0);
168 
169  // Method to keep only a contiguous piece of a sequence beginning
170  // at uBeginIdx and uLength residues long. Does bit shifting as
171  // needed to put uBeginIdx of original sequence at position zero on output.
172  // Similar to GetCopy(), but done in place. Returns length of
173  // kept sequence.
174  static TSeqPos Keep(CSeq_data* in_seq,
175  TSeqPos uBeginIdx = 0,
176  TSeqPos uLength = 0);
177 
178  // Append in_seq2 to to end of in_seq1. Both in seqs must be
179  // in the same alphabet or this method will throw a runtime_error.
180  // The result of the append will be put into out_seq.
181  // For packed sequences ncbi2na and ncbi4na, Append will shift and
182  // append so as to remove any jaggedness at the append point.
183  static TSeqPos Append(CSeq_data* out_seq,
184  const CSeq_data& in_seq1,
185  TSeqPos uBeginIdx1,
186  TSeqPos uLength1,
187  const CSeq_data& in_seq2,
188  TSeqPos uBeginIdx2,
189  TSeqPos uLength2);
190 
191  // Create a biological complement of an na sequence.
192  // Attempts to complement an aa sequence will throw
193  // a runtime_error. Returns length of complemented sequence.
194 
195  // Complement the input sequence in place
196  static TSeqPos Complement(CSeq_data* in_seq,
197  TSeqPos uBeginIdx = 0,
198  TSeqPos uLength = 0);
199 
200  // Complement the input sequence and put the result in
201  // the output sequence
202  static TSeqPos Complement(const CSeq_data& in_seq,
203  CSeq_data* out_seq,
204  TSeqPos uBeginIdx = 0,
205  TSeqPos uLength = 0);
206 
207  // Create a biological sequence that is the reversse
208  // of an input sequence. Attempts to reverse an aa
209  // sequence will throw a runtime_error. Returns length of
210  // reversed sequence.
211 
212  // Reverse a sequence in place
213  static TSeqPos Reverse(CSeq_data* in_seq,
214  TSeqPos uBeginIdx = 0,
215  TSeqPos uLength = 0);
216 
217  // Reverse an input sequence and put result in output sequence.
218  // Reverses packed bytes as necessary.
219  static TSeqPos Reverse(const CSeq_data& in_seq,
220  CSeq_data* out_seq,
221  TSeqPos uBeginIdx = 0,
222  TSeqPos uLength = 0);
223 
224 
225  // Create the reverse complement of an input sequence. Attempts
226  // to reverse-complement an aa sequence will throw a
227  // runtime_error.
228 
229  // Reverse complement a sequence in place
230  static TSeqPos ReverseComplement(CSeq_data* in_seq,
231  TSeqPos uBeginIdx = 0,
232  TSeqPos uLength = 0);
233 
234  // Reverse complmenet a sequence and put result in output sequence
235  static TSeqPos ReverseComplement(const CSeq_data& in_seq,
236  CSeq_data* out_seq,
237  TSeqPos uBeginIdx = 0,
238  TSeqPos uLength = 0);
239 
240  // Given an Ncbistdaa input code index, returns the 3 letter Iupacaa3 code
241  // Implementation is efficient
242  static const string& GetIupacaa3(TIndex ncbistdaa);
243 
244  // Given a code type expressible as enum CSeq_data::E_Choice, returns
245  // true if the code type is available. See Seq_data_.hpp for definition
246  // of enum CSeq_data::E_Choice.
247  static bool IsCodeAvailable(CSeq_data::E_Choice code_type);
248 
249  // Same as above, but for code types expressible as enum ESeq_code_type.
250  // See Seq_code_type_.hpp for definition of enum ESeq_code_type.
251  static bool IsCodeAvailable(ESeq_code_type code_type);
252 
253  // Gets the first and last indices of a code type (e.g., for iupacna,
254  // the first index is 65 and the last index is 89). Throws CBadType
255  // if code_type is not available
256  static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type);
257 
258  // Same as above but takes code type expressed as an ESeq_code_type
259  static TPair GetCodeIndexFromTo(ESeq_code_type code_type);
260 
261  // Given an index for any code type expressible as a
262  // CSeq_data::E_Choice, returns the corresponding symbol (code)
263  // (e.g., for iupacna, idx 65 causes "A" to be returned)
264  // if the code type is available (e.g., ncbi8aa is not currently
265  // available). Use IsCodeAvailable() to find out which code types
266  // are available. Throws CBadType if code_type not available. Throws
267  // CBadIndex if idx out of range for code_type.
268  static const string& GetCode(CSeq_data::E_Choice code_type,
269  TIndex idx);
270 
271  // Similar to above, but works for all code types expressible as
272  // a ESeq_code_type (i.e., iupacaa3 is expressible as an
273  // ESeq_code_type but not as a CSeq_data::E_Choice)
274  static const string& GetCode(ESeq_code_type code_type,
275  TIndex idx);
276 
277  // Given an index for any code type expressible as a
278  // CSeq_data::E_Choice, returns the corresponding name
279  // (e.g., for iupacna, idx 65 causes "Adenine" to be returned)
280  // if the code type is available (e.g., ncbi8aa is not currently
281  // available). Use IsCodeAvailable() to find out which code types
282  // are available. Throws CBadType if code_type not available. Throws
283  // CBadIndex if idx out of range for code_type.
284  static const string& GetName(CSeq_data::E_Choice code_type,
285  TIndex idx);
286 
287  // Similar to above, but works for all code types expressible as
288  // a ESeq_code_type (i.e., iupacaa3 is expressible as an
289  // ESeq_code_type but not as a CSeq_data::E_Choice)
290  static const string& GetName(ESeq_code_type code_type,
291  TIndex idx);
292 
293  // Given a code (symbol) for any code type expressible as a
294  // CSeq_data::E_Choice, returns the corresponding index
295  // (e.g., for iupacna, symbol "A" causes 65 to be returned)
296  // if the code type is available (e.g., ncbi8aa is not currently
297  // available). Use IsCodeAvailable() to find out which code types
298  // are available.Throws CBadType if code_type not available. Throws
299  // CBadSymbol if code is not a valid symbol for code_type.
300  static TIndex GetIndex(CSeq_data::E_Choice code_type, const string& code);
301 
302  // Similar to above, but works for all code types expressible as
303  // a ESeq_code_type (i.e., iupacaa3 is expressible as an
304  // ESeq_code_type but not as a CSeq_data::E_Choice)
305  static TIndex GetIndex(ESeq_code_type code_type, const string& code);
306 
307  // Get the index that is the complement of the index for the
308  // input CSeq_data::E_Choice code type (e.g., for iupacna, the
309  // complement of index 66 is 86). Throws CBadType if complements for
310  // code_type not available. Throws CBadIndex if idx out of range for
311  // code_type
312  static TIndex GetIndexComplement(CSeq_data::E_Choice code_type,
313  TIndex idx);
314 
315  // Same as above, but for code type expressible as ESeq_code_type.
316  static TIndex GetIndexComplement(ESeq_code_type code_type,
317  TIndex idx);
318 
319  // Takes an index for a from_type code and returns the index for to_type
320  // (e.g., GetMapToIndex(CSeq_data::e_Iupacna, CSeq_data::e_Ncbi2na,
321  // 68) returns 2). Returns 255 if no map value exists for a legal
322  // index (e.g., GetMapToIndex(CSeq_data::e_Iupacna, CSeq_data::e_Ncbi2na,
323  // 69) returns 255). Throws CBadType if map for from_type to to_type not
324  // available. Throws CBadIndex if from_idx out of range for from_type.
325  static TIndex GetMapToIndex(CSeq_data::E_Choice from_type,
326  CSeq_data::E_Choice to_type,
327  TIndex from_idx);
328 
329  // Same as above, but uses ESeq_code_type.
330  static TIndex GetMapToIndex(ESeq_code_type from_type,
331  ESeq_code_type to_type,
332  TIndex from_idx);
333 
334 private:
335 
336  // we maintain a singleton internally
337  // the singleton can be retrieved by calling x_GetImplementation()
338  static CSeqportUtil_implementation& x_GetImplementation (void);
339 };
340 
341 
342 END_objects_SCOPE
344 
345 #endif /* OBJECTS_SEQ___SEQPORT_UTIL__HPP */
ENa_strand Reverse(ENa_strand s)
Definition: Na_strand.hpp:90
User-defined methods of the data storage class.
unsigned int TIndex
pair< TIndex, TIndex > TPair
NStr –.
Definition: ncbistr.hpp:243
TResidue Complement(TResidue c)
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_SEQ_EXPORT
Definition: ncbi_export.h:825
ESeq_code_type
sequence representations
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
CMapControl::TIndex TIndex
static string Pack(const string &s)
T max(T x_, T y_)
CBadIndex(TIndex idx, string method)
CBadSymbol(string code, string method)
CBadType(string method)
Template structure SStaticPair is simlified replacement of STL pair<> Main reason of introducing this...
Definition: static_set.hpp:60
Definition: inftrees.h:24
static int seed
Definition: test_table.cpp:132
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)
Modified on Wed Jul 24 17:20:44 2024 by modify_doxy.py rev. 669887