NCBI C++ ToolKit
sparse_aln.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_ALNMGR___SPARSE_ALN__HPP
2 #define OBJTOOLS_ALNMGR___SPARSE_ALN__HPP
3 /* $Id: sparse_aln.hpp 75420 2016-11-16 14:17:08Z grichenk $
4  * ===========================================================================
5  *
6  * PUBLIC DOMAIN NOTICE
7  * National Center for Biotechnology Information
8  *
9  * This software/database is a "United States Government Work" under the
10  * terms of the United States Copyright Act. It was written as part of
11  * the author's official duties as a United States Government employee and
12  * thus cannot be copyrighted. This software/database is freely available
13  * to the public for use. The National Library of Medicine and the U.S.
14  * Government have not placed any restriction on its use or reproduction.
15  *
16  * Although all reasonable efforts have been taken to ensure the accuracy
17  * and reliability of the software and data, the NLM and the U.S.
18  * Government do not and cannot warrant the performance or results that
19  * may be obtained by using this software or data. The NLM and the U.S.
20  * Government disclaim all warranties, express or implied, including
21  * warranties of performance, merchantability or fitness for any particular
22  * purpose.
23  *
24  * Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors: Andrey Yazhuk
29  *
30  * File Description:
31  *
32  */
33 
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbiobj.hpp>
36 
37 #include <util/align_range.hpp>
39 
40 #include <objmgr/scope.hpp>
41 
44 
45 
47 
48 
49 /// Sparse alignment
51 {
52 public:
56  typedef CAnchoredAln::TDim TDim; ///< Synonym of TNumrow
57 
58  /// Constructor
59  /// @param anchored_aln
60  /// Input CAnchoredAln object. Should be built using BuildAln function
61  /// for the alignment coordinates to be correct.
62  /// @param scope
63  /// CScope used to fetch sequence data.
64  /// @sa BuildAln
65  CSparseAln(const CAnchoredAln& anchored_aln,
66  objects::CScope& scope);
67 
68  /// Destructor
69  virtual ~CSparseAln(void);
70 
71  /// Gap character modifier
72  void SetGapChar(TResidue gap_char);
73 
74  /// Scope accessor
75  CRef<objects::CScope> GetScope(void) const;
76 
77  /// Alignment dimension (number of sequence rows in the alignment)
78  TDim GetDim(void) const;
79  /// Synonym of the above
80  TNumrow GetNumRows(void) const { return GetDim(); }
81 
82  /// Get seq-id for the row.
83  const objects::CSeq_id& GetSeqId(TNumrow row) const;
84 
85  /// Get whole alignment range.
86  TRng GetAlnRange(void) const;
87 
88  /// Get pairwise alignment for the row.
89  const TAlnRngColl& GetAlignCollection(TNumrow row);
90 
91  /// Check if anchor is set - always true for sparse alignments.
92  bool IsSetAnchor(void) const { return true; }
93 
94  /// Get anchor row index.
95  TNumrow GetAnchor(void) const
96  {
97  return m_Aln->GetAnchorRow();
98  }
99 
100  /// Get sequence range in alignment coords (strand ignored).
101  TSignedRange GetSeqAlnRange(TNumrow row) const;
102  TSignedSeqPos GetSeqAlnStart(TNumrow row) const;
103  TSignedSeqPos GetSeqAlnStop(TNumrow row) const;
104 
105  /// Get sequence range in sequence coords.
106  TRange GetSeqRange(TNumrow row) const;
107  TSeqPos GetSeqStart(TNumrow row) const;
108  TSeqPos GetSeqStop(TNumrow row) const;
109 
110  /// Check direction of the row.
111  bool IsPositiveStrand(TNumrow row) const;
112  bool IsNegativeStrand(TNumrow row) const;
113 
114  /// Map sequence position to alignment coordinates.
115  /// @param row
116  /// Alignment row where the input position is defined.
117  /// @param seq_pos
118  /// Input position
119  /// @param dir
120  /// In case the input position can not be mapped to the alignment
121  /// coordinates (e.g. the position is inside an unaligned range),
122  /// try to search for the nearest alignment position in the specified
123  /// direction.
124  /// @param try_reverse_dir
125  /// Not implemented
126  TSignedSeqPos GetAlnPosFromSeqPos(TNumrow row, TSeqPos seq_pos,
127  ESearchDirection dir = eNone,
128  bool try_reverse_dir = true) const;
129  TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos,
130  ESearchDirection dir = eNone,
131  bool try_reverse_dir = true) const;
132 
134  /// Get sequence coding for nucleotides.
135  TCoding GetNaCoding(void) const { return m_NaCoding; }
136  /// Get sequence coding for proteins.
137  TCoding GetAaCoding(void) const { return m_AaCoding; }
138  /// Set sequence coding for nucleotides. If not set, Iupacna coding is used.
139  void SetNaCoding(TCoding coding) { m_NaCoding = coding; }
140  /// Set sequence coding for proteins. If not set, Iupacaa coding is used.
141  void SetAaCoding(TCoding coding) { m_AaCoding = coding; }
142 
143  /// Fetch sequence data for the given row and range.
144  /// @param row
145  /// Alignment row to fetch sequence for.
146  /// @param buffer
147  /// Output buffer.
148  /// @param seq_from
149  /// Start sequence position.
150  /// @param seq_to
151  /// End sequence position.
152  /// @param force_translation
153  /// Force nucleotide to protein sequence translation.
154  /// @return
155  /// Reference to the output buffer.
156  string& GetSeqString(TNumrow row,
157  string& buffer,
158  TSeqPos seq_from,
159  TSeqPos seq_to,
160  bool force_translation = false) const;
161 
162  /// Fetch sequence data for the given row and range.
163  /// @param row
164  /// Alignment row to fetch sequence for.
165  /// @param buffer
166  /// Output buffer.
167  /// @param rq_seq_rng
168  /// Sequence range.
169  /// @param force_translation
170  /// Force nucleotide to protein sequence translation.
171  /// @return
172  /// Reference to the output buffer.
173  string& GetSeqString(TNumrow row,
174  string& buffer,
175  const TRange& rq_seq_rng,
176  bool force_translation = false) const;
177 
178  /// Fetch alignment sequence data. Unaligned ranges of the selected row
179  /// are filled with gap char.
180  /// @param row
181  /// Alignment row to fetch sequence for.
182  /// @param buffer
183  /// Output buffer.
184  /// @param rq_aln_rng
185  /// Alignment range.
186  /// @param force_translation
187  /// Force nucleotide to protein sequence translation.
188  /// @return
189  /// Reference to the output buffer.
190  string& GetAlnSeqString(TNumrow row,
191  string& buffer,
192  const TSignedRange& rq_aln_rng,
193  bool force_translation = false) const;
194 
195  /// Get bioseq handle for the row. Throw exception if the handle can not be
196  /// obtained.
197  const objects::CBioseq_Handle& GetBioseqHandle(TNumrow row) const;
198 
199  /// Create segment iterator.
200  /// @param row
201  /// Row to iterate segments for.
202  /// @param range
203  /// Range to iterate.
204  /// @param flags
205  /// Iterator flags.
206  /// @sa CSparse_CI
207  /// @sa IAlnSegmentIterator
208  virtual IAlnSegmentIterator*
209  CreateSegmentIterator(TNumrow row,
210  const TSignedRange& range,
212 
213  /// Whether the alignment is translated (heterogeneous), e.g. nuc-prot.
214  bool IsTranslated(void) const;
215 
216  enum EConstants {
217  kDefaultGenCode = 1
218  };
219 
220  // Static utilities:
221  static void TranslateNAToAA(const string& na, string& aa,
222  int gen_code = kDefaultGenCode); //< per http://www.ncbi.nlm.nih.gov/collab/FT/#7.5.5
223 
224  /// Get base width for the sequence (1 for nucleotides, 3 for proteins).
226  {
227  _ASSERT(row >= 0 && row < GetDim());
228  int w = m_Aln->GetPairwiseAlns()[row]->GetSecondBaseWidth();
229  _ASSERT(w == 1 || w == 3);
230  return w;
231  }
232 
233  /// Convert alignment (genomic) coordinate on the selected row to real
234  /// sequence position.
236  {
237  return aln_pos/GetBaseWidth(row);
238  }
239 
240  /// For protein sequences get frame for the specified coordinate.
241  /// For genomic sequences always returns 0.
243  {
244  int w = GetBaseWidth(row);
245  return (w == 3) ? aln_pos % 3 + 1 : 0;
246  }
247 
248  /// Convert sequence position to alignment (genomic) coordinate.
249  /// Optional frame can be used with protein positions.
251  TSignedSeqPos seq_pos,
252  int frame = 0) const
253  {
254  int w = GetBaseWidth(row);
255  TSignedSeqPos ret = seq_pos*w;
256  if (w == 3 && frame) ret += frame - 1;
257  return ret;
258  }
259 
260  /// Convert alignment range (genomic coordinates) on the selected row
261  /// to real sequence range. Empty and whole ranges are returned unchanged.
262  /// NOTE: Need to use template since there are many range types:
263  /// TRng, TAlnRng, TRange, TSignedRange etc.
264  template<class _TRange>
265  _TRange AlnRangeToNativeSeqRange(TNumrow row, _TRange aln_range) const
266  {
267  if (aln_range.Empty() || aln_range.IsWhole()) return aln_range; // nothing to scale
268  int w = GetBaseWidth(row); // base width of the selected row
269  return _TRange(aln_range.GetFrom()/w, aln_range.GetToOpen()/w - 1); // integer-divide both ends by the base width; the open end is turned back into a closed one
270  }
271 
272  /// Get start and stop frames (pair: first = start, second = stop) for the selected row/range.
273  /// 0 - no frame (native coordinates are genomic)
274  /// 1..3 - frame value for protein coordinates
275  typedef pair<int, int> TFrames;
276  template<class _TRange>
277  TFrames AlnRangeToNativeFrames(TNumrow row, _TRange aln_range) const
278  {
279  if (aln_range.Empty() || aln_range.IsWhole()) return TFrames(0, 0); // empty or whole range: no frames
280  int w = GetBaseWidth(row);
281  if (w == 1) return TFrames(0, 0); // base width 1: no frames
282  return TFrames(aln_range.GetFrom() % w + 1, aln_range.GetTo() % w + 1); // 1-based offset of each (closed) end within its base-width unit
283  }
284 
285  /// Convert sequence range to alignment range (genomic coordinates).
286  /// Optional frames argument can be provided for protein ranges.
287  /// NOTE: Need to use template since there are many range types:
288  /// TRng, TAlnRng, TRange, TSignedRange etc.
289  template<class _TRange>
291  _TRange seq_range,
292  TFrames frames = TFrames(0, 0)) const
293  {
294  if (seq_range.Empty() || seq_range.IsWhole()) return seq_range;
295  int w = GetBaseWidth(row);
296  int from_frame = frames.first ? frames.first - 1 : 0;
297  int to_frame = frames.second ? frames.second - 1 : 0;
298  return _TRange(seq_range.GetFrom()*w + from_frame, seq_range.GetToOpen()*w + to_frame - 1);
299  }
300 
301 protected:
302  friend class CSparse_CI;
303 
304  void x_Build(const CAnchoredAln& src_align);
305  CSeqVector& x_GetSeqVector(TNumrow row) const;
306  int x_GetGenCode(TNumrow row) const;
307 
309 
312  TRng m_FirstRange; // the extent of all segments in aln coords
313  vector<TRng> m_SecondRanges;
315  mutable vector<objects::CBioseq_Handle> m_BioseqHandles;
316  mutable vector<CRef<CSeqVector> > m_SeqVectors;
317 
320 
322 };
323 
324 
326 
327 #endif // OBJTOOLS_ALNMGR___SPARSE_ALN__HPP
class CAlignRangeCollectionList<TAlignRange> represents a sorted collection of TAlignRange.
CAlignRange Represents an element of pairwise alignment of two sequences.
Definition: align_range.hpp:63
Query-anchored alignment can be 2 or multi-dimensional.
vector< CRef< CPairwiseAln > > TPairwiseAlnVector
CObject –.
Definition: ncbiobj.hpp:180
CSeqVector –.
Definition: seq_vector.hpp:65
Sparse alignment.
Definition: sparse_aln.hpp:51
CPairwiseAln::TAlnRng TAlnRng
Definition: sparse_aln.hpp:54
CSeq_data::E_Choice TCoding
Definition: sparse_aln.hpp:133
bool IsSetAnchor(void) const
Check if anchor is set - always true for sparse alignments.
Definition: sparse_aln.hpp:92
bool m_AnchorDirect
Definition: sparse_aln.hpp:321
TNumrow GetNumRows(void) const
Synonym of the above.
Definition: sparse_aln.hpp:80
CAnchoredAln::TPairwiseAlnVector TPairwiseAlnVector
Definition: sparse_aln.hpp:308
int GetBaseWidth(TNumrow row) const
Get base width for the sequence (1 for nucleotides, 3 for proteins).
Definition: sparse_aln.hpp:225
TResidue m_GapChar
Definition: sparse_aln.hpp:314
TCoding GetNaCoding(void) const
Get sequence coding for nucleotides.
Definition: sparse_aln.hpp:135
pair< int, int > TFrames
Get start and stop frames for the selected row/range.
Definition: sparse_aln.hpp:275
CAnchoredAln::TDim TDim
Synonym of TNumrow.
Definition: sparse_aln.hpp:56
CPairwiseAln::TAlnRngColl TAlnRngColl
Definition: sparse_aln.hpp:55
CRef< CAnchoredAln > m_Aln
Definition: sparse_aln.hpp:310
_TRange AlnRangeToNativeSeqRange(TNumrow row, _TRange aln_range) const
Convert alignment range (genomic coordinates) on the selected row to real sequence range.
Definition: sparse_aln.hpp:265
TCoding GetAaCoding(void) const
Get sequence coding for proteins.
Definition: sparse_aln.hpp:137
TSignedSeqPos AlnPosToNativeSeqPos(TNumrow row, TSignedSeqPos aln_pos) const
Convert alignment (genomic) coordinate on the selected row to real sequence position.
Definition: sparse_aln.hpp:235
CPairwiseAln::TRng TRng
Definition: sparse_aln.hpp:53
void SetNaCoding(TCoding coding)
Set sequence coding for nucleotides. If not set, Iupacna coding is used.
Definition: sparse_aln.hpp:139
TFrames AlnRangeToNativeFrames(TNumrow row, _TRange aln_range) const
Definition: sparse_aln.hpp:277
vector< objects::CBioseq_Handle > m_BioseqHandles
Definition: sparse_aln.hpp:315
vector< TRng > m_SecondRanges
Definition: sparse_aln.hpp:313
TCoding m_AaCoding
Definition: sparse_aln.hpp:319
void SetAaCoding(TCoding coding)
Set sequence coding for proteins. If not set, Iupacaa coding is used.
Definition: sparse_aln.hpp:141
TCoding m_NaCoding
Definition: sparse_aln.hpp:318
CRef< objects::CScope > m_Scope
Definition: sparse_aln.hpp:311
TNumrow GetAnchor(void) const
Get anchor row index.
Definition: sparse_aln.hpp:95
_TRange NativeSeqRangeToAlnRange(TNumrow row, _TRange seq_range, TFrames frames=TFrames(0, 0)) const
Convert sequence range to alignment range (genomic coordinates).
Definition: sparse_aln.hpp:290
int AlnPosToNativeFrame(TNumrow row, TSignedSeqPos aln_pos) const
For protein sequences get frame for the specified coordinate.
Definition: sparse_aln.hpp:242
vector< CRef< CSeqVector > > m_SeqVectors
Definition: sparse_aln.hpp:316
TRng m_FirstRange
Definition: sparse_aln.hpp:312
TSignedSeqPos NativeSeqPosToAlnPos(TNumrow row, TSignedSeqPos seq_pos, int frame=0) const
Convert sequence position to alignment (genomic) coordinate.
Definition: sparse_aln.hpp:250
Implementation of IAlnSegmentIterator for CSparseAln.
Definition: sparse_ci.hpp:73
Alignment explorer interface.
objects::CSeqVector::TResidue TResidue
Alignment segment iterator interface.
EFlags
Iterator options.
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XALNMGR_EXPORT
Definition: ncbi_export.h:1065
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
range(_Ty, _Ty) -> range< _Ty >
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
static pcre_uint8 * buffer
Definition: pcretest.c:1051
static bool GetSeqId(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
#define row(bind, expected)
Definition: string_bind.c:73
#define _ASSERT
CScope & GetScope()
Modified on Sun Jun 23 05:15:03 2024 by modify_doxy.py rev. 669887