NCBI C++ ToolKit
sparse_aln.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sparse_aln.cpp 100587 2023-08-14 14:22:17Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrey Yazhuk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
37 #include <objtools/error_codes.hpp>
38 #include <objmgr/util/sequence.hpp>
39 
43 
44 
45 #define NCBI_USE_ERRCODE_X Objtools_Aln_Sparse
46 
47 
50 
51 
53  objects::CScope& scope)
54  : m_Scope(&scope),
55  m_GapChar('-'),
56  m_NaCoding(CSeq_data::e_not_set),
57  m_AaCoding(CSeq_data::e_not_set),
58  m_AnchorDirect(true)
59 {
60  x_Build(anchored_aln);
61 }
62 
63 
65 {
66 }
67 
68 
70  return m_Aln->GetDim();
71 }
72 
73 
74 struct SGapRange
75 {
76  TSignedSeqPos from; // original position of the gap on the anchor
77  TSignedSeqPos second_from; // position on the sequence
78  TSignedSeqPos len; // length of the gap
79  bool direct;
80  int row; // Row, containing the gap.
81  //size_t idx; // Index of the gap in the original vector.
82  // This gap's 'from' must be shifted by 'shift'.
83  // Segments at or after 'from' position must be offset by 'shift + len'.
85 
86  bool operator<(const SGapRange& rg) const
87  {
88  if (from != rg.from) return from < rg.from; // sort by pos
89  return row < rg.row; // lower rows go first.
90  // Don't check the length. Stable sort will preserve the order
91  // of consecutive gaps on the same row.
92  }
93 };
94 
95 
96 typedef vector<SGapRange> TGapRanges;
97 
98 
99 void CSparseAln::x_Build(const CAnchoredAln& src_align)
100 {
101  const TDim& dim = src_align.GetDim();
102 
103  m_BioseqHandles.clear();
104  m_BioseqHandles.resize(dim);
105 
106  m_SeqVectors.clear();
107  m_SeqVectors.resize(dim);
108 
109  // Collect all gaps on all rows. They need to be inserted into the
110  // new anchor as normal segments.
111  TGapRanges gaps;
112  for (TDim row = 0; row < dim; ++row) {
113  const CPairwiseAln& pw = *src_align.GetPairwiseAlns()[row];
114  const CPairwiseAln::TInsertions& ins_vec = pw.GetInsertions();
115  gaps.reserve(gaps.size() + ins_vec.size());
116  for (auto i = ins_vec.begin(); i != ins_vec.end(); ++i) {
117  SGapRange gap;
118  gap.from = i->GetFirstFrom();
119  gap.second_from = i->GetSecondFrom();
120  gap.len = i->GetLength();
121  gap.direct = i->IsDirect();
122  gap.row = row;
123  gap.shift = 0;
124  //gap.idx = i;
125  gaps.push_back(gap);
126  }
127  }
128 
129  // We need to preserve the order of consecutive gaps at the same
130  // position on the same row. Use stable_sort.
131  stable_sort(gaps.begin(), gaps.end());
132  // Set shift for all gaps.
133  TSignedSeqPos shift = 0;
134  NON_CONST_ITERATE(TGapRanges, gap_it, gaps) {
135  gap_it->shift = shift;
136  shift += gap_it->len;
137  }
138 
139  m_Aln.Reset(new CAnchoredAln());
140  m_Aln->SetDim(dim);
141  m_Aln->SetAnchorRow(src_align.GetAnchorRow());
142  m_Aln->SetScore(src_align.GetScore());
143 
144  // Now for each row convert insertions to normal segments and change
145  // the old anchor coordinates to alignment coordinates.
146  for (TDim row = 0; row < dim; ++row) {
147  const CPairwiseAln& pw = *src_align.GetPairwiseAlns()[row];
148  CPairwiseAln::const_iterator seg_it = pw.begin();
149  TGapRanges::const_iterator gap = gaps.begin();
150  TSignedSeqPos shift = 0;
152  pw.GetFirstId(), pw.GetSecondId(), pw.GetFlags()));
153  TSeqPos last_to = 0;
154  bool first_direct = true;
155  bool second_direct = true;
156  while (seg_it != pw.end()) {
157  CPairwiseAln::TAlignRange rg = *seg_it;
158  first_direct = rg.IsFirstDirect();
159  second_direct = rg.IsDirect();
160  ++seg_it;
161  // Check if there are gaps before the new segment's end.
162  while (gap != gaps.end() &&
163  gap->from < rg.GetFirstToOpen()) {
164  if (gap->row == row) {
165  // Insertion in this row - align to the anchor.
166  CPairwiseAln::TAlignRange ins(gap->from + shift,
167  gap->second_from, gap->len, gap->direct);
168  // Reset flags to match row orientation.
169  ins.SetFirstDirect(first_direct);
170  ins.SetDirect(second_direct);
171  dst->push_back(ins);
172  }
173  else if (gap->from > rg.GetFirstFrom()) {
174  // Split the range if there are insertions in other rows.
175  CPairwiseAln::TAlignRange sub = rg;
176  sub.SetLength(gap->from - rg.GetFirstFrom());
177  sub.SetFirstFrom(sub.GetFirstFrom() + shift);
178  if (rg.IsDirect()) {
179  rg.SetSecondFrom(rg.GetSecondFrom() + sub.GetLength());
180  }
181  else {
182  sub.SetSecondFrom(rg.GetSecondToOpen() - sub.GetLength());
183  }
184  rg.SetFirstFrom(rg.GetFirstFrom() + sub.GetLength());
185  rg.SetLength(rg.GetLength() - sub.GetLength());
186  dst->push_back(sub);
187  }
188  shift = gap->shift + gap->len;
189  ++gap;
190  }
191  rg.SetFirstFrom(rg.GetFirstFrom() + shift);
192  dst->push_back(rg);
193  last_to = rg.GetSecondToOpen();
194  }
195  // Still have gaps in some rows?
196  while (gap != gaps.end()) {
197  if (gap->row == row) {
198  CPairwiseAln::TAlignRange ins(gap->from + shift,
199  last_to, gap->len, gap->direct);
200  ins.SetFirstDirect(first_direct);
201  ins.SetDirect(second_direct);
202  dst->push_back(ins);
203  }
204  shift = gap->shift + gap->len;
205  ++gap;
206  }
207  m_Aln->SetPairwiseAlns()[row] = dst;
208  }
209  // Now the new anchored alignment should contain all segments and gaps
210  // aligned to the new anchor coordinates.
211 
212  m_SecondRanges.resize(dim);
213  for (TDim row = 0; row < dim; ++row) {
214 
215  /// Check collections flags
219 
220  /// Determine m_FirstRange
221  if (row == 0) {
222  m_FirstRange = m_Aln->GetPairwiseAlns()[row]->GetFirstRange();
223  } else {
224  m_FirstRange.CombineWith(m_Aln->GetPairwiseAlns()[row]->GetFirstRange());
225  }
226 
227  /// Determine m_SecondRanges
229  ext.UpdateIndex();
231  }
232 
233  const CPairwiseAln& anch_pw = *m_Aln->GetPairwiseAlns()[m_Aln->GetAnchorRow()];
234  m_AnchorDirect = anch_pw.empty() || anch_pw.begin()->IsFirstDirect();
235 }
236 
237 
239 {
240  return m_FirstRange;
241 }
242 
243 
245 {
246  m_GapChar = gap_char;
247 }
248 
249 
251 {
252  return m_Scope;
253 }
254 
255 
258 {
259  _ASSERT(row >= 0 && row < GetDim());
260  return *m_Aln->GetPairwiseAlns()[row];
261 }
262 
263 
265 {
266  _ASSERT(row >= 0 && row < GetDim());
267  return m_Aln->GetPairwiseAlns()[row]->GetSecondId()->GetSeqId();
268 }
269 
270 
272 {
273  _ASSERT(row >= 0 && row < GetDim());
274  return m_Aln->GetPairwiseAlns()[row]->GetFirstFrom();
275 }
276 
277 
279 {
280  _ASSERT(row >= 0 && row < GetDim());
281  return m_Aln->GetPairwiseAlns()[row]->GetFirstTo();
282 }
283 
284 
286 {
287  _ASSERT(row >= 0 && row < GetDim());
289 }
290 
291 
293 {
294  _ASSERT(row >= 0 && row < GetDim());
295  return m_SecondRanges[row].GetFrom();
296 }
297 
298 
300 {
301  _ASSERT(row >= 0 && row < GetDim());
302  return m_SecondRanges[row].GetTo();
303 }
304 
305 
307 {
308  _ASSERT(row >= 0 && row < GetDim());
309  return TRange(GetSeqStart(row), GetSeqStop(row));
310 }
311 
312 
314 {
315  _ASSERT(row >= 0 && row < GetDim());
318 }
319 
320 
322 {
323  _ASSERT(row >= 0 && row < GetDim());
326 }
327 
328 
330  /// TODO Does BaseWidth of 1 always mean nucleotide? Should we
331  /// have an enum (with an invalid (unasigned) value?
332  const int k_unasigned_base_width = 0;
333  int base_width = k_unasigned_base_width;
334  for (TDim row = 0; row < GetDim(); ++row) {
335  if (base_width == k_unasigned_base_width) {
336  base_width = m_Aln->GetPairwiseAlns()[row]->GetFirstBaseWidth();
337  }
338  if (base_width != m_Aln->GetPairwiseAlns()[row]->GetFirstBaseWidth() ||
339  base_width != m_Aln->GetPairwiseAlns()[row]->GetSecondBaseWidth()) {
340  return true; //< there *at least one* base diff base width
341  }
342  /// TODO or should this check be stronger:
343  if (base_width != 1) {
344  return true;
345  }
346  }
347  return false;
348 }
349 
350 
353 {
354  typedef CSparseAln::TAlnRngColl T;
355  switch(dir) {
356  case CSparseAln::eNone:
357  return T::eNone;
358  case CSparseAln::eLeft:
359  return T::eLeft;
360  case CSparseAln::eRight:
361  return T::eRight;
363  return T::eForward;
365  return T::eBackwards;
366  }
367  _ASSERT(false); // invalid
368  return T::eNone;
369 }
370 
371 
374  TSeqPos seq_pos,
375  ESearchDirection dir,
376  bool try_reverse_dir) const
377 {
378  _ASSERT(row >= 0 && row < GetDim());
379 
381  return m_Aln->GetPairwiseAlns()[row]->GetFirstPosBySecondPos(seq_pos, c_dir);
382 }
383 
384 
386  ESearchDirection dir,
387  bool try_reverse_dir) const
388 {
389  _ASSERT(row >= 0 && row < GetDim());
390 
392  return m_Aln->GetPairwiseAlns()[row]->GetSecondPosByFirstPos(aln_pos, c_dir);
393 }
394 
395 
397 {
398  _ASSERT(row >= 0 && row < GetDim());
399 
400  if ( !m_BioseqHandles[row] ) {
401  if ( !(m_BioseqHandles[row] = m_Scope->GetBioseqHandle(GetSeqId(row))) ) {
402  string errstr = "Invalid bioseq handle. Seq id \"" +
403  GetSeqId(row).AsFastaString() + "\" not in scope?";
404  NCBI_THROW(CAlnException, eInvalidRequest, errstr);
405  }
406  }
407  return m_BioseqHandles[row];
408 }
409 
410 
412 {
413  _ASSERT(row >= 0 && row < GetDim());
414 
415  if ( !m_SeqVectors[row] ) {
416  CSeqVector vec = GetBioseqHandle(row).GetSeqVector
421  m_SeqVectors[row].Reset(new CSeqVector(vec));
422  }
423 
424  CSeqVector& seq_vec = *m_SeqVectors[row];
425  if ( seq_vec.IsNucleotide() ) {
427  seq_vec.SetCoding(m_NaCoding);
428  }
429  else {
430  seq_vec.SetIupacCoding();
431  }
432  }
433  else if ( seq_vec.IsProtein() ) {
435  seq_vec.SetCoding(m_AaCoding);
436  }
437  else {
438  seq_vec.SetIupacCoding();
439  }
440  }
441 
442  return seq_vec;
443 }
444 
445 
447 {
449  if ( !h ) return kDefaultGenCode;
450 
451  // Try to use bio-source first.
453  if ( src ) return src->GetGenCode();
454 
455  // Fallback to org-ref.
457  if ( ref ) return ref->GetGcode();
458 
459  return kDefaultGenCode;
460 }
461 
462 
463 void CSparseAln::TranslateNAToAA(const string& na,
464  string& aa,
465  int gencode)
466 {
467  const CTrans_table& tbl = CGen_code_table::GetTransTable(gencode);
468 
469  size_t na_remainder = na.size() % 3;
470  size_t na_size = na.size() - na_remainder;
471 
472  if (&aa != &na) {
473  aa.resize(na_size / 3 + (na_remainder ? 1 : 0));
474  }
475 
476  if ( na.empty() ) return;
477 
478  size_t aa_i = 0;
479  int state = 0;
480  for (size_t na_i = 0; na_i < na_size; ) {
481  for (size_t i = 0; i < 3; ++i, ++na_i) {
482  state = tbl.NextCodonState(state, na[na_i]);
483  }
484  aa[aa_i++] = tbl.GetCodonResidue(state);
485  }
486  if (na_remainder) {
487  aa[aa_i++] = '\\';
488  }
489 
490  if (&aa == &na) {
491  aa.resize(aa_i);
492  }
493 }
494 
495 
497  string &buffer,
498  TSeqPos seq_from, TSeqPos seq_to,
499  bool force_translation) const
500 {
501  return GetSeqString(row, buffer, TRange(seq_from, seq_to), force_translation);
502 }
503 
504 
506  string &buffer,
507  const TRange &rq_seq_range,
508  bool force_translation) const
509 {
510  _ASSERT(row >= 0 && row < GetDim());
511 
512  TRange seq_range = rq_seq_range;
513  if ( seq_range.IsWhole() ) {
514  seq_range = GetSeqRange(row);
515  }
516 
517  buffer.erase();
518  int width = m_Aln->GetPairwiseAlns()[row]->GetSecondBaseWidth();
519  TSeqPos tr_from = seq_range.GetFrom();
520  TSeqPos tr_to = seq_range.GetToOpen();
521  if (width > 1) {
522  // It's a protein sequence. Protein coordinates were multiplied by 3,
523  // any fractions (incomplete AAs) must be discarded.
524  tr_from /= 3;
525  if (seq_range.GetFrom() % 3 > 0) {
526  tr_from++; // skip incomplete AA
527  }
528  tr_to /= 3;
529  force_translation = false; // do not translate AAs.
530  }
531  if (tr_to > tr_from) {
532  CSeqVector& seq_vector = x_GetSeqVector(row);
533 
534  TSeqPos size = tr_to - tr_from;
535  buffer.resize(size, m_GapChar);
536 
537  if (IsPositiveStrand(row)) {
538  seq_vector.GetSeqData(tr_from, tr_to, buffer);
539  } else {
540  TSeqPos vec_size = seq_vector.size();
541  seq_vector.GetSeqData(vec_size - tr_to, vec_size - tr_from, buffer);
542  }
543 
544  if ( force_translation ) {
546  }
547  }
548  return buffer;
549 }
550 
551 
553  string &buffer,
554  const TSignedRange &rq_aln_range,
555  bool force_translation) const
556 {
557  _ASSERT(row >= 0 && row < GetDim());
558 
559  TSignedRange aln_range(rq_aln_range);
560  if ( aln_range.IsWhole() ) {
561  aln_range = GetSeqAlnRange(row);
562  }
563 
564  buffer.erase();
565  if (aln_range.GetLength() <= 0) {
566  return buffer;
567  }
568 
569  const CPairwiseAln& pairwise_aln = *m_Aln->GetPairwiseAlns()[row];
570  if (pairwise_aln.empty()) {
571  string errstr = "Invalid (empty) row (" + NStr::IntToString(row) + "). Seq id \"" +
572  GetSeqId(row).AsFastaString() + "\".";
573  NCBI_THROW(CAlnException, eInvalidRequest, errstr);
574  }
575 
576  CSeqVector& seq_vector = x_GetSeqVector(row);
577  TSeqPos vec_size = seq_vector.size();
578 
579  const int base_width = pairwise_aln.GetSecondBaseWidth();
580  int gencode = 0;
581  bool translate = force_translation || pairwise_aln.GetSecondId()->IsProtein();
582 
583  // buffer holds sequence for "aln_range", 0 index corresonds to aln_range.GetFrom()
584  size_t buf_size = aln_range.GetLength();
585  if (translate) {
586  gencode = x_GetGenCode(row);
587  buf_size /= 3;
588  }
589  buffer.resize(buf_size, m_GapChar);
590 
591  string s; // current segment sequence
592  CSparse_CI it(*this, row, IAlnSegmentIterator::eSkipInserts, aln_range);
593 
594  // Store last incomplete codon's position and length. If the next
595  // segment includes the rest of the same codon, it should be included
596  // in the results. Otherwise incomplete codons are removed from protein
597  // sequences.
598  TSeqPos split_codon_pos = kInvalidSeqPos;
599  // When trimming one or more codons, future offsets must be adjusted.
600  bool direct = IsPositiveStrand(row);
601  bool is_first_seg = true;
602  size_t trim_from = 0;
603  size_t trim_to = 0;
604  while ( it ) {
605  trim_to = 0;
606  const IAlnSegment::TSignedRange& aln_r = it->GetAlnRange(); // in alignment
607  const IAlnSegment::TSignedRange& row_r = it->GetRange(); // on sequence
608  if ( row_r.Empty() ) {
609  ++it;
610  is_first_seg = false;
611  continue;
612  }
613 
614  size_t off = aln_r.GetFrom() - aln_range.GetFrom();
615  if (base_width == 1) {
616  // TODO performance issue - waiting for better API
617  if ( direct ) {
618  seq_vector.GetSeqData(row_r.GetFrom(), row_r.GetToOpen(), s);
619  } else {
620  seq_vector.GetSeqData(vec_size - row_r.GetToOpen(),
621  vec_size - row_r.GetFrom(), s);
622  }
623  if ( translate ) {
624  TranslateNAToAA(s, s, gencode);
625  off /= 3;
626  }
627  }
628  else {
629  _ASSERT(base_width == 3);
630  TSeqPos tr_from = row_r.GetFrom();
631  TSeqPos tr_to = row_r.GetToOpen();
632  if ( direct ) {
633  if (tr_from % 3 > 0) {
634  if (tr_from == split_codon_pos) {
635  // Include incomplete codon from the last range.
636  _ASSERT(off > 0);
637  if ( is_first_seg ) trim_from = tr_from % 3;
638  off -= tr_from % 3;
639  tr_from -= tr_from % 3;
640  }
641  else {
642  // Trim incomplete codon at range start.
643  off += 3 - (tr_from % 3);
644  tr_from += 3 - (tr_from % 3);
645  }
646  }
647  if (tr_to % 3 > 0) {
648  split_codon_pos = tr_to;
649  trim_to = tr_to % 3;
650  tr_to -= tr_to % 3;
651  }
652  }
653  else /* reverse */ {
654  if (tr_to % 3 > 0) {
655  if (tr_to == split_codon_pos) {
656  // Include incomplete codon from the last range.
657  _ASSERT(off > 0);
658  if ( is_first_seg ) trim_from = 3 - (tr_to % 3);
659  off -= 3 - (tr_to % 3);
660  tr_to += 3 - (tr_to % 3);
661  }
662  else {
663  off += tr_to % 3;
664  tr_to -= tr_to % 3;
665  }
666  }
667  if (tr_from % 3 > 0) {
668  split_codon_pos = tr_from;
669  trim_to = 3 - (tr_from % 3);
670  tr_from += 3 - (tr_from % 3);
671  }
672  }
673  off /= 3;
674  _ASSERT(tr_from % 3 == 0);
675  _ASSERT(tr_to % 3 == 0);
677  prot_r.SetOpen(tr_from / 3, tr_to / 3);
678  if ( direct ) {
679  seq_vector.GetSeqData(prot_r.GetFrom(),
680  prot_r.GetToOpen(), s);
681  }
682  else {
683  seq_vector.GetSeqData(vec_size - prot_r.GetToOpen(),
684  vec_size - prot_r.GetFrom(), s);
685  }
686  }
687 
688  size_t len = min(buf_size - off, s.size());
689  _ASSERT(off + len <= buf_size);
690 
691  if (len > 0) {
692  if ( m_AnchorDirect ) {
693  buffer.replace(off, len, s, 0, len);
694  }
695  else {
696  buffer.replace(buf_size - off - len, len, s, 0, len);
697  }
698  }
699  ++it;
700  is_first_seg = false;
701  }
702  if (translate && (size_t)aln_range.GetLength() >= trim_from + trim_to) {
703  buffer.resize((aln_range.GetLength() - trim_from - trim_to) / 3);
704  }
705  return buffer;
706 }
707 
708 
711  const TSignedRange& range,
712  IAlnSegmentIterator::EFlags flag) const
713 {
714  _ASSERT(row >= 0 && row < GetDim());
715  _ASSERT( !m_Aln->GetPairwiseAlns()[row]->empty() );
716  if (m_Aln->GetPairwiseAlns()[row]->empty()) {
717  string errstr = "Invalid (empty) row (" + NStr::IntToString(row) + "). Seq id \"" +
718  GetSeqId(row).AsFastaString() + "\".";
719  NCBI_THROW(CAlnException, eInvalidRequest, errstr);
720  }
721  return new CSparse_CI(*this, row, flag, range);
722 }
723 
724 
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
@ eNone
None specified.
Definition: blast_def.h:326
CRange< position_type > GetSecondRange() const
class CAlignRangeCollectionList<TAlignRange> represent a sorted collection of TAlignRange.
const_iterator begin() const
ESearchDirection
adding empty ranges is considered valid, they are simply ignored
@ fInvalid
collection was modified and not validated
@ fUnsorted
one or more policies violated
const TInsertions & GetInsertions() const
Each insertion shows where the 'first' sequence has a gap while the 'second' sequence has the inserti...
const_iterator end() const
CAlignRange Represents an element of pairwise alignment of two sequences.
Definition: align_range.hpp:63
Query-anchored alignment can be 2 or multi-dimentional.
const TPairwiseAlnVector & GetPairwiseAlns(void) const
The vector of pairwise alns.
TDim GetDim(void) const
How many rows.
void SetAnchorRow(TDim anchor_row)
Modify anchor row (never do this unless you are creating a new alignment and know what you're doing).
TPairwiseAlnVector & SetPairwiseAlns(void)
Modify pairwise alns.
void SetScore(int score)
Set the total score.
TDim GetAnchorRow(void) const
Which is the anchor row?
int GetScore(void) const
What is the total score?
void SetDim(TDim dim)
Modify the number of rows.
int GetGenCode(int def=1) const
Definition: BioSource.cpp:73
CBioseq_Handle –.
static const CTrans_table & GetTransTable(int id)
A pairwise aln is a collection of ranges for a pair of rows.
int GetSecondBaseWidth(void) const
Base width of the second row.
const TAlnSeqIdIRef & GetFirstId(void) const
Get first sequence id.
const TAlnSeqIdIRef & GetSecondId(void) const
Get second sequence id.
CRef –.
Definition: ncbiobj.hpp:618
CSeqVector –.
Definition: seq_vector.hpp:65
TSignedSeqPos GetSeqAlnStart(TNumrow row) const
Definition: sparse_aln.cpp:271
bool m_AnchorDirect
Definition: sparse_aln.hpp:321
TRng GetAlnRange(void) const
Get whole alignment range.
Definition: sparse_aln.cpp:238
const TAlnRngColl & GetAlignCollection(TNumrow row)
Get pairwise alignment for the row.
Definition: sparse_aln.cpp:257
TSignedSeqPos GetSeqAlnStop(TNumrow row) const
Definition: sparse_aln.cpp:278
void x_Build(const CAnchoredAln &src_align)
Definition: sparse_aln.cpp:99
void SetGapChar(TResidue gap_char)
Gap character modifier.
Definition: sparse_aln.cpp:244
TResidue m_GapChar
Definition: sparse_aln.hpp:314
static void TranslateNAToAA(const string &na, string &aa, int gen_code=kDefaultGenCode)
Definition: sparse_aln.cpp:463
TRange GetSeqRange(TNumrow row) const
Get sequence range in sequence coords.
Definition: sparse_aln.cpp:306
CAnchoredAln::TDim TDim
Synonym of TNumrow.
Definition: sparse_aln.hpp:56
CSparseAln(const CAnchoredAln &anchored_aln, objects::CScope &scope)
Constructor.
Definition: sparse_aln.cpp:52
CRef< CAnchoredAln > m_Aln
Definition: sparse_aln.hpp:310
TSeqPos GetSeqStop(TNumrow row) const
Definition: sparse_aln.cpp:299
virtual IAlnSegmentIterator * CreateSegmentIterator(TNumrow row, const TSignedRange &range, IAlnSegmentIterator::EFlags flags) const
Create segment iterator.
Definition: sparse_aln.cpp:710
string & GetAlnSeqString(TNumrow row, string &buffer, const TSignedRange &rq_aln_rng, bool force_translation=false) const
Fetch alignment sequence data.
Definition: sparse_aln.cpp:552
TSignedSeqPos GetAlnPosFromSeqPos(TNumrow row, TSeqPos seq_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Map sequence position to alignment coordinates.
Definition: sparse_aln.cpp:373
int x_GetGenCode(TNumrow row) const
Definition: sparse_aln.cpp:446
TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Definition: sparse_aln.cpp:385
vector< objects::CBioseq_Handle > m_BioseqHandles
Definition: sparse_aln.hpp:315
bool IsTranslated(void) const
Wheather the alignment is translated (heterogenous), e.g. nuc-prot.
Definition: sparse_aln.cpp:329
vector< TRng > m_SecondRanges
Definition: sparse_aln.hpp:313
TCoding m_AaCoding
Definition: sparse_aln.hpp:319
bool IsPositiveStrand(TNumrow row) const
Check direction of the row.
Definition: sparse_aln.cpp:313
TDim GetDim(void) const
Alignment dimension (number of sequence rows in the alignment)
Definition: sparse_aln.cpp:69
const objects::CBioseq_Handle & GetBioseqHandle(TNumrow row) const
Get bioseq handle for the row.
Definition: sparse_aln.cpp:396
virtual ~CSparseAln(void)
Destructor.
Definition: sparse_aln.cpp:64
TCoding m_NaCoding
Definition: sparse_aln.hpp:318
CRef< objects::CScope > m_Scope
Definition: sparse_aln.hpp:311
TSeqPos GetSeqStart(TNumrow row) const
Definition: sparse_aln.cpp:292
TSignedRange GetSeqAlnRange(TNumrow row) const
Get sequence range in alignment coords (strand ignored).
Definition: sparse_aln.cpp:285
CRef< objects::CScope > GetScope(void) const
Scope accessor.
Definition: sparse_aln.cpp:250
vector< CRef< CSeqVector > > m_SeqVectors
Definition: sparse_aln.hpp:316
CSeqVector & x_GetSeqVector(TNumrow row) const
Definition: sparse_aln.cpp:411
string & GetSeqString(TNumrow row, string &buffer, TSeqPos seq_from, TSeqPos seq_to, bool force_translation=false) const
Fetch sequence data for the given row and range.
Definition: sparse_aln.cpp:496
friend class CSparse_CI
Definition: sparse_aln.hpp:302
TRng m_FirstRange
Definition: sparse_aln.hpp:312
bool IsNegativeStrand(TNumrow row) const
Definition: sparse_aln.cpp:321
const objects::CSeq_id & GetSeqId(TNumrow row) const
Get seq-id for the row.
Definition: sparse_aln.cpp:264
Implementation of IAlnSegmentIterator for CSparseAln.
Definition: sparse_ci.hpp:73
char GetCodonResidue(int state) const
static int NextCodonState(int state, unsigned char ch)
CRange< TSignedSeqPos > TSignedRange
CRange< TSeqPos > TRange
objects::CSeqVector::TResidue TResidue
ESearchDirection
Position search options.
@ eNone
No search.
@ eRight
Towards higher aln coord (always to the right)
@ eLeft
Towards lower aln coord (always to the left)
@ eBackwards
Towards lower seq coord (to the left if plus strand, right if minus)
@ eForward
Towards higher seq coord (to the right if plus strand, left if minus)
Alignment segment iterator interface.
EFlags
Iterator options.
@ eSkipInserts
Iterate segments where at least some rows are aligned (including gap segments)
virtual const TSignedRange & GetRange(void) const =0
Get the selected row range.
virtual const TSignedRange & GetAlnRange(void) const =0
Get alignment range for the segment.
#define T(s)
Definition: common.h:230
#define true
Definition: bool.h:35
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const CBioSource * GetBioSource(const CBioseq &bioseq)
Retrieve the BioSource object for a given bioseq handle.
Definition: sequence.cpp:104
const COrg_ref * GetOrg_refOrNull(const CBioseq_Handle &handle)
Return the pointer to org-ref associated with a given sequence or null if there is no org-ref associa...
Definition: sequence.cpp:245
@ eStrand_Plus
Plus strand.
@ eStrand_Minus
Minus strand.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool IsProtein(void) const
Definition: seq_vector.hpp:350
void SetCoding(TCoding coding)
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
bool IsNucleotide(void) const
Definition: seq_vector.hpp:357
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
position_type GetLength(void) const
Definition: range.hpp:158
TThisType & CombineWith(const TThisType &r)
Definition: range.hpp:345
position_type GetToOpen(void) const
Definition: range.hpp:138
position_type GetFirstToOpen(void) const
TThisType & SetLength(position_type len)
bool IsFirstDirect() const
TThisType & SetSecondFrom(position_type second_from)
bool IsDirect() const
Definition: align_range.hpp:96
position_type GetSecondFrom(void) const
bool Empty(void) const
Definition: range.hpp:148
TThisType & SetOpen(position_type from, position_type toOpen)
Definition: range.hpp:184
void SetDirect(bool direct=true)
bool IsWhole(void) const
Definition: range.hpp:284
TThisType & SetFirstFrom(position_type from)
void SetFirstDirect(bool direct=true)
position_type GetFirstFrom(void) const
position_type GetSecondToOpen(void) const
position_type GetLength(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
@ e_not_set
No variant selected.
Definition: Seq_data_.hpp:103
@ eRight
@ eLeft
Definition of all error codes used in objtools libraries.
@ e_not_set
int i
int len
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
T min(T x_, T y_)
static pcre_uint8 * buffer
Definition: pcretest.c:1051
vector< SGapRange > TGapRanges
Definition: sparse_aln.cpp:96
USING_SCOPE(ncbi::objects)
CSparseAln::TAlnRngColl::ESearchDirection GetCollectionSearchDirection(CSparseAln::ESearchDirection dir)
Definition: sparse_aln.cpp:352
#define row(bind, expected)
Definition: string_bind.c:73
TSignedSeqPos from
Definition: sparse_aln.cpp:76
TSignedSeqPos second_from
Definition: sparse_aln.cpp:77
TSignedSeqPos shift
Definition: sparse_aln.cpp:84
TSignedSeqPos len
Definition: sparse_aln.cpp:78
bool operator<(const SGapRange &rg) const
Definition: sparse_aln.cpp:86
bool direct
Definition: sparse_aln.cpp:79
#define _ASSERT
Modified on Tue Apr 23 07:40:34 2024 by modify_doxy.py rev. 669887