NCBI C++ ToolKit
cigar_formatter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cigar_formatter.cpp 99483 2023-04-04 17:43:43Z stakhovv $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Aaron Ucko, Aleksey Grichenko
27 *
28 * File Description:
29 * Base class for CIGAR formatters.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
39 #include <objmgr/util/sequence.hpp>
40 #include <objtools/error_codes.hpp>
43 
44 
45 #define NCBI_USE_ERRCODE_X Objtools_Fmt_CIGAR
46 
49 
50 
52  CScope* scope,
54  : m_Align(aln),
55  m_CurAlign(nullptr),
56  m_Scope(scope),
57  m_Flags(flags),
58  m_IsFirstSubalign(true),
59  m_IsTrivial(true),
60  m_LastType(0),
61  m_Frame(-1),
62  m_RefRow(-1),
63  m_RefSign(1),
64  m_TargetRow(-1),
65  m_TargetSign(1),
66  m_FormatBy(eFormatBy_NotSet)
67 {
    // Constructor: all state is set in the initializer list above and the
    // body is intentionally empty. The formatter starts with no current
    // sub-alignment, the "first sub-alignment" and "trivial" flags set,
    // frame and both row indices at -1 (unset), both strand signs at 1,
    // and no formatting mode selected (eFormatBy_NotSet).
    // NOTE(review): the first and third lines of the signature (the ones
    // declaring `aln` and `flags`) are missing from this listing.
68 }
69 
70 
72 {
73 }
74 
75 
77 {
80  EndAlignment();
81 }
82 
83 
85  bool width_inverted)
86 {
    // Dispatches on the segment variant of `sa` and formats its rows.
    // Dense-seg data goes straight to x_FormatDensegRows(); other variants
    // are first converted (or, for a set of sub-alignments, recursed into).
    // Throws CFlatException(eNotSupported) for variants with no branch here.
    // NOTE(review): the `case` labels (and the first signature line) are
    // missing from this listing; each branch below is identified only by
    // the accessor it calls.
87  switch (sa.GetSegs().Which()) {
    // Branch using GetDenseg(): already a dense-seg, format directly.
89  x_FormatDensegRows(sa.GetSegs().GetDenseg(), width_inverted);
90  break;
91 
    // Branch using GetSpliced(): convert to a discontinuous alignment,
    // carry the scores over, then recurse on the converted alignment.
93  {
94  CRef<CSeq_align> sa2;
95  try {
96  sa2 = sa.GetSegs().GetSpliced().AsDiscSeg();
97  if (sa.IsSetScore()) {
98  sa2->SetScore().insert(sa2->SetScore().end(),
99  sa.GetScore().begin(),
100  sa.GetScore().end());
101  }
102  } STD_CATCH_ALL_X(1, "CCIGAR_Formatter::x_FormatAlignmentRows")
    // A failed conversion leaves sa2 empty, so this alignment is skipped
    // rather than propagated as an error.
103  if (sa2) {
104  // Conversion from Spliced to Disc inverts the meaning of width,
    // hence the hard-coded `true` instead of forwarding width_inverted.
105  x_FormatAlignmentRows(*sa2, true);
106  }
107  break;
108  }
109 
    // Branch using CreateDensegFromStdseg(): convert, then format the
    // result only if the conversion really produced a dense-seg.
111  {
112  CRef<CSeq_align> sa2;
113  try {
114  sa2 = sa.CreateDensegFromStdseg();
115  } STD_CATCH_ALL_X(1, "CCIGAR_Formatter::x_FormatAlignmentRows")
116  if (sa2.NotEmpty() && sa2->GetSegs().IsDenseg()) {
117  x_FormatDensegRows(sa2->GetSegs().GetDenseg(), width_inverted);
118  }
119  break;
120  }
121 
    // Branch iterating over sub-alignments via `it`: each one becomes the
    // current alignment (m_CurAlign) while it is formatted recursively.
    // NOTE(review): the loop header and the line preceding the recursive
    // call are missing from this listing; presumably an ITERATE over the
    // disc segments and a StartSubAlignment() call -- confirm.
123  {
125  m_CurAlign = (*it).GetPointer();
127  x_FormatAlignmentRows(**it, width_inverted);
128  EndSubAlignment();
129  m_CurAlign = nullptr;
130  m_IsFirstSubalign = false;
131  }
132  break;
133  }
134 
135  default: // dendiag or packed; unsupported
136  NCBI_THROW(CFlatException, eNotSupported,
137  "Conversion of alignments of type dendiag and packed "
138  "not supported in current CIGAR output");
139  }
140 }
141 
142 
144 {
    // Returns the index of the first row in m_AlnMap whose Seq-id refers to
    // the same bioseq as `id` (as decided by sequence::IsSameBioseq using
    // the formatter's scope). If no row matches, posts an error naming the
    // id in FASTA form and returns -1; callers must check for that value.
    // NOTE(review): the signature line is missing from this listing.
145  CScope* scope = GetScope();
146  for (TNumrow row = 0; row < m_AlnMap->GetNumRows(); ++row) {
147  if (sequence::IsSameBioseq(m_AlnMap->GetSeqId(row), id, scope)) {
148  return row;
149  }
150  }
151  ERR_POST_X(1, "CCIGAR_Formatter::x_GetRowById: "
152  "no row with a matching ID found: " << id.AsFastaString());
153  return -1;
154 }
155 
156 
// Emits one CIGAR row for the (m_RefRow, m_TargetRow) pair of the current
// dense-seg. Returns immediately when both rows are the same.
//
// Otherwise the segments of m_AlnMap are walked (in reverse order when
// m_RefSign is not positive) and each one is classified as
//   'I' - only the target row has sequence,
//   'D' - only the reference row has sequence,
//   'M' - both rows have sequence.
// Consecutive segments of the same type are merged; each run is written
// through AddSegment(). Bases left over after dividing a gap length by
// `width` are accumulated and written as an 'F' (negative) or 'R'
// (positive) frameshift operation. The finished string is handed to
// AddRow() between StartRow() and EndRow().
//
// Throws CFlatException(eNotSupported) when a width other than 1 is present
// and width_inverted is false, or when an aligned ('M') segment has
// different lengths on the two rows.
//
// NOTE(review): several lines of this function are missing from this
// listing, including the bodies of the two `if ( !m_...Id )` blocks, the
// tail of the m_TargetWidth expression, and the declarations of `width`,
// `numseg`, `ref_flags` and `tgt_flags`.
157 void CCIGAR_Formatter::x_FormatLine(bool width_inverted)
158 {
159  if (m_TargetRow == m_RefRow) {
160  return;
161  }
162  CNcbiOstrstream cigar;
163  m_LastType = 0;
164  TSeqPos last_count = 0;
165 
166  if ( !m_RefId ) {
169  }
170  if ( !m_TargetId ) {
173  }
174 
    // A row without an entry in the dense-seg's widths vector gets width 1.
175  m_RefWidth =
176  (static_cast<size_t>(m_RefRow) < m_DenseSeg->GetWidths().size()) ?
177  m_DenseSeg->GetWidths()[m_RefRow] : 1;
179  m_TargetWidth =
180  (static_cast<size_t>(m_TargetRow) < m_DenseSeg->GetWidths().size()) ?
183  m_IsTrivial = true;
184  TSignedSeqPos last_frameshift = 0;
185 
186  if (! width_inverted && (m_RefWidth != 1 || m_TargetWidth != 1)) {
187  // Supporting widths ONLY in the unambiguous case when we
188  // know they are WRONG and put there incorrectly from conversion
189  // from Spliced-seg. If we didn't get widths that way, we don't
190  // know what they mean, so punt if not all widths are 1.
191  NCBI_THROW(CFlatException, eNotSupported,
192  "Widths in alignments do not have clear semantics, "
193  "and thus are not supported in current CIGAR output");
194  }
195 
196  StartRow();
197 
198  // Is the following correct???
199  //
200  // Expecting all coordinates to be normalized relative to
201  // some reference width, which might be the
202  // Least Common Multiple of length in nucleotide bases of
203  // the coordinate system used for each row, e.g. using
204  // LCM of 3 if either row is protein. The Least Common Multiple
205  // would allow accurately representing frameshifts in either
206  // sequence.
207  //
208  // What does width for an alignment really mean, biologically?
209  // It can't have arbitrary meaning, because CIGAR has fixed
210  // semantics that M/I/D/F/R are in 3-bp units (i.e. one aa) for
211  // proteins and 1-bp units for cDNA.
212  //
213  // Thus, in practice, I think we are expecting widths to be
214  // one of (1, 1) for nuc-nuc, (1, 3) for nuc-prot,
215  // (3, 1) for prot-nuc, and (3, 3) for prot-prot.
218  for (CAlnMap::TNumchunk idx = 0; idx < numseg; ++idx) {
219  // If ref is on minus strand, revert the order of segments.
220  CAlnMap::TNumchunk seg = m_RefSign > 0 ? idx : numseg - idx - 1;
221  TRange ref_piece = m_AlnMap->GetRange(m_RefRow, seg);
222  TRange tgt_piece = m_AlnMap->GetRange(m_TargetRow, seg);
225  // The type and count are expected to be set by one of the if/else
    // cases below. NOTE(review): when neither row has sequence in a
    // segment, no case matches and the defaults ('X', 0) are kept.
226  char type = 'X'; // Guaranteed set. Pacify compiler.
227  TSeqPos count = 0; // Guaranteed set. Pacify compiler.
228  TSignedSeqPos frameshift = 0;
229 
230  if ( (tgt_flags & CAlnMap::fSeq) &&
231  !(ref_flags & CAlnMap::fSeq) ) {
232  // TODO: Handle non-initial protein gap that does not start
233  // on an aa boundary.
234  //
235  type = 'I';
236  if (idx == 0 && IsSetFlag(fCIGAR_GffForFlybase) && m_TargetWidth == 3) {
237  // See comments about frame and phase, below.
238  m_Frame = tgt_piece.GetFrom() % m_TargetWidth;
239  }
    // Whole units go into the count; the remainder is recorded as a
    // negative frameshift (later written as 'F').
240  count = tgt_piece.GetLength()/width;
241  frameshift = -(tgt_piece.GetLength()%TSignedSeqPos(width));
242  tgt_piece.SetFrom(tgt_piece.GetFrom()/m_TargetWidth);
243  tgt_piece.SetTo(tgt_piece.GetTo()/m_TargetWidth);
244  m_TargetRange += tgt_piece;
245  }
246  else if (! (tgt_flags & CAlnMap::fSeq) &&
247  (ref_flags & CAlnMap::fSeq)) {
248  // TODO: Handle gap that does not start on an aa boundary.
249  //
250  type = 'D';
251  if (idx == 0 && IsSetFlag(fCIGAR_GffForFlybase) && m_RefWidth == 3) {
252  // See comments about frame and phase, below.
253  m_Frame = ref_piece.GetFrom() % m_RefWidth;
254  }
    // Whole units go into the count; the remainder is recorded as a
    // positive frameshift (later written as 'R').
255  count = ref_piece.GetLength()/width;
256  frameshift = +(ref_piece.GetLength()%width);
257  // Adjusting for start position, converting to natural coordinates
258  // (aa for protein locations, which would imply divide by 3).
259  ref_piece.SetFrom(ref_piece.GetFrom()/m_RefWidth);
260  ref_piece.SetTo(ref_piece.GetTo()/m_RefWidth);
261  m_RefRange += ref_piece;
262  }
263  else if ((tgt_flags & CAlnMap::fSeq) &&
264  (ref_flags & CAlnMap::fSeq)) {
265  // Handle the case when the sequences are aligned.
266  // The remaining case is when both don't align at all,
267  // which shouldn't happen in a pairwise alignment. If we
268  // happen to have a multiple alignment, the remaining case
269  // would be one that aligns unrelated sequences, thus has
270  // no effect on the current GFF3 output.
271  // TODO: Resolve why the following implementation is different
272  // from the above historic implementation. The difference
273  // will be in rounding down vs up on single or last
274  // segment.
275  //
276  type = 'M';
277  if (ref_piece.GetLength() != tgt_piece.GetLength()) {
278  // There's a frameshift.. somewhere. Is this valid? Bail.
279  NCBI_THROW(CFlatException, eNotSupported,
280  "Frameshift(s) in Spliced-exon-chunk's diag "
281  "not supported in current CIGAR output");
282  }
283  if (idx == 0 && IsSetFlag(fCIGAR_GffForFlybase)) {
284  // Semantics of the phase aren't defined in GFF3 for
285  // feature types other than a CDS, and this is an alignment.
286  //
287  // Since phase is not required for alignment features, don't
288  // emit one, unless we have been requested with the special
289  // Flybase variant of GFF3 -- they did ask for phase.
290  //
291  // Also, phase can only be interpreted if we have an alignment
292  // in terms of protein aa, and a width of 3 for one or
293  // the other.
294  //
295  // For an alignment, the meaning of phase is ambiguous,
296  // particularly in dealing with a protein-protein
297  // alignment (if ever it allowed alignment to parts of
298  // a codon), and when the seqid is the protein, rather
299  // than the target.
300  //
301  // A protein won't be "reverse complemented" thus,
302  // can assume that it's plus-strand and look at start
303  // position.
304  //
305  // The computation below is actually for the frame.
306  // The phase is not the same, and will be derived from
307  // the frame.
308  if (m_RefWidth == 3) {
309  m_Frame = ref_piece.GetFrom() % m_RefWidth;
310  } else if (m_TargetWidth == 3) {
311  m_Frame = tgt_piece.GetFrom() % m_TargetWidth;
312  }
313  }
314  // Adjusting for start position, converting to natural coordinates
315  // (aa for protein locations, which would imply divide by 3).
316  count = ref_piece.GetLength()/width;
317  ref_piece.SetFrom(ref_piece.GetFrom()/m_RefWidth);
318  ref_piece.SetTo(ref_piece.GetTo()/m_RefWidth);
319  m_RefRange += ref_piece;
320  tgt_piece.SetFrom(tgt_piece.GetFrom()/m_TargetWidth);
321  tgt_piece.SetTo(tgt_piece.GetTo()/m_TargetWidth);
322  m_TargetRange += tgt_piece;
323  }
    // Run-length merge: extend the current run while the type repeats;
    // otherwise flush the previous run (count first, then any accumulated
    // frameshift) and start a new one. Any flushed output means the
    // alignment is no longer "trivial".
324  if (type == m_LastType) {
325  last_count += count;
326  last_frameshift += frameshift;
327  } else {
328  if (m_LastType) {
329  if (last_count) {
330  m_IsTrivial = false;
331  AddSegment(cigar, m_LastType, last_count);
332  }
333  if (last_frameshift) {
334  m_IsTrivial = false;
335  AddSegment(cigar,
336  (last_frameshift < 0 ? 'F' : 'R'),
337  abs(last_frameshift));
338  }
339  }
340  m_LastType = type;
341  last_count = count;
342  last_frameshift = frameshift;
343  }
344  }
    // NOTE(review): aln_out is not used anywhere in the visible code.
345  CNcbiOstrstream aln_out;
346 
    // Flush the final run.
    // NOTE(review): unlike the in-loop flush above, this one is
    // unconditional: it writes the run even when last_count is 0 or when
    // m_LastType is still 0 (loop body never ran), it does not write a
    // pending last_frameshift, and it does not clear m_IsTrivial. Confirm
    // whether these differences are intended.
347  AddSegment(cigar, m_LastType, last_count);
348  string cigar_string = CNcbiOstrstreamToString(cigar);
349 
350  AddRow(cigar_string);
351 
352  EndRow();
353 }
354 
355 
357  bool width_inverted)
358 {
    // Formats all row pairings of dense-seg `ds` according to m_FormatBy.
    // An alignment map is built over `ds` (or over a copy with its widths
    // removed when width_inverted is set), one of the rows is held fixed as
    // reference or target, and x_FormatLine() is called for the other rows
    // between StartRows() and EndRows(). All per-dense-seg state is reset
    // before returning.
    // NOTE(review): the first signature line, one `case` label and several
    // statements are missing from this listing.
359  m_DenseSeg.Reset(&ds);
360 
361  // Frame, as a value of 0, 1, 2, or -1 for undefined.
362  // This is NOT the same frame as the frame in ASN.1!
363  m_Frame = -1;
364 
365  // WORKAROUND
366  // I do believe there is lack of agreement on what
367  // the "widths" of a dense-seg mean -- as multiplier, or as divisor.
368  //
369  // In CSpliced_seg::s_ExonToDenseg, the widths act as divisors,
370  // i.e. width 3 means every 3 units (na) in the alignment
371  // correspond to 1 unit (aa) on the sequence.
372  //
373  // In CAlnMix as witnessed by e.g. x_ExtendDSWithWidths,
374  // or even CDense_seg::Validate, the widths are a multiplier,
375  // i.e. width 3 means every 1 unit (aa) in the alignment
376  // corresponds to an alignment of 3 units (na) on the sequence.
377  //
378  // These definitions are incompatible.
379  // The problem with the latter definition as a multiplier,
380  // is that the smallest unit of alignment (in a protein-to-nucleotide)
381  // is 1 aa = 3 bp... no opportunity for a frameshift. :-(
382  //
383  // To compensate (or rather, avoid double-compensating), avoid use
384  // of widths, and copy to a temporary alignment, storing the old widths
385  // for lookup, but reset them in the temporary alignment.
386  const CDense_seg* ds_for_alnmix(&ds);
387  CDense_seg ds_no_widths;
388  if (width_inverted) {
389  ds_no_widths.Assign(ds);
390  ds_no_widths.ResetWidths();
391  ds_for_alnmix = &ds_no_widths;
392  }
393 
    // m_DenseSeg still refers to the original `ds`, so x_FormatLine() can
    // look up the original widths even though the map was built without them.
394  m_AlnMap.Reset(new CAlnMap(*ds_for_alnmix));
395 
396  switch ( m_FormatBy ) {
    // First branch: the reference is fixed and the target varies.
    // NOTE(review): the case label, the row lookup/assignments inside the
    // if/else, and the loop header are missing from this listing;
    // presumably this mirrors the eFormatBy_TargetId branch below -- confirm.
398  {
    // Remember whether the reference was supplied as an id, so that only
    // the value derived here is cleared afterwards.
399  bool by_id = m_RefId.NotNull();
400  if ( by_id ) {
402  }
403  else {
404  _ASSERT(m_RefRow >= 0);
406  }
407  StartRows();
409  x_FormatLine(width_inverted);
410  m_TargetId.Reset();
411  }
412  m_TargetRow = -1;
413  if ( by_id ) {
414  m_RefRow = -1;
415  }
416  else {
417  m_RefId.Reset();
418  }
419  break;
420  }
421  case eFormatBy_TargetId:
422  {
    // Remember whether the target was supplied as an id, so that only the
    // value derived here is cleared afterwards.
423  bool by_id = m_TargetId.NotNull();
424  if ( by_id ) {
426  }
427  else {
428  _ASSERT(m_TargetRow >= 0);
431  }
432  StartRows();
    // Pair the fixed target with every row as reference; x_FormatLine()
    // itself skips the pairing of the target row with itself.
433  for (m_RefRow = 0; m_RefRow < m_AlnMap->GetNumRows(); ++m_RefRow) {
434  x_FormatLine(width_inverted);
435  m_RefId.Reset();
436  }
437  m_RefRow = -1;
438  if ( by_id ) {
439  m_TargetRow = -1;
440  }
441  else {
442  m_TargetId.Reset();
443  }
444  break;
445  }
446  default:
447  break;
448  }
449 
    // NOTE(review): EndRows() is called on every path, including the
    // default branch where StartRows() was never called -- confirm that
    // implementations of EndRows() tolerate this.
450  EndRows();
451 
452  // Reset all values that are no longer meaningful
453  m_DenseSeg.Reset();
454  m_AlnMap.Reset();
455  m_IsTrivial = true;
456  m_LastType = 0;
457  m_Frame = -1;
459  m_RefSign = 1;
461  m_TargetSign = 1;
462 }
463 
464 
466 {
    // Fixes the reference sequence by Seq-id: stores `ref_id`, clears any
    // target id, and marks both row indices as unresolved (-1).
    // NOTE(review): the signature and two statements (original lines 467
    // and 472) are missing from this listing; presumably they select the
    // formatting mode and start formatting -- confirm against the repository.
468  m_RefId.Reset(&ref_id);
469  m_TargetId.Reset();
470  m_RefRow = -1;
471  m_TargetRow = -1;
473 }
474 
475 
477 {
    // Fixes the target sequence by Seq-id: stores `target_id`, clears any
    // reference id, and marks both row indices as unresolved (-1).
    // NOTE(review): the signature and two statements (original lines 478
    // and 483) are missing from this listing; presumably they select the
    // formatting mode and start formatting -- confirm against the repository.
479  m_RefId.Reset();
480  m_TargetId.Reset(&target_id);
481  m_RefRow = -1;
482  m_TargetRow = -1;
484 }
485 
486 
488 {
    // Fixes the reference sequence by row index: clears both ids, stores
    // `ref_row` as the reference row and marks the target row as
    // unresolved (-1).
    // NOTE(review): the signature and two statements (original lines 489
    // and 494) are missing from this listing; presumably they select the
    // formatting mode and start formatting -- confirm against the repository.
490  m_RefId.Reset();
491  m_TargetId.Reset();
492  m_RefRow = ref_row;
493  m_TargetRow = -1;
495 }
496 
497 
499 {
    // Fixes the target sequence by row index: clears both ids, marks the
    // reference row as unresolved (-1) and stores `target_row` as the
    // target row.
    // NOTE(review): the signature and two statements (original lines 500
    // and 505) are missing from this listing; presumably they select the
    // formatting mode and start formatting -- confirm against the repository.
501  m_RefId.Reset();
502  m_TargetId.Reset();
503  m_RefRow = -1;
504  m_TargetRow = target_row;
506 }
507 
508 
510  char seg_type,
511  TSeqPos seg_len)
512 {
    // Appends one CIGAR operation to the `cigar` stream, written as the
    // length followed by the operation character (e.g. length 10 and type
    // 'M' produce "10M").
    // NOTE(review): the first signature line is missing from this listing.
513  cigar << seg_len << seg_type;
514 }
515 
516 
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Base class for CIGAR formatters.
TNumseg TNumchunk
Definition: alnmap.hpp:112
TSegTypeFlags GetSegType(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:503
const CSeq_id & GetSeqId(TNumrow row) const
Definition: alnmap.hpp:645
@ fSeq
Definition: alnmap.hpp:52
TDim GetNumRows(void) const
Definition: alnmap.hpp:517
unsigned int TSegTypeFlags
Definition: alnmap.hpp:50
TSignedRange GetRange(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:653
int StrandSign(TNumrow row) const
Definition: alnmap.hpp:593
TNumseg GetNumSegs(void) const
Definition: alnmap.hpp:510
The base class for alignment formatters which use CIGAR format.
void ResetWidths(void)
Definition: Dense_seg.hpp:225
void Assign(const CSerialObject &obj, ESerialRecursionMode how=eRecursive)
overloaded Assign()
Definition: Dense_seg.cpp:62
const TWidths & GetWidths(void) const
Definition: Dense_seg.hpp:210
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CScope –.
Definition: scope.hpp:92
CRef< CSeq_align > CreateDensegFromStdseg(SSeqIdChooser *SeqIdChooser=0) const
---------------------------------------------------------------------------- PRE : the Seq-align has ...
Definition: Seq_align.cpp:728
CRef< CSeq_align > AsDiscSeg() const
Convert this alignment to a discontinuous segment.
static uch flags
#define true
Definition: bool.h:35
static int type
Definition: getdata.c:31
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
#define STD_CATCH_ALL_X(err_subcode, message)
Standard handling of "exception"-derived exceptions; catches non-standard exceptions and generates "u...
Definition: ncbiexpt.hpp:608
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void EndSubAlignment(void)
CScope * GetScope(void) const
CAlnMap::TNumrow TNumrow
TNumrow x_GetRowById(const CSeq_id &id)
void FormatByReferenceRow(TNumrow ref_row)
CRef< CAlnMap > m_AlnMap
CConstRef< CSeq_id > m_TargetId
virtual void EndRows(void)
const CSeq_align & GetSeq_align(void) const
const CSeq_align * m_CurAlign
virtual void StartRow(void)
CConstRef< CSeq_id > m_RefId
void x_FormatLine(bool width_inverted)
void x_FormatAlignmentRows(void)
virtual void StartSubAlignment(void)
virtual void AddSegment(CNcbiOstream &cigar, char seg_type, TSeqPos seg_len)
virtual void AdjustSeqIdType(CConstRef< CSeq_id > &)
bool IsSetFlag(ECIGARFlags flag) const
virtual void StartRows(void)
void FormatByReferenceId(const CSeq_id &ref_id)
void FormatByTargetId(const CSeq_id &target_id)
virtual void EndAlignment(void)
void x_FormatDensegRows(const CDense_seg &ds, bool width_inverted)
virtual void StartAlignment(void)
CConstRef< CDense_seg > m_DenseSeg
virtual void AddRow(const string &)
void FormatByTargetRow(TNumrow target_row)
virtual void EndRow(void)
virtual ~CCIGAR_Formatter(void)
@ fCIGAR_GffForFlybase
Flybase flavour of GFF3.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotNull(void) const THROWS_NONE
Check if pointer is not null – same effect as NotEmpty().
Definition: ncbiobj.hpp:1410
position_type GetLength(void) const
Definition: range.hpp:158
static TThisType GetEmpty(void)
Definition: range.hpp:306
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
TScore & SetScore(void)
Assign a value to Score data member.
Definition: Seq_align_.hpp:902
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:691
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
bool IsSetScore(void) const
for whole alignment Check if a value has been assigned to Score data member.
Definition: Seq_align_.hpp:884
list< CRef< CSeq_align > > Tdata
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:896
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsDenseg(void) const
Check if variant Denseg is selected.
Definition: Seq_align_.hpp:740
Definition of all error codes used in objtools libraries.
#define abs(a)
Definition: ncbi_heapmgr.c:130
T max(T x_, T y_)
#define row(bind, expected)
Definition: string_bind.c:73
Definition: type.c:6
#define _ASSERT
#define const
Definition: zconf.h:232
Modified on Tue Apr 23 07:39:46 2024 by modify_doxy.py rev. 669887