NCBI C++ ToolKit
seq_align_mapper_base.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seq_align_mapper_base.cpp 100569 2023-08-11 11:39:48Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aleksey Grichenko
27 *
28 * File Description:
29 * Alignment mapper base
30 *
31 */
32 
33 #include <ncbi_pch.hpp>
42 #include <algorithm>
43 
44 #define NCBI_USE_ERRCODE_X Objects_SeqAlignMap
45 
48 
49 
51  : m_Len(len),
52  m_Rows(dim),
53  m_HaveStrands(false),
54  m_GroupIdx(0),
55  m_ScoresGroupIdx(-1),
56  m_PartType(CSpliced_exon_chunk::e_not_set)
57 {
58  return;
59 }
60 
61 
64 {
65  // Make sure the row exists (this should always be true).
66  _ASSERT(m_Rows.size() > idx);
67  return m_Rows[idx];
68 }
69 
70 
72 SAlignment_Segment::CopyRow(size_t idx, const SAlignment_Row& src_row)
73 {
74  // Copy the row to this segment. m_Rows must already contain the
75  // requested index.
76  SAlignment_Row& dst_row = GetRow(idx);
77  dst_row = src_row;
78  return dst_row;
79 }
80 
81 
82 // Add new alignment row. The rows vector must contain the entry.
85  const CSeq_id& id,
86  int start,
87  bool is_set_strand,
88  ENa_strand strand)
89 {
90  SAlignment_Row& row = GetRow(idx);
91  row.m_Id = CSeq_id_Handle::GetHandle(id);
92  row.m_Start = start < 0 ? kInvalidSeqPos : start;
93  row.m_IsSetStrand = is_set_strand;
94  row.m_Strand = strand;
95  m_HaveStrands = m_HaveStrands || is_set_strand;
96  return row;
97 }
98 
99 
100 // Add new alignment row. The rows vector must contain the entry.
103  const CSeq_id_Handle& id,
104  int start,
105  bool is_set_strand,
106  ENa_strand strand)
107 {
108  SAlignment_Row& row = GetRow(idx);
109  row.m_Id = id;
110  // If start is negative (-1), use kInvalidSeqPos.
111  row.m_Start = start < 0 ? kInvalidSeqPos : start;
112  row.m_IsSetStrand = is_set_strand;
113  row.m_Strand = strand;
114  m_HaveStrands = m_HaveStrands || is_set_strand;
115  return row;
116 }
117 
118 
119 // Create an empty seq-align mapper. The mapper may be initialized later
120 // with a seq-align or an exon.
123  : m_LocMapper(loc_mapper),
124  m_OrigAlign(0),
125  m_HaveStrands(false),
126  m_Dim(0),
127  m_ScoresInvalidated(false),
128  m_DstAlign(0),
129  m_AlignFlags(eAlign_Normal)
130 {
131 }
132 
133 
134 // Initialize the mapper with a seq-align.
137  CSeq_loc_Mapper_Base& loc_mapper)
138  : m_LocMapper(loc_mapper),
139  m_OrigAlign(0),
140  m_HaveStrands(false),
141  m_Dim(0),
142  m_ScoresInvalidated(false),
143  m_DstAlign(0),
144  m_AlignFlags(eAlign_Normal)
145 {
146  x_Init(align);
147 }
148 
149 
151 {
152 }
153 
154 
155 // Helper function to copy a container (scores, user-objects, seq-locs).
156 // Copies each element, not just pointers.
157 template<class T, class C1, class C2>
158 void CloneContainer(const C1& src, C2& dst)
159 {
160  ITERATE(typename C1, it, src) {
161  CRef<T> elem(new T);
162  elem->Assign(**it);
163  dst.push_back(elem);
164  }
165 }
166 
167 
// Append every element of 'src' to the end of 'dst', preserving order.
// Elements are copied as-is: when the containers hold reference-counted
// pointers, only the pointers are copied and the pointed-to objects stay
// shared with the source. Used to store scores in the parsed segments while
// mapping an alignment. Should never be used to create final mapped
// alignments - use CloneContainer for that.
//
// src - container to read from (not modified).
// dst - container to append to; its existing elements are kept.
template<class C1, class C2>
void CopyContainer(const C1& src, C2& dst)
{
    // A single range insert lets the destination size itself once instead
    // of growing element by element.
    dst.insert(dst.end(), src.begin(), src.end());
}
178 
179 
180 // Parse the alignment into segments and rows.
182 {
183  m_OrigAlign.Reset(&align);
184  if (align.IsSetScore() && !align.GetScore().empty()) {
185  // Copy global scores. This copies the pointers, not
186  // the objects, so the result should not be copied
187  // to the mapped seq-align.
188  CopyContainer<CSeq_align::TScore, TScores>(
189  align.GetScore(), m_AlignScores);
190  }
191  switch ( align.GetSegs().Which() ) {
193  x_Init(align.GetSegs().GetDendiag());
194  break;
196  x_Init(align.GetSegs().GetDenseg());
197  break;
199  x_Init(align.GetSegs().GetStd());
200  break;
202  x_Init(align.GetSegs().GetPacked());
203  break;
205  x_Init(align.GetSegs().GetDisc());
206  break;
208  x_Init(align.GetSegs().GetSpliced());
209  break;
211  x_Init(align.GetSegs().GetSparse());
212  break;
213  default:
214  break;
215  }
216 }
217 
218 
219 // Add new segment with the given length and dimension.
221  size_t dim,
222  ENa_strand strand)
223 {
224  // The order of storing parsed segments depends on the strand
225  // so that the segments always go in coordinate order, not in
226  // biological one.
227  if ( !IsReverse(strand) ) {
228  m_Segs.push_back(SAlignment_Segment(len, dim));
229  return m_Segs.back();
230  }
231  else {
232  m_Segs.push_front(SAlignment_Segment(len, dim));
233  return m_Segs.front();
234  }
235 }
236 
237 
238 // Insert new segment. Used when splitting a partially mapped segment.
240 CSeq_align_Mapper_Base::x_InsertSeg(TSegments::iterator& where,
241  int len,
242  size_t dim,
243  bool reverse)
244 {
245  TSegments::iterator ins_it =
246  m_Segs.insert(where, SAlignment_Segment(len, dim));
247  if ( reverse ) {
248  where = ins_it;
249  }
250  return *ins_it;
251 }
252 
253 
254 // Parse dense-diag alignment.
256 {
257  ITERATE(TDendiag, diag_it, diags) {
258  // Make sure all values are consistent. Post warnings and try to
259  // fix any incorrect values.
260  const CDense_diag& diag = **diag_it;
261  size_t dim = diag.GetDim();
262  if (dim != diag.GetIds().size()) {
263  ERR_POST_X(1, Warning << "Invalid 'ids' size in dendiag");
264  dim = min(dim, diag.GetIds().size());
265  }
266  if (dim != diag.GetStarts().size()) {
267  ERR_POST_X(2, Warning << "Invalid 'starts' size in dendiag");
268  dim = min(dim, diag.GetStarts().size());
269  }
270  // Remember if the original alignment contained any strands.
271  m_HaveStrands = diag.IsSetStrands();
272  if (m_HaveStrands && dim != diag.GetStrands().size()) {
273  ERR_POST_X(3, Warning << "Invalid 'strands' size in dendiag");
274  dim = min(dim, diag.GetStrands().size());
275  }
276  if (dim != m_Dim) {
277  if ( m_Dim ) {
278  // Set the flag indicating that segments have different
279  // number of rows.
281  }
282  m_Dim = max(dim, m_Dim);
283  }
284  bool have_prot = false;
285  bool have_nuc = false;
286  // Initialize next segment.
287  SAlignment_Segment& seg = x_PushSeg(diag.GetLen(), dim);
289  if ( diag.IsSetScores() ) {
290  // Store per-segment scores if any.
291  CopyContainer<CDense_diag::TScores, TScores>(
292  diag.GetScores(), seg.m_Scores);
293  }
294  for (size_t row = 0; row < dim; ++row) {
295  if ( m_HaveStrands ) {
296  strand = diag.GetStrands()[row];
297  }
298  const CSeq_id& row_id = *diag.GetIds()[row];
299  int row_start = diag.GetStarts()[row];
300  // Adjust coordinates so that they are always genomic.
302  m_LocMapper.GetSeqTypeById(row_id);
303  if (row_type == CSeq_loc_Mapper_Base::eSeq_prot) {
304  if ( !have_prot ) {
305  // Adjust segment length only once!
306  have_prot = true;
307  seg.m_Len *= 3;
308  }
309  row_start *= 3;
310  }
311  else /*if (row_type == CSeq_loc_Mapper_Base::eSeq_nuc)*/ {
312  have_nuc = true;
313  }
314  // Add row.
315  seg.AddRow(row, row_id, row_start, m_HaveStrands, strand);
316  }
317  if (have_prot && have_nuc) {
318  // This type of alignment does not support mixing sequence types.
319  NCBI_THROW(CAnnotMapperException, eBadAlignment,
320  "Dense-diags with mixed sequence types are not supported");
321  }
322  }
323 }
324 
325 
326 // Parse dense-seg.
328 {
329  m_Dim = denseg.GetDim();
330  size_t numseg = denseg.GetNumseg();
331  // Make sure all values are consistent. Post warnings and try to
332  // fix any incorrect values.
333  if (numseg != denseg.GetLens().size()) {
334  ERR_POST_X(4, Warning << "Invalid 'lens' size in denseg");
335  numseg = min(numseg, denseg.GetLens().size());
336  }
337  if (m_Dim != denseg.GetIds().size()) {
338  ERR_POST_X(5, Warning << "Invalid 'ids' size in denseg");
339  m_Dim = min(m_Dim, denseg.GetIds().size());
340  }
341  if (m_Dim*numseg != denseg.GetStarts().size()) {
342  ERR_POST_X(6, Warning << "Invalid 'starts' size in denseg");
343  m_Dim = min(m_Dim*numseg, denseg.GetStarts().size()) / numseg;
344  }
345  m_HaveStrands = denseg.IsSetStrands();
346  if (m_HaveStrands && m_Dim*numseg != denseg.GetStrands().size()) {
347  ERR_POST_X(7, Warning << "Invalid 'strands' size in denseg");
348  m_Dim = min(m_Dim*numseg, denseg.GetStrands().size()) / numseg;
349  }
350  if ( denseg.IsSetScores() ) {
351  // Store scores in the segments. Only pointers are copied,
352  // the objects are cloned only to the final mapped alignment.
353  CopyContainer<CDense_seg::TScores, TScores>(
354  denseg.GetScores(), m_SegsScores);
355  }
356  // Check sequence types.
357  bool have_nuc = false;
358  bool have_prot = false;
359  set<CSeq_id_Handle> unknown;
360  for (size_t row = 0; row < m_Dim; ++row) {
361  const CSeq_id& seq_id = *denseg.GetIds()[row];
365  switch ( seq_type ) {
367  have_nuc = true;
368  break;
370  have_prot = true;
371  break;
372  default:
373  // Collect sequences with unknown types.
374  unknown.insert(idh);
375  break;
376  }
377  }
378  if (have_prot && have_nuc) {
379  NCBI_THROW(CAnnotMapperException, eBadAlignment,
380  "Dense-segs with mixed sequence types are not supported");
381  }
382  if ((have_nuc || have_prot) && !unknown.empty()) {
383  CSeq_loc_Mapper_Base::ESeqType seq_type = have_nuc ?
385  // If there are sequences of unknown types, assume they are the same as
386  // the known ones - dense-seg can not contain mixed types.
387  ITERATE(set<CSeq_id_Handle>, it, unknown) {
388  m_LocMapper.SetSeqTypeById(*it, seq_type);
389  }
390  }
391 
392  int width = have_prot ? 3 : 1;
394  for (size_t seg = 0; seg < numseg; seg++) {
395  // Create new segment.
396  SAlignment_Segment& alnseg = x_PushSeg(denseg.GetLens()[seg], m_Dim);
397  for (size_t row = 0; row < m_Dim; ++row) {
398  if ( m_HaveStrands ) {
399  strand = denseg.GetStrands()[seg*m_Dim + row];
400  }
401  const CSeq_id& seq_id = *denseg.GetIds()[row];
402  int start = denseg.GetStarts()[seg*m_Dim + row]*width;
403  alnseg.AddRow(row, seq_id, start, m_HaveStrands, strand);
404  }
405  // For proteins segment length needs to be adjusted.
406  alnseg.m_Len *= width;
407  }
408 }
409 
410 
411 // Parse std-seg.
413 {
414  vector<int> seglens;
415  seglens.reserve(sseg.size());
416  // Several passes are required to detect sequence types and lengths.
417  ITERATE(CSeq_align::C_Segs::TStd, it, sseg) {
418  // Two different location lengths are allowed - for nucs and prots.
419  int minlen = 0;
420  int maxlen = 0;
421  // First pass - find min and max segment lengths.
422  ITERATE( CStd_seg::TLoc, it_loc, (*it)->GetLoc()) {
423  const CSeq_loc& loc = **it_loc;
424  const CSeq_id* id = loc.GetId();
425  int len = loc.GetTotalRange().GetLength();
426  if (len == 0 || loc.IsWhole()) {
427  continue; // ignore unknown lengths
428  }
429  if ( !id ) {
430  // Mixed ids in the same row?
431  NCBI_THROW(CAnnotMapperException, eBadAlignment,
432  "Locations with mixed seq-ids are not supported "
433  "in std-seg alignments");
434  }
435  // Store min and max lengths of locations. By default use min.
436  if (minlen == 0 || len == minlen) {
437  minlen = len;
438  }
439  else if (maxlen == 0 || len == maxlen) {
440  maxlen = len;
441  // If necessary, swap the two lengths.
442  if (minlen > maxlen) {
443  swap(minlen, maxlen);
444  }
445  }
446  else {
447  // Both minlen and maxlen are set, len differs from both.
448  // More than two different lengths in the same segment.
449  NCBI_THROW(CAnnotMapperException, eBadAlignment,
450  "Rows of the same std-seg have different lengths");
451  }
452  }
453  // Two different lengths were found. Try to guess sequence types.
454  if (minlen != 0 && maxlen != 0) {
455  if (minlen*3 != maxlen) {
456  NCBI_THROW(CAnnotMapperException, eBadAlignment,
457  "Inconsistent seq-loc lengths in std-seg rows");
458  }
459  // Found both nucs and prots - make the second pass and
460  // store widths for all sequences.
461  ITERATE( CStd_seg::TLoc, it_loc, (*it)->GetLoc()) {
462  const CSeq_loc& loc = **it_loc;
463  const CSeq_id* id = loc.GetId();
464  int len = loc.GetTotalRange().GetLength();
465  if (len == 0 || loc.IsWhole()) {
466  continue; // ignore unknown lengths
467  }
468  _ASSERT(id); // All locations should have been checked.
469  CSeq_loc_Mapper_Base::ESeqType newtype = (len == minlen) ?
473  // Check if seq-type is available from the location mapper.
476  if (seqtype != CSeq_loc_Mapper_Base::eSeq_unknown) {
477  if (seqtype != newtype) {
478  NCBI_THROW(CAnnotMapperException, eBadAlignment,
479  "Segment lengths in std-seg alignment are "
480  "inconsistent with sequence types");
481  }
482  }
483  else {
484  if (newtype == CSeq_loc_Mapper_Base::eSeq_prot) {
485  // Try to change all types to prot, adjust coords
486  // This is required in cases when the loc-mapper
487  // could not detect protein during initialization
488  // because there were no nucs to compare to.
490  }
491  // Set type anyway -- x_AdjustSeqTypesToProt could ignore it.
492  m_LocMapper.SetSeqTypeById(idh, newtype);
493  }
494  }
495  }
496  // -1 indicates unknown sequence type or equal lengths for all rows.
497  // We need to know this to use the correct length below, so use -1
498  // rather than real length.
499  seglens.push_back(maxlen == 0 ? -1 : maxlen);
500  }
501  // By this point all possible sequence types should be detected and
502  // stored in the loc-mapper.
503  // All unknown types are treated as nucs.
504 
505  size_t seg_idx = 0;
506  // Final pass - parse the alignment.
507  ITERATE (CSeq_align::C_Segs::TStd, it, sseg) {
508  const CStd_seg& stdseg = **it;
509  size_t dim = stdseg.GetDim();
510  if (stdseg.IsSetIds()
511  && dim != stdseg.GetIds().size()) {
512  ERR_POST_X(8, Warning << "Invalid 'ids' size in std-seg");
513  dim = min(dim, stdseg.GetIds().size());
514  }
515  // seg_len may be -1 indicating that the real length is
516  // unknown (due to unknown sequence type or a non-interval location).
517  // We'll fix this later.
518  int seg_len = seglens[seg_idx++];
519  SAlignment_Segment& seg = x_PushSeg(seg_len, dim);
520  if ( stdseg.IsSetScores() ) {
521  CopyContainer<CStd_seg::TScores, TScores>(
522  stdseg.GetScores(), seg.m_Scores);
523  }
524  unsigned int row_idx = 0;
525  ITERATE ( CStd_seg::TLoc, it_loc, (*it)->GetLoc() ) {
526  if (row_idx > dim) {
527  ERR_POST_X(9, Warning << "Invalid number of rows in std-seg");
528  dim = row_idx;
529  seg.m_Rows.resize(dim);
530  }
531  const CSeq_loc& loc = **it_loc;
532  const CSeq_id* id = loc.GetId();
533  if ( !id ) {
534  // All supported location types must have a single id.
535  NCBI_THROW(CAnnotMapperException, eBadAlignment,
536  "Missing or multiple seq-ids in std-seg alignment");
537  }
538 
541  seq_type = m_LocMapper.GetSeqTypeById(*id);
542  int width = (seq_type == CSeq_loc_Mapper_Base::eSeq_prot) ? 3 : 1;
543  // Empty and whole locations will set the correct start and length
544  // below, don't check this now.
545  int start = loc.GetTotalRange().GetFrom()*width;
546  int len = loc.GetTotalRange().GetLength()*width;
548  bool have_strand = false;
549  switch ( loc.Which() ) {
550  case CSeq_loc::e_Empty:
551  // Adjust start, length should be 0.
552  start = (int)kInvalidSeqPos;
553  break;
554  case CSeq_loc::e_Whole:
555  start = 0;
556  len = 0; // Set length to 0 - it's unknown.
557  break;
558  case CSeq_loc::e_Int:
559  have_strand = loc.GetInt().IsSetStrand();
560  break;
561  case CSeq_loc::e_Pnt:
562  have_strand = loc.GetPnt().IsSetStrand();
563  break;
564  default:
565  NCBI_THROW(CAnnotMapperException, eBadAlignment,
566  "Unsupported seq-loc type in std-seg alignment");
567  }
568  if ( have_strand ) {
569  m_HaveStrands = true;
570  strand = loc.GetStrand();
571  }
572  // Now the final adjustment of the length. If for the current row
573  // it's set, but not equal to the segment-wide length, there are
574  // two possibilities:
575  if (len > 0 && len != seg_len) {
576  // The segment-wide length is unknown or equal for all rows.
577  // We can set it now, when we have at least one row with
578  // real length.
579  if (seg_len == -1 && seg.m_Len == -1) {
580  seg_len = len;
581  seg.m_Len = len;
582  }
583  else {
584  // The segment-wide length is known, but different from
585  // this row's length. Fail.
586  NCBI_THROW(CAnnotMapperException, eBadAlignment,
587  "Rows have different lengths in std-seg");
588  }
589  }
590  seg.AddRow(row_idx++, *id, start, m_HaveStrands, strand);
591  }
592  // Check if all segments have the same number of rows.
593  if (dim != m_Dim) {
594  if ( m_Dim ) {
596  }
597  m_Dim = max(dim, m_Dim);
598  }
599  }
600 }
601 
602 
604 {
605  m_Dim = pseg.GetDim();
606  size_t numseg = pseg.GetNumseg();
607  // Make sure all values are consistent. Post warnings and try to
608  // fix any incorrect values.
609  if (numseg != pseg.GetLens().size()) {
610  ERR_POST_X(10, Warning << "Invalid 'lens' size in packed-seg");
611  numseg = min(numseg, pseg.GetLens().size());
612  }
613  if (m_Dim != pseg.GetIds().size()) {
614  ERR_POST_X(11, Warning << "Invalid 'ids' size in packed-seg");
615  m_Dim = min(m_Dim, pseg.GetIds().size());
616  }
617  if (m_Dim*numseg != pseg.GetStarts().size()) {
618  ERR_POST_X(12, Warning << "Invalid 'starts' size in packed-seg");
619  m_Dim = min(m_Dim*numseg, pseg.GetStarts().size()) / numseg;
620  }
621  if (m_Dim*numseg != pseg.GetPresent().size()) {
622  ERR_POST_X(20, Warning << "Invalid 'present' size in packed-seg");
623  m_Dim = min(m_Dim*numseg, pseg.GetPresent().size()) / numseg;
624  }
625  m_HaveStrands = pseg.IsSetStrands();
626  if (m_HaveStrands && m_Dim*numseg != pseg.GetStrands().size()) {
627  ERR_POST_X(13, Warning << "Invalid 'strands' size in packed-seg");
628  m_Dim = min(m_Dim*numseg, pseg.GetStrands().size()) / numseg;
629  }
630  if ( pseg.IsSetScores() ) {
631  // Copy pointers to scores if any.
632  CopyContainer<CPacked_seg::TScores, TScores>(
633  pseg.GetScores(), m_SegsScores);
634  }
636  for (size_t seg = 0; seg < numseg; seg++) {
637  // By default treat the segment as nuc-only, don't adjust lengths.
638  // If there are any proteins involved, this will be set to 3.
639  int seg_width = 1;
640  // Remember if there are any nucs.
641  bool have_nuc = false;
642  SAlignment_Segment& alnseg = x_PushSeg(pseg.GetLens()[seg], m_Dim);
643  for (unsigned int row = 0; row < m_Dim; row++) {
644  if ( m_HaveStrands ) {
645  strand = pseg.GetStrands()[seg*m_Dim + row];
646  }
647  // Check sequence type for this row.
648  int row_width = 1;
649  const CSeq_id& id = *pseg.GetIds()[row];
652  // If this is a protein, adjust widths.
653  if (seqtype == CSeq_loc_Mapper_Base::eSeq_prot) {
654  seg_width = 3;
655  row_width = 3;
656  }
657  else {
658  have_nuc = true;
659  }
660  alnseg.AddRow(row, id,
661  (pseg.GetPresent()[seg*m_Dim + row] ?
662  pseg.GetStarts()[seg*m_Dim + row]*row_width : kInvalidSeqPos),
663  m_HaveStrands, strand);
664  }
665  // If there are both nucs and prots, fail.
666  if (have_nuc && seg_width == 3) {
667  NCBI_THROW(CAnnotMapperException, eBadAlignment,
668  "Packed-segs with mixed sequence types are not supported");
669  }
670  // If there are only prots, adjust segment length.
671  alnseg.m_Len *= seg_width;
672  }
673 }
674 
675 
676 // Parse align-set
678 {
679  // Iterate sub-alignments, create a new mapper for each of them.
680  const CSeq_align_set::Tdata& data = align_set.Get();
682  m_SubAligns.push_back(Ref(CreateSubAlign(**it)));
683  }
684 }
685 
686 
687 // Parse a single spliced exon. A separate align-mapper is created
688 // for each exon.
690  const CSpliced_exon& exon)
691 {
692  m_OrigExon.Reset(&exon);
693  const CSeq_id* gen_id = spliced.IsSetGenomic_id() ?
694  &spliced.GetGenomic_id() : 0;
695  const CSeq_id* prod_id = spliced.IsSetProduct_id() ?
696  &spliced.GetProduct_id() : 0;
697 
698  m_Dim = 2;
699 
700  if ( exon.IsSetScores() ) {
701  // Copy pointers to scores if any.
702  CopyContainer<CScore_set::Tdata, TScores>(
703  exon.GetScores(), m_SegsScores);
704  }
705 
706  m_HaveStrands =
707  spliced.IsSetGenomic_strand() || spliced.IsSetProduct_strand();
708  ENa_strand gen_strand = spliced.IsSetGenomic_strand() ?
710  ENa_strand prod_strand = spliced.IsSetProduct_strand() ?
712 
713  // Get per-exon ids, use per-alignment ids if local ones are not set.
714  const CSeq_id* ex_gen_id = exon.IsSetGenomic_id() ?
715  &exon.GetGenomic_id() : gen_id;
716  const CSeq_id* ex_prod_id = exon.IsSetProduct_id() ?
717  &exon.GetProduct_id() : prod_id;
718  // Make sure ids are set at least somewhere.
719  if ( !ex_gen_id ) {
720  ERR_POST_X(14, Warning << "Missing genomic id in spliced-seg");
721  return;
722  }
723  if ( !ex_prod_id ) {
724  ERR_POST_X(15, Warning << "Missing product id in spliced-seg");
725  return;
726  }
728  exon.IsSetGenomic_strand() || exon.IsSetProduct_strand();
729  ENa_strand ex_gen_strand = exon.IsSetGenomic_strand() ?
730  exon.GetGenomic_strand() : gen_strand;
731  ENa_strand ex_prod_strand = exon.IsSetProduct_strand() ?
732  exon.GetProduct_strand() : prod_strand;
733 
734  int gen_start = exon.GetGenomic_start();
735  int gen_end = exon.GetGenomic_end() + 1;
736 
737  // Both start and stop will be converted to genomic coords.
738  int prod_start, prod_end;
739 
740  prod_start = exon.GetProduct_start().AsSeqPos();
741  prod_end = exon.GetProduct_end().AsSeqPos() + 1;
742 
743  if ( exon.IsSetParts() ) {
744  // Iterate exon parts.
745  ITERATE(CSpliced_exon::TParts, it, exon.GetParts()) {
746  const CSpliced_exon_chunk& part = **it;
747  // The length in spliced-seg is already genomic.
748  TSeqPos seg_len =
750  if (seg_len == 0) {
751  continue;
752  }
753 
754  SAlignment_Segment& alnseg = x_PushSeg(seg_len, 2);
755  alnseg.m_PartType = part.Which();
756 
757  int part_gen_start;
758  // Check the genomic strand only if genomic sequence is not
759  // missing.
760  if ( part.IsProduct_ins() ) {
761  part_gen_start = -1;
762  }
763  else {
764  if ( !IsReverse(ex_gen_strand) ) {
765  part_gen_start = gen_start;
766  gen_start += seg_len;
767  }
768  else {
769  gen_end -= seg_len;
770  part_gen_start = gen_end;
771  }
772  }
774  *ex_gen_id, part_gen_start, m_HaveStrands, ex_gen_strand);
775 
776  int part_prod_start;
777  // Check the product strand only if product sequence is not
778  // missing.
779  if ( part.IsGenomic_ins() ) {
780  part_prod_start = -1;
781  }
782  else {
783  if ( !IsReverse(ex_prod_strand) ) {
784  part_prod_start = prod_start;
785  prod_start += seg_len;
786  }
787  else {
788  prod_end -= seg_len;
789  part_prod_start = prod_end;
790  }
791  }
793  *ex_prod_id, part_prod_start, m_HaveStrands, ex_prod_strand);
794  }
795  }
796  else {
797  // No parts, use the whole exon.
798  TSeqPos seg_len = gen_end - gen_start;
799  SAlignment_Segment& alnseg = x_PushSeg(seg_len, 2);
802  *ex_gen_id, gen_start, m_HaveStrands, ex_gen_strand);
804  *ex_prod_id, prod_start, m_HaveStrands, ex_prod_strand);
805  }
806 }
807 
808 
809 // Parse spliced-seg.
811 {
812  // Iterate exons, create sub-mapper for each one.
813  ITERATE(CSpliced_seg::TExons, it, spliced.GetExons() ) {
814  m_SubAligns.push_back(Ref(CreateSubAlign(spliced, **it)));
815  }
816 }
817 
818 
819 // Parse sparse-seg.
821 {
822  // Only single-row alignments are currently supported
823  if ( sparse.GetRows().size() > 1) {
824  NCBI_THROW(CAnnotMapperException, eBadAlignment,
825  "Sparse-segs with multiple rows are not supported");
826  }
827  if ( sparse.GetRows().empty() ) {
828  return;
829  }
830  if ( sparse.IsSetRow_scores() ) {
831  // Copy pointers to the scores.
832  CopyContainer<CSparse_seg::TRow_scores, TScores>(
833  sparse.GetRow_scores(), m_SegsScores);
834  }
835 
836  // Make sure all values are consistent. Post warnings and try to
837  // fix any incorrect values.
838  const CSparse_align& row = *sparse.GetRows().front();
839  m_Dim = 2;
840 
841  size_t numseg = row.GetNumseg();
842  if (numseg != row.GetFirst_starts().size()) {
843  ERR_POST_X(16, Warning <<
844  "Invalid 'first-starts' size in sparse-align");
845  numseg = min(numseg, row.GetFirst_starts().size());
846  }
847  if (numseg != row.GetSecond_starts().size()) {
848  ERR_POST_X(17, Warning <<
849  "Invalid 'second-starts' size in sparse-align");
850  numseg = min(numseg, row.GetSecond_starts().size());
851  }
852  if (numseg != row.GetLens().size()) {
853  ERR_POST_X(18, Warning << "Invalid 'lens' size in sparse-align");
854  numseg = min(numseg, row.GetLens().size());
855  }
856  m_HaveStrands = row.IsSetSecond_strands();
857  if (m_HaveStrands && numseg != row.GetSecond_strands().size()) {
858  ERR_POST_X(19, Warning <<
859  "Invalid 'second-strands' size in sparse-align");
860  numseg = min(numseg, row.GetSecond_strands().size());
861  }
862 
863  // Check sequence types, make sure they are the same.
864  CSeq_loc_Mapper_Base::ESeqType first_type =
865  m_LocMapper.GetSeqTypeById(row.GetFirst_id());
866  int width = (first_type == CSeq_loc_Mapper_Base::eSeq_prot) ? 3 : 1;
867  CSeq_loc_Mapper_Base::ESeqType second_type =
868  m_LocMapper.GetSeqTypeById(row.GetSecond_id());
869  int second_width =
870  (second_type == CSeq_loc_Mapper_Base::eSeq_prot) ? 3 : 1;
871  if (width != second_width) {
872  NCBI_THROW(CAnnotMapperException, eBadAlignment,
873  "Sparse-segs with mixed sequence types are not supported");
874  }
875  ssize_t scores_group = -1;
876  if ( row.IsSetSeg_scores() ) {
877  // If per-row scores are set, store them along with the group number.
878  // Only pointers are copied.
879  scores_group = m_GroupScores.size();
880  m_GroupScores.resize(m_GroupScores.size() + 1);
881  CopyContainer<CSparse_align::TSeg_scores, TScores>(
882  row.GetSeg_scores(), m_GroupScores[scores_group]);
883  }
884  // Iterate segments.
885  for (size_t seg = 0; seg < numseg; seg++) {
886  SAlignment_Segment& alnseg =
887  x_PushSeg(row.GetLens()[seg]*width, m_Dim);
888  alnseg.m_ScoresGroupIdx = scores_group;
889  alnseg.AddRow(0, row.GetFirst_id(),
890  row.GetFirst_starts()[seg]*width,
893  alnseg.AddRow(1, row.GetSecond_id(),
894  row.GetSecond_starts()[seg]*width,
896  m_HaveStrands ? row.GetSecond_strands()[seg] : eNa_strand_unknown);
897  }
898 }
899 
900 
901 // Mapping through CSeq_loc_Mapper
902 
903 // Convert the whole seq-align.
905 {
906  m_DstAlign.Reset();
907 
908  // If the alignment is a set of sub-alignments, iterate all sub-mappers.
909  if ( !m_SubAligns.empty() ) {
911  (*it)->Convert();
912  // Check if the top-level scores must be invalidated.
913  // If any sub-mapper has invalidated its scores due
914  // to partial mapping, the global scores are also
915  // not valid anymore.
916  if ( (*it)->m_ScoresInvalidated ) {
918  }
919  }
920  return;
921  }
922  // This is a single alignment with one level - map it.
923  // NULL is a pointer to the row to be mapped. If it's NULL,
924  // all rows are mapped.
926 }
927 
928 
929 // Convert a single alignment row.
931 {
932  m_DstAlign.Reset();
933 
934  // If the alignment is a set of sub-alignments, iterate all sub-mappers.
935  if ( !m_SubAligns.empty() ) {
937  (*it)->Convert(row);
938  if ( (*it)->m_ScoresInvalidated ) {
940  }
941  }
942  return;
943  }
944  // This is a single alignment with one level - map the requested row.
946 }
947 
948 
949 // Map a single alignment row if it's not NULL, or all rows otherwise.
951 {
952  if ( m_Segs.empty() ) {
953  return;
954  }
955  if ( row ) {
956  x_ConvertRow(*row);
957  return;
958  }
959  for (size_t row_idx = 0; row_idx < m_Dim; ++row_idx) {
960  x_ConvertRow(row_idx);
961  }
962 }
963 
964 
965 // Map a single row.
967 {
968  CSeq_id_Handle dst_id;
969  // Iterate all segments.
970  TSegments::iterator seg_it = m_Segs.begin();
971  for ( ; seg_it != m_Segs.end(); ) {
972  if (seg_it->m_Rows.size() <= row) {
973  // No such row in the current segment
974  ++seg_it;
975  // This alignment has different number of rows in
976  // different segments.
978  continue;
979  }
980  // Try to convert the current segment.
981  CSeq_id_Handle seg_id = x_ConvertSegment(seg_it, row);
982  if (seg_id) {
983  // Success. Check if all mappings resulted in the
984  // same mapped id.
985  if (dst_id && dst_id != seg_id &&
987  // Mark the alignment as having multiple ids per row.
988  // Not all alignment types support this, so we may need
989  // to change the type from the original one later.
991  }
992  // Remember the last mapped id.
993  dst_id = seg_id;
994  }
995  }
996 }
997 
998 
999 // Convert a single segment of a single row.
1000 // This is where the real mapping is done.
1002 CSeq_align_Mapper_Base::x_ConvertSegment(TSegments::iterator& seg_it,
1003  size_t row)
1004 {
1005  // Remember the iterator position - mapping can add segments,
1006  // we need to know which should be mapped next.
1007  // old_it keeps the segment to be mapped, seg_it is the next segment,
1008  // any additional segments are inserted before it.
1009  TSegments::iterator old_it = seg_it;
1010  SAlignment_Segment& seg = *old_it;
1011  ++seg_it;
1012 
1014 
1015  // Find all matching mappings.
1018  idmap.find(m_LocMapper.x_GetPrimaryId(aln_row.m_Id));
1019  if (id_it == idmap.end()) {
1020  // Id not found in the segment, leave the row unchanged.
1021  return aln_row.m_Id;
1022  }
1023  const CMappingRanges::TRangeMap& rmap = id_it->second;
1024  if ( rmap.empty() ) {
1025  // No mappings for this segment - the row should not be
1026  // changed. Return the original id.
1027  return aln_row.m_Id;
1028  }
1029  // Sort mappings related to this segment/row.
1030  typedef vector< CRef<CMappingRange> > TSortedMappings;
1031  TSortedMappings mappings;
1032  CMappingRanges::TRangeIterator rg_it = rmap.begin();
1033  for ( ; rg_it; ++rg_it) {
1034  mappings.push_back(rg_it->second);
1035  }
1036  sort(mappings.begin(), mappings.end(), CMappingRangeRef_Less());
1037 
1038  CSeq_id_Handle dst_id;
1039 
1040  // Handle rows with gaps.
1041  if (aln_row.m_Start == kInvalidSeqPos) {
1042  // Gap. Check the mappings. If there's at least one mapping for this
1043  // id, change it to the destination one.
1044  dst_id = mappings[0]->GetDstIdHandle();
1045  // If there are multiple mappings, check if they all have the same
1046  // destination id. If there are many of them, do nothing - this gap
1047  // can not be mapped.
1048  if (mappings.size() > 1) {
1049  ITERATE(TSortedMappings, it, mappings) {
1050  if ((*it)->GetDstIdHandle() != dst_id) {
1051  return CSeq_id_Handle(); // Use empty id to report gaps.
1052  }
1053  }
1054  }
1055  // There's just one destination id, map the gap.
1056  seg.m_Rows[row].m_Id = dst_id;
1057  seg.m_Rows[row].SetMapped();
1058  return seg.m_Rows[row].m_Id;
1059  }
1060 
1061  // Prepare insert point depending on the source strand
1062  TSegments::iterator ins_point = seg_it;
1063  bool src_reverse = aln_row.m_IsSetStrand ? IsReverse(aln_row.m_Strand) : false;
1064 
1065  bool mapped = false;
1066  EAlignFlags align_flags = eAlign_Normal;
1067  TSeqPos start = aln_row.m_Start;
1068  TSeqPos stop = start + seg.m_Len - 1;
1069  // left_shift indicates which portion of the segment has been mapped
1070  // so far.
1071  TSeqPos left_shift = 0;
1072  int group_idx = 0;
1073  for (size_t map_idx = 0; map_idx < mappings.size(); ++map_idx) {
1074  CRef<CMappingRange> mapping(mappings[map_idx]);
1075  if (!mapping->CanMap(start, stop,
1076  aln_row.m_IsSetStrand &&
1078  aln_row.m_Strand)) {
1079  // Mapping does not apply to this segment/row, leave it unchanged.
1080  continue;
1081  }
1082 
1083  // Check the destination id, set the flag if the row is mapped
1084  // to multiple ids.
1085  if ( dst_id ) {
1086  if (mapping->m_Dst_id_Handle != dst_id) {
1087  align_flags = eAlign_MultiId;
1088  }
1089  }
1090  dst_id = mapping->m_Dst_id_Handle;
1091 
1092  group_idx = mapping->m_Group;
1093 
1094  // At least part of the interval was converted. Calculate
1095  // trimming coords, split each row if necessary. We will need to add
1096  // new segments on the left/right to preserve the parts which could
1097  // not be mapped.
1098  TSeqPos dl = mapping->m_Src_from <= start ?
1099  0 : mapping->m_Src_from - start;
1100  TSeqPos dr = mapping->m_Src_to >= stop ?
1101  0 : stop - mapping->m_Src_to;
1102  if ((dl || dr) &&
1104  NCBI_THROW(CAnnotMapperException, eCanNotMap,
1105  "Alignment segment can not be mapped without trimming.");
1106  }
1107  if (dl > 0) {
1108  // Add segment for the skipped range on the left.
1109  // Copy the original segment.
1110  SAlignment_Segment& lseg =
1111  x_InsertSeg(ins_point, dl, seg.m_Rows.size(), src_reverse);
1112  lseg.m_GroupIdx = group_idx;
1113  lseg.m_PartType = old_it->m_PartType;
1114  // Iterate all rows, adjust their starts.
1115  for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
1117  lseg.CopyRow(r, seg.m_Rows[r]);
1118  if (r == row) {
1119  // The row which could not be mapped has a gap.
1120  lrow.m_Start = kInvalidSeqPos;
1121  lrow.m_Id = dst_id;
1122  }
1123  else if (lrow.m_Start != kInvalidSeqPos) {
1124  // All other rows have new starts.
1125  if (lrow.SameStrand(aln_row)) {
1126  lrow.m_Start += left_shift;
1127  }
1128  else {
1129  lrow.m_Start += seg.m_Len - lseg.m_Len - left_shift;
1130  }
1131  }
1132  }
1133  }
1134  start += dl;
1135  left_shift += dl;
1136  // At least part of the interval was converted. Add new segment for
1137  // this range.
1138  SAlignment_Segment& mseg = x_InsertSeg(ins_point,
1139  stop - dr - start + 1, seg.m_Rows.size(), src_reverse);
1140  mseg.m_GroupIdx = group_idx;
1141  mseg.m_PartType = old_it->m_PartType;
1142  if (!dl && !dr) {
1143  // Copy scores if there's no truncation.
1144  mseg.m_Scores = seg.m_Scores;
1146  }
1147  else {
1148  // Invalidate all scores related to the segment and all
1149  // parent's scores.
1150  x_InvalidateScores(&seg);
1151  }
1152  ENa_strand dst_strand = eNa_strand_unknown;
1153  // Fill the new segment.
1154  for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
1156  mseg.CopyRow(r, seg.m_Rows[r]);
1157  if (r == row) {
1158  // Translate id and coords of the mapped row.
1159  CMappingRange::TRange mapped_rg =
1160  mapping->Map_Range(start, stop - dr);
1161  mapping->Map_Strand(
1162  aln_row.m_IsSetStrand,
1163  aln_row.m_Strand,
1164  &dst_strand);
1165  mrow.m_Id = mapping->m_Dst_id_Handle;
1166  mrow.m_Start = mapped_rg.GetFrom();
1167  mrow.m_IsSetStrand =
1168  mrow.m_IsSetStrand || (dst_strand != eNa_strand_unknown);
1169  mrow.m_Strand = dst_strand;
1170  mrow.SetMapped();
1171  mseg.m_HaveStrands = mseg.m_HaveStrands ||
1172  mrow.m_IsSetStrand;
1174  }
1175  else {
1176  // Adjust starts of all other rows.
1177  if (mrow.m_Start != kInvalidSeqPos) {
1178  if (mrow.SameStrand(aln_row)) {
1179  mrow.m_Start += left_shift;
1180  }
1181  else {
1182  mrow.m_Start +=
1183  seg.m_Len - mseg.m_Len - left_shift;
1184  }
1185  }
1186  }
1187  }
1188  left_shift += mseg.m_Len;
1189  start += mseg.m_Len;
1190  mapped = true;
1191  if (start > stop) break;
1192  }
1193  // Update alignment flags.
1194  if (align_flags == eAlign_MultiId && m_AlignFlags == eAlign_Normal) {
1195  m_AlignFlags = align_flags;
1196  }
1197  if ( !mapped ) {
1198  // Nothing could be mapped from this row, although some mappings for
1199  // the id do exist. Do not erase the segment, just change the row id
1200  // and reset start to convert it to gap on the destination sequence.
1201  // Use destination id of the first mapping for the source id. This
1202  // should not be very important, since we have a gap anyway. (?)
1203  seg.m_Rows[row].m_Start = kInvalidSeqPos;
1204  seg.m_Rows[row].m_Id = rmap.begin()->second->m_Dst_id_Handle;
1205  seg.m_Rows[row].SetMapped();
1206  x_InvalidateScores(&seg);
1207  return seg.m_Rows[row].m_Id;
1208  }
1209  if (start <= stop) {
1210  // Add the remaining unmapped range if any.
1211  SAlignment_Segment& rseg = x_InsertSeg(ins_point,
1212  stop - start + 1, seg.m_Rows.size(), src_reverse);
1213  rseg.m_GroupIdx = group_idx;
1214  rseg.m_PartType = old_it->m_PartType;
1215  for (size_t r = 0; r < seg.m_Rows.size(); ++r) {
1217  rseg.CopyRow(r, seg.m_Rows[r]);
1218  if (r == row) {
1219  // The mapped row was truncated and now has a gap.
1220  rrow.m_Start = kInvalidSeqPos;
1221  rrow.m_Id = dst_id;
1222  }
1223  else if (rrow.m_Start != kInvalidSeqPos) {
1224  if (rrow.SameStrand(aln_row)) {
1225  rrow.m_Start += left_shift;
1226  }
1227  }
1228  }
1229  }
1230  // Remove the original segment from the alignment.
1231  m_Segs.erase(old_it);
1232  return align_flags == eAlign_MultiId ? CSeq_id_Handle() : dst_id;
1233 }
1234 
1235 
1236 // Get mapped alignment
1237 
1238 // Checks each row for strand information. If found, stores the
1239 // strand in the container. It will be used to set strand in gaps.
1240 // Looks only for the first known strand in each row. Does not
1241 // check if strand is the same for the whole row.
      // On return 'strands' holds exactly one entry per row index; rows for
      // which no strand could be found are reported as eNa_strand_plus.
1243 {
1244  strands.clear();
      // NOTE(review): m_Segs.front() is used without an emptiness check -
      // presumably callers guarantee at least one segment; confirm.
1245  size_t max_rows = m_Segs.front().m_Rows.size();
1246  if (m_AlignFlags & eAlign_MultiDim) {
1247  // Segments may contain different number of rows, check each segment.
1248  ITERATE(TSegments, seg_it, m_Segs) {
1249  if (seg_it->m_Rows.size() > max_rows) {
1250  max_rows = seg_it->m_Rows.size();
1251  }
1252  }
1253  }
1254  strands.reserve(max_rows);
1255  for (size_t r_idx = 0; r_idx < max_rows; r_idx++) {
1256  ENa_strand strand = eNa_strand_unknown;
1257  // Skip gaps, try to find a row with mapped strand
1258  ITERATE(TSegments, seg_it, m_Segs) {
1259  // Make sure the row exists in the current segment.
1260  if (seg_it->m_Rows.size() <= r_idx) continue;
      // The search stops at the first segment whose start is not -1,
      // whatever strand value that row carries (it may still be unknown).
1261  if (seg_it->m_Rows[r_idx].GetSegStart() != -1) {
1262  strand = seg_it->m_Rows[r_idx].m_Strand;
1263  break;
1264  }
1265  }
1266  // Store the strand. Unknown strand is replaced with plus.
1267  strands.push_back(strand == eNa_strand_unknown ?
1268  eNa_strand_plus : strand);
1269  }
1270 }
1271 
1272 
1273 // Create dense-diag alignment.
      // Fills dst->SetSegs().SetDendiag() with one CDense_diag per segment of
      // m_Segs. Throws CAnnotMapperException (eBadAlignment) if any row of any
      // segment is a gap (m_Start == kInvalidSeqPos).
1275 {
1276  TDendiag& diags = dst->SetSegs().SetDendiag();
1277  TStrands strands;
1278  // Get information about strands for each row.
1279  x_FillKnownStrands(strands);
1280  // Create dense-diag for each segment.
1281  ITERATE(TSegments, seg_it, m_Segs) {
1282  const SAlignment_Segment& seg = *seg_it;
1283  CRef<CDense_diag> diag(new CDense_diag);
1284  diag->SetDim(static_cast<CDense_diag::TDim>(seg.m_Rows.size()));
      // Divisor for the segment length; switched to 3 below as soon as
      // one row of this segment is a protein.
1285  int len_width = 1;
1286  size_t str_idx = 0; // row index in the strands container
1287  // Add each row to the dense-diag.
1289  if (row->m_Start == kInvalidSeqPos) {
1290  // Dense-diags do not support gaps ('starts' contain
1291  // TSeqPos which can not be negative).
1292  NCBI_THROW(CAnnotMapperException, eBadAlignment,
1293  "Mapped alignment contains gaps and can not be "
1294  "converted to dense-diag.");
1295  }
1298  if (seq_type == CSeq_loc_Mapper_Base::eSeq_prot) {
1299  // If prots are present, segment length must be
1300  // converted to AAs.
1301  len_width = 3;
1302  }
      // Per-row divisor for the start coordinate: 3 for protein rows,
      // 1 for everything else.
1303  int seq_width =
1304  (seq_type == CSeq_loc_Mapper_Base::eSeq_prot) ? 3 : 1;
      // The CRef is created and then immediately re-pointed to the
      // seq-id object held by the row's id handle (constness cast away).
1305  CRef<CSeq_id> id(new CSeq_id);
1306  id.Reset(&const_cast<CSeq_id&>(*row->m_Id.GetSeqId()));
1307  diag->SetIds().push_back(id);
1308  diag->SetStarts().push_back(row->GetSegStart()/seq_width);
1309  if (seg.m_HaveStrands) { // per-segment strands
1310  // For gaps use the strand of the first mapped row,
1311  // see x_FillKnownStrands.
1312  diag->SetStrands().
1313  push_back((TSeqPos)row->GetSegStart() != kInvalidSeqPos ?
1314  row->m_Strand : strands[str_idx]);
1315  }
1316  str_idx++; // move to the strand for the next row
1317  }
1318  // Adjust segment length if there are any proteins.
1319  diag->SetLen(seg_it->m_Len/len_width);
1320  if ( !seg.m_Scores.empty() ) {
1321  // This will copy every element rather than just pointers.
1322  CloneContainer<CScore, TScores, CDense_diag::TScores>(
1323  seg.m_Scores, diag->SetScores());
1324  }
1325  diags.push_back(diag);
1326  }
1327 }
1328 
1329 
1330 // Create dense-seg alignment.
      // Fills dst->SetSegs().SetDenseg() from m_Segs: dim/numseg, alignment
      // level scores, one id per row, then lens/starts/strands per segment.
      // Throws CAnnotMapperException (eBadAlignment) if some row consists of
      // gaps only, since no seq-id can be chosen for it.
1332 {
1333  // Make sure all segments have the same number of rows -
1334  // dense-seg does not support multi-dim alignments.
1336 
      // The row count of the first segment is used as the dimension for
      // the whole alignment.
1337  CDense_seg& dseg = dst->SetSegs().SetDenseg();
1338  dseg.SetDim(static_cast<CDense_seg::TDim>(m_Segs.front().m_Rows.size()));
1339  dseg.SetNumseg(static_cast<CDense_seg::TNumseg>(m_Segs.size()));
1340  if ( !m_SegsScores.empty() ) {
1341  // This will copy every element rather than just pointers.
1342  CloneContainer<CScore, TScores, CDense_seg::TScores>(
1343  m_SegsScores, dseg.SetScores());
1344  }
      // Becomes 3 if the first non-gap id of any row is a protein; used
      // below to scale all segment lengths.
1345  int len_width = 1;
1346  // First pass: find first non-gap in each row, get its seq-id.
1347  for (size_t r = 0; r < m_Segs.front().m_Rows.size(); r++) {
1348  bool only_gaps = true;
1349  ITERATE(TSegments, seg, m_Segs) {
1350  const SAlignment_Segment::SAlignment_Row& row = seg->m_Rows[r];
1351  if (row.m_Start != kInvalidSeqPos) {
1352  // Not a gap - store the id
1353  CRef<CSeq_id> id(new CSeq_id);
1354  id.Reset(&const_cast<CSeq_id&>(*row.m_Id.GetSeqId()));
1355  dseg.SetIds().push_back(id);
1356  // Check sequence type, remember if lengths
1357  // need to be adjusted.
1360  if (seq_type != CSeq_loc_Mapper_Base::eSeq_unknown) {
1361  if (seq_type == CSeq_loc_Mapper_Base::eSeq_prot) {
1362  len_width = 3;
1363  }
1364  }
1365  only_gaps = false;
1366  break; // No need to check other segments of this row.
1367  }
1368  }
1369  // The row contains only gaps, don't know how to build a valid denseg
1370  if ( only_gaps ) {
1371  NCBI_THROW(CAnnotMapperException, eBadAlignment,
1372  "Mapped denseg contains empty row.");
1373  }
1374  }
1375  // Get information about strands for each row.
1376  TStrands strands;
1377  x_FillKnownStrands(strands);
      // Second pass: emit length, starts and strands for every segment.
1378  ITERATE(TSegments, seg_it, m_Segs) {
1379  dseg.SetLens().push_back(seg_it->m_Len/len_width);
1380  size_t str_idx = 0; // strands index for the current row
1381  ITERATE(SAlignment_Segment::TRows, row, seg_it->m_Rows) {
1382  int width = 1;
1383  // Are there any proteins in the alignment?
1384  if (len_width == 3) {
1385  // Adjust coordinates for proteins.
1386  if (m_LocMapper.GetSeqTypeById(row->m_Id) ==
1388  width = 3;
1389  }
1390  }
      // Negative starts are stored unchanged; only non-negative
      // starts are scaled.
1391  int start = row->GetSegStart();
1392  if (start >= 0) {
1393  start /= width;
1394  }
1395  dseg.SetStarts().push_back(start);
1396  // Are there any strands involved at all?
1397  if (m_HaveStrands) {
1398  // For gaps use the strand of the first mapped row.
      // For non-gaps an unknown strand is written as plus.
1399  dseg.SetStrands().
1400  push_back((TSeqPos)row->GetSegStart() != kInvalidSeqPos ?
1401  (row->m_Strand != eNa_strand_unknown ?
1402  row->m_Strand : eNa_strand_plus): strands[str_idx]);
1403  }
1404  str_idx++;
1405  }
1406  }
1407 }
1408 
1409 
1410 // Create std-seg alignment.
      // Fills dst->SetSegs().SetStd() with one CStd_seg per segment of m_Segs.
      // Gap rows become empty seq-locs, other rows become seq-intervals.
      // A first segment with fewer than two non-gap rows is not stored, and
      // a std-seg whose loc list ended up empty is not stored either. After
      // the loop the last stored std-seg is removed if the last processed
      // segment had fewer than two non-gap rows.
1412 {
1413  TStd& std_segs = dst->SetSegs().SetStd();
1414  int non_gap_count = 0;
1415 
1416  // Check if there are exactly two rows of different types (nuc-to-prot align).
1417  // If true, collect frames from the protein row and use them later to adjust
1418  // genomic locations. See CXX-5478
1419  bool set_frames = true;
      // Index (0 or 1) of the protein row; -1 until it has been detected
      // on the first segment.
1420  int p_row = -1;
      // Per-segment (start_frame, stop_frame) pairs, indexed like m_Segs.
      // Only consumed when set_frames is still true after this loop.
1421  vector< pair<TSeqPos, TSeqPos> > frames;
1422  frames.reserve(m_Segs.size());
1423  ITERATE(TSegments, seg_it, m_Segs) {
1424  if (seg_it->m_Rows.size() != 2) {
1425  set_frames = false;
1426  break;
1427  }
1429  m_LocMapper.GetSeqTypeById(seg_it->m_Rows[0].m_Id);
1431  m_LocMapper.GetSeqTypeById(seg_it->m_Rows[1].m_Id);
1432  if (r0_type == r1_type) {
1433  set_frames = false;
1434  break;
1435  }
1436  if (p_row == -1) {
1437  if (r0_type == CSeq_loc_Mapper_Base::eSeq_prot) {
1438  p_row = 0;
1439  }
1440  else if (r1_type == CSeq_loc_Mapper_Base::eSeq_prot) {
1441  p_row = 1;
1442  }
1443  else {
      // Types differ but neither row is a protein - no frames.
1444  set_frames = false;
1445  break;
1446  }
1447  }
1448  _ASSERT(p_row != -1);
1449  TSeqPos start = seg_it->m_Rows[p_row].m_Start;
1450  TSeqPos start_frame = 0;
1451  TSeqPos stop_frame = 0;
      // For a gap in the protein row both frames stay 0.
1452  if (start != kInvalidSeqPos) {
1453  start_frame = start % 3; // 1 or 2 bases from codon start.
1454  stop_frame = (start + seg_it->m_Len) % 3;
1455  if ( stop_frame ) {
1456  stop_frame = 3 - stop_frame; // 1 or 2 bases from codon stop.
1457  }
1458  }
1459  frames.push_back(pair<TSeqPos, TSeqPos>(start_frame, stop_frame));
1460  }
1461 
      // Index of the current segment, used to look up its frames.
1462  size_t seg_n = 0;
1463  ITERATE(TSegments, seg_it, m_Segs) {
1464  // Create new std-seg for each segment.
1465  CRef<CStd_seg> std_seg(new CStd_seg);
1466  std_seg->SetDim(static_cast<CStd_seg::TDim>(seg_it->m_Rows.size()));
1467  if ( !seg_it->m_Scores.empty() ) {
1468  // Copy scores (not just pointers).
1469  CloneContainer<CScore, TScores, CStd_seg::TScores>(
1470  seg_it->m_Scores, std_seg->SetScores());
1471  }
1472  // Add rows.
1473  non_gap_count = 0;
1474  int row_n = 0;
1475  ITERATE(SAlignment_Segment::TRows, row, seg_it->m_Rows) {
1476  // Check sequence type, set width to 3 for prots.
1477  int width = (m_LocMapper.GetSeqTypeById(row->m_Id) ==
1479  CRef<CSeq_id> id(new CSeq_id);
1480  id.Reset(&const_cast<CSeq_id&>(*row->m_Id.GetSeqId()));
1481  std_seg->SetIds().push_back(id);
1482  CRef<CSeq_loc> loc(new CSeq_loc);
1483  // For gaps use empty seq-loc.
1484  if (row->m_Start == kInvalidSeqPos) {
1485  // empty
1486  loc->SetEmpty(*id);
1487  }
1488  else {
1489  // For normal ranges use seq-interval.
1490  loc->SetInt().SetId(*id);
1491  // Adjust coordinates according to the sequence type.
      // 'stop' is exclusive here; it is turned into an inclusive
      // 'to' when the interval is finalized below.
1492  TSeqPos start = row->m_Start/width;
1493  TSeqPos stop = (row->m_Start + seg_it->m_Len)/width;
1494 
1495  // For pairwise mixed-type alignments indicate frames using 'alt' fuzz.
1496  // See CXX-5478
1497 
1498  if ( set_frames ) {
      // The row which is not the protein one.
1499  const SAlignment_Segment::SAlignment_Row& g_row =
1500  seg_it->m_Rows[1 - p_row];
1501  TSeqPos start_frame = frames[seg_n].first;
1502  TSeqPos stop_frame = frames[seg_n].second;
      // On reverse genomic strand the frames swap ends.
1503  if ( IsReverse(g_row.m_Strand) ) {
1504  swap(start_frame, stop_frame);
1505  }
1506  if ( row_n == p_row ) {
1507  // Trim incomplete codon if genomic start can not be
1508  // adjusted properly.
1509  TSeqPos g_start = g_row.m_Start;
1510  if (g_start < start_frame) {
1511  if ( IsReverse(g_row.m_Strand) ) {
1512  stop--;
1513  }
1514  else {
1515  start++;
1516  }
1517  // Tricky case: protein row contains just one AA,
1518  // which is partial. Start must be adjusted to
1519  // remove the incomplete codon, but this makes
1520  // the whole segment empty. So, we need to drop
1521  // the genomic row too.
1522  if (start > stop) {
1523  if (p_row == 1) {
1524  // Genomic row has been already saved.
1525  std_seg->SetLoc().pop_back();
1526  }
1527  else {
1528  // Skip genomic row: this extra increment plus
      // the loop's own increment steps over it.
1529  ++row;
1530  }
      // No loc is pushed for the protein row, and
      // row_n/non_gap_count are not advanced on this path.
1531  continue;
1532  }
1533  }
1534  }
1535  else {
1536  // Adjust start and stop to the start/stop of complete
1537  // codon, set fuzzes to indicate actual start/stop.
1538  if ( start_frame ) {
1539  if (start >= start_frame) {
1540  // Add 1 or 2 bases to match complete codon.
1541  loc->SetInt().SetFuzz_from().SetAlt().push_back(start);
1542  start -= start_frame;
1543  }
1544  else {
1545  // Skip the first incomplete codon.
1546  start += 3 - start_frame;
1547  }
1548  }
1549  if ( stop_frame ) {
1550  loc->SetInt().SetFuzz_to().SetAlt().push_back(stop - 1);
1551  stop += stop_frame;
1552  }
1553  }
1554  }
1555 
1556  loc->SetInt().SetFrom(start);
1557  // len may be 0 after dividing by width, check it before
1558  // decrementing stop.
1559  loc->SetInt().SetTo(stop > start ? stop - 1 : stop);
1560  if (row->m_IsSetStrand) {
1561  loc->SetInt().SetStrand(row->m_Strand);
1562  }
1563  non_gap_count++;
1564  }
1565  std_seg->SetLoc().push_back(loc);
1566  row_n++;
1567  }
1568  seg_n++;
1569  // Ignore starting segments with no actually aligned sequences.
1570  if (seg_it == m_Segs.begin() && non_gap_count < 2) {
1571  continue;
1572  }
1573  // Ignore empty segments
1574  if ( std_seg->GetLoc().empty() ) continue;
1575  std_segs.push_back(std_seg);
1576  }
      // non_gap_count still holds the value computed for the last segment
      // processed by the loop above.
1577  if (non_gap_count < 2 && !std_segs.empty()) {
1578  // Remove the last segment if there are no aligned sequences in it.
1579  std_segs.pop_back();
1580  }
1581 }
1582 
1583 
1584 // Create packed-seg alignment.
      // Fills dst->SetSegs().SetPacked() from m_Segs: dim/numseg, alignment
      // level scores, ids, then starts/present/strands for every row of every
      // segment and one length per segment.
1586 {
1587  // Multi-dim alignments are not supported by this type.
1589 
      // The row count of the first segment is used as the dimension.
1590  CPacked_seg& pseg = dst->SetSegs().SetPacked();
1591  pseg.SetDim(static_cast<CPacked_seg::TDim>(m_Segs.front().m_Rows.size()));
1592  pseg.SetNumseg(static_cast<CPacked_seg::TNumseg>(m_Segs.size()));
1593  if ( !m_SegsScores.empty() ) {
1594  // Copy elements, not just pointers.
1595  CloneContainer<CScore, TScores, CPacked_seg::TScores>(
1596  m_SegsScores, pseg.SetScores());
1597  }
1598  // Get strands for all rows.
1599  TStrands strands;
1600  x_FillKnownStrands(strands);
1601  // Populate ids: for each row take the id of its first non-gap segment.
      // A row consisting of gaps only contributes no id at all (nothing is
      // thrown here), so the id list may then be shorter than the dimension.
1602  for (size_t r = 0; r < m_Segs.front().m_Rows.size(); r++) {
1603  ITERATE(TSegments, seg, m_Segs) {
1604  const SAlignment_Segment::SAlignment_Row& row = seg->m_Rows[r];
1605  if (row.m_Start != kInvalidSeqPos) {
1606  CRef<CSeq_id> id(new CSeq_id);
1607  id.Reset(&const_cast<CSeq_id&>(*row.m_Id.GetSeqId()));
1608  pseg.SetIds().push_back(id);
1609  break;
1610  }
1611  }
1612  }
1613  // Create segments and rows.
1614  ITERATE(TSegments, seg_it, m_Segs) {
      // Reset for every segment; set to 3 if any row of the segment is
      // a protein.
1615  int len_width = 1;
1616  size_t str_idx = 0; // Strand index for the current row.
1617  ITERATE(SAlignment_Segment::TRows, row, seg_it->m_Rows) {
1618  TSeqPos start = row->GetSegStart();
1619  // Check if start needs to be converted to protein coords.
      // NOTE(review): the start is multiplied by 3 here, while the
      // dense-diag, dense-seg and std-seg converters in this file divide
      // starts of protein rows by 3 - confirm which is intended.
1620  if (m_LocMapper.GetSeqTypeById(row->m_Id) ==
1622  len_width = 3;
1623  if (start != kInvalidSeqPos) {
1624  start *= 3;
1625  }
1626  }
1627  pseg.SetStarts().push_back(start);
      // 'present' flag: true for everything except gaps.
1628  pseg.SetPresent().push_back(start != kInvalidSeqPos);
1629  if (m_HaveStrands) {
      // For gaps use the strand collected by x_FillKnownStrands.
1630  pseg.SetStrands().
1631  push_back((TSeqPos)row->GetSegStart() != kInvalidSeqPos ?
1632  row->m_Strand : strands[str_idx]);
1633  }
1634  str_idx++;
1635  }
1636  // If there are any proteins, length should be adjusted.
1637  pseg.SetLens().push_back(seg_it->m_Len/len_width);
1638  }
1639 }
1640 
1641 
1642 // Create disc-alignment.
      // Appends the mapped alignment of each sub-mapper to
      // dst->SetSegs().SetDisc(). Sub-alignments whose GetDstAlign() throws
      // CAnnotMapperException are silently left out of the result.
1644 {
1645  CSeq_align_set::Tdata& data = dst->SetSegs().SetDisc().Set();
1646  // Iterate sub-mappers, let each of them create a mapped alignment,
1647  // store results to the disc-align.
1649  try {
1650  data.push_back((*it)->GetDstAlign());
1651  }
1652  catch (CAnnotMapperException&) {
1653  // Skip invalid sub-alignments.
1654  }
1655  }
1656 }
1657 
1658 
1659 // Creating exon parts - helper function to set part length
1660 // depending on its type.
      // 'len' is stored through the setter matching 'ptype' (match, mismatch,
      // diag, product insertion or genomic insertion). For any other type the
      // part is left untouched.
1663  TSeqPos len)
1664 {
1665  switch ( ptype ) {
1667  part.SetMatch(len);
1668  break;
1670  part.SetMismatch(len);
1671  break;
1673  part.SetDiag(len);
1674  break;
1676  part.SetProduct_ins(len);
1677  break;
1679  part.SetGenomic_ins(len);
1680  break;
1681  default:
      // Unhandled part type - nothing is set.
1682  break;
1683  }
1684 }
1685 
1686 
1687 // Create and add a new exon part.
      // If 'last_part' is set and already has type 'part_type', its length is
      // increased by 'part_len' and no new part is added. Otherwise a new chunk
      // of length 'part_len' is created, appended to exon.SetParts() and
      // returned through 'last_part'.
1689  CRef<CSpliced_exon_chunk>& last_part,
1691  int part_len,
1692  CSpliced_exon& exon) const
1693 {
1694  if (last_part && last_part->Which() == part_type) {
1695  // Merge parts of the same type.
1696  SetPartLength(*last_part, part_type,
1698  sx_GetExonPartLength(*last_part) + part_len);
1699  }
1700  else {
1701  // Add a new part.
1702  last_part.Reset(new CSpliced_exon_chunk);
1703  SetPartLength(*last_part, part_type, part_len);
1704  // Parts order does not depend on strands - preserve the original one.
1705  exon.SetParts().push_back(last_part);
1706  }
1707 }
1708 
1709 
1711 {
      // A chunk counts as a gap if it is an insertion on either side
      // (genomic or product); all other chunk types are not gaps.
1712  return chunk_type == CSpliced_exon_chunk::e_Genomic_ins ||
1713  chunk_type == CSpliced_exon_chunk::e_Product_ins;
1714 }
1715 
1716 
1717 // Create spliced-seg exon.
      // Consumes consecutive segments starting at 'seg' and turns them into the
      // parts of one exon. 'seg' is advanced and on return points to the first
      // segment that was not consumed.
      //   spliced           - destination; receives the exon on success and may
      //                       get its product type set here.
      //   gen_id, prod_id   - in/out ids of the genomic and product rows; set
      //                       from the first non-gap segment if empty on entry.
      //   gen_strand,
      //   prod_strand       - out: strands found in this exon (unknown if none).
      //   last_exon_partial - in: the previous exon ended partial; out: this
      //                       exon ended on a double gap or was discarded.
      //   last_gen_id,
      //   last_prod_id      - fallback ids used if none was found in this exon.
      // Returns true if an exon was appended to 'spliced', false if the exon
      // had no parts or only gaps and was discarded.
      // Throws CAnnotMapperException for more than two rows, for inconsistent
      // strands within the exon, or for a protein product mapped to a sequence
      // of unknown type.
1719 x_GetDstExon(CSpliced_seg& spliced,
1720  TSegments::const_iterator& seg,
1721  CSeq_id_Handle& gen_id,
1722  CSeq_id_Handle& prod_id,
1723  ENa_strand& gen_strand,
1724  ENa_strand& prod_strand,
1725  bool& last_exon_partial,
1726  const CSeq_id_Handle& last_gen_id,
1727  const CSeq_id_Handle& last_prod_id) const
1728 {
      // partial_left/partial_right decide below whether splice sites of
      // the original exon may be copied to the corresponding ends.
1729  bool partial_left = false;
1730  bool partial_right = false;
1732  if (seg != m_Segs.begin() && last_exon_partial) {
1733  // This is not the first segment, exon was split for some reason.
1734  // Mark it partial.
1735  exon->SetPartial(true);
1736  partial_left = true;
1737  }
1738 
1739  last_exon_partial = false;
      // Exon extremes accumulated over all accepted segments. Starts use
      // -1 for "not known yet"; ends are exclusive.
1740  int gen_start = -1;
1741  int prod_start = -1;
1742  int gen_end = 0;
1743  int prod_end = 0;
1744  gen_strand = eNa_strand_unknown;
1745  prod_strand = eNa_strand_unknown;
1746  bool gstrand_set = false;
1747  bool pstrand_set = false;
1748  bool aln_protein = false;
      // Number of trailing parts which are gaps created by mapping, and
      // the exon extremes recorded before those gaps were added.
1749  size_t mapped_gaps = 0;
1750  int non_gap_gen_start = 0;
1751  int non_gap_prod_start = 0;
1752  int non_gap_gen_end = 0;
1753  int non_gap_prod_end = 0;
1754 
1755  if ( spliced.IsSetProduct_type() ) {
1756  aln_protein =
1758  }
1759 
1760  CRef<CSpliced_exon_chunk> last_part; // last exon part added
      // Last non-zero group index seen; -1 until one is found.
1761  int group_idx = -1;
1762  bool have_non_gaps = false; // are there any non-gap parts at all?
1763  // Continue iterating segments. Each segment becomes a new part.
1764  for ( ; seg != m_Segs.end(); ++seg) {
1765  // Zero group may indicate that all mappings were applied to gaps.
1766  // Do not break exon on such segments.
1767  if (group_idx != -1 && seg->m_GroupIdx && seg->m_GroupIdx != group_idx) {
1768  // New group found - start a new exon.
1769  partial_right = true;
1770  break;
1771  }
1772  // Remember the last segment's group if non-zero.
1773  if ( seg->m_GroupIdx ) {
1774  group_idx = seg->m_GroupIdx;
1775  }
1776 
1777  const SAlignment_Segment::SAlignment_Row& gen_row =
1779  const SAlignment_Segment::SAlignment_Row& prod_row =
1781  // Spliced-seg can not have more than 2 rows.
1782  if (seg->m_Rows.size() > 2) {
1783  NCBI_THROW(CAnnotMapperException, eBadAlignment,
1784  "Can not construct spliced-seg with more than two rows");
1785  }
1786 
      // Negative start means a gap in the corresponding row. The end
      // values are only used below when the matching start is >= 0.
1787  int gstart = gen_row.GetSegStart();
1788  int pstart = prod_row.GetSegStart();
1789  int gend = gstart + seg->m_Len;
1790  int pend = pstart + seg->m_Len;
1791  if (gstart >= 0) {
1792  // Not a genomic gap. Check the id.
1793  if (gen_id) {
1794  // If it's already set and the new segment has a different id,
1795  // stop here.
1796  if (gen_id != gen_row.m_Id) {
1797  // New id - start new exon.
1798  break;
1799  }
1800  }
1801  else {
1802  // Genomic id not yet set. Remember it.
1803  gen_id = gen_row.m_Id;
1804  exon->SetGenomic_id(const_cast<CSeq_id&>(*gen_id.GetSeqId()));
1805  }
1808  }
1809  if (pstart >= 0) {
1810  // Not a product gap. Check the id.
1811  if (prod_id) {
1812  // Id already set, make sure the new one is the same.
1813  if (prod_id != prod_row.m_Id) {
1814  // New id - start new exon.
1815  break;
1816  }
1817  }
1818  else {
1819  // Product id not yet set.
1820  prod_id = prod_row.m_Id;
1821  exon->SetProduct_id(const_cast<CSeq_id&>(*prod_id.GetSeqId()));
1822  }
1823  if ( !spliced.IsSetProduct_type() ) {
1824  CSeq_loc_Mapper_Base::ESeqType prod_type =
1825  m_LocMapper.GetSeqTypeById(prod_id);
1826  // If the product is not mapped try to use the original
1827  // product type. If a protein was mapped to an unknown type,
1828  // throw.
1829  if (prod_type == CSeq_loc_Mapper_Base::eSeq_unknown && m_OrigExon) {
1830  if ( m_OrigExon->GetProduct_start().IsProtpos() ) {
1831  if (!prod_row.m_Mapped) {
1832  aln_protein = true;
1833  }
1834  else {
1835  NCBI_THROW(CAnnotMapperException, eOtherError,
1836  "Can not map protein product to a sequence "
1837  "of unknown type.");
1838  }
1839  }
1840  }
1841  else {
1842  aln_protein = (prod_type == CSeq_loc_Mapper_Base::eSeq_prot);
1843  }
1844  spliced.SetProduct_type(aln_protein ?
1847  }
1848  }
1849 
      // Part type recorded for the segment; "not set" is treated as a
      // match.
1850  CSpliced_exon_chunk::E_Choice orig_ptype = seg->m_PartType;
1851  if (orig_ptype == CSpliced_exon_chunk::e_not_set) {
1852  orig_ptype = CSpliced_exon_chunk::e_Match;
1853  }
1854  CSpliced_exon_chunk::E_Choice ptype = orig_ptype;
1855 
1856  // Check strands consistency
1857  bool gen_reverse = false;
1858  bool prod_reverse = false;
1859  // Check genomic strand if it's not a gap.
1860  if (gstart >= 0 && gen_row.m_IsSetStrand) {
1861  if ( !gstrand_set ) {
1862  gen_strand = gen_row.m_Strand;
1863  gstrand_set = true;
1864  }
1865  else if (gen_strand != gen_row.m_Strand) {
1866  NCBI_THROW(CAnnotMapperException, eBadAlignment,
1867  "Can not construct spliced-seg "
1868  "with different genomic strands in the same exon");
1869  }
1870  }
1871  // Remember genomic strand.
1872  if ( gstrand_set ) {
1873  gen_reverse = IsReverse(gen_strand);
1874  }
1875  // Check product strand if it's not a gap.
1876  if (pstart >= 0 && prod_row.m_IsSetStrand) {
1877  if ( !pstrand_set ) {
1878  prod_strand = prod_row.m_Strand;
1879  pstrand_set = true;
1880  }
1881  else if (prod_strand != prod_row.m_Strand) {
1882  NCBI_THROW(CAnnotMapperException, eBadAlignment,
1883  "Can not construct spliced-seg "
1884  "with different product strands in the same exon");
1885  }
1886  }
1887  // Remember product strand.
1888  if ( pstrand_set ) {
1889  prod_reverse = IsReverse(prod_strand);
1890  }
1891 
      // Lengths of insertions needed to bridge a distance between the
      // exon collected so far and the current segment.
1892  int gins_len = 0;
1893  int pins_len = 0;
1894 
1895  if (pstart < 0) {
1896  // Gap on product
1897  if (gstart < 0) {
1898  // Both gen and prod are missing - start new exon.
      // The segment is consumed (seg++) before leaving the loop.
1899  last_exon_partial = true;
1900  exon->SetPartial(true);
1901  partial_right = true;
1902  seg++;
1903  break;
1904  }
1905  // Genomic is present.
1907  }
1908  else {
1909  // Product is present.
1910  // Check parts order and intersection if the last part's coordinates
1911  // are known.
1912  if (prod_start >= 0 && prod_end > 0) {
1913  if (!prod_reverse) {
1914  // Plus strand.
1915  if (pstart < prod_end) {
1916  // Intersection or bad order.
1917  partial_right = true;
1918  break;
1919  }
1920  if (pstart > prod_end) {
1921  // Parts are not abutting, add insertion.
1922  pins_len = pstart - prod_end;
1923  }
1924  }
1925  else {
1926  // Minus strand.
1927  if (pend > prod_start) {
1928  // Intersection or bad order.
1929  partial_right = true;
1930  break;
1931  }
1932  if (pend < prod_start) {
1933  // Add insertion.
1934  pins_len = prod_start - pend;
1935  }
1936  }
1937  }
1938  }
1939 
1940  if (gstart < 0) {
1941  // Missing genomic sequence. Add product insertion.
1942  _ASSERT(pstart >= 0);
1944  }
1945  else {
1946  // Genomic sequence is present.
1947  // Check parts order and intersection if the last part's coordinates
1948  // are known.
1949  if (gen_start >= 0 && gen_end > 0) {
1950  if (!gen_reverse) {
1951  // Plus strand.
1952  if (gstart < gen_end) {
1953  // Intersection or bad order.
1954  partial_right = true;
1955  break;
1956  }
1957  if (gstart > gen_end) {
1958  // Parts are not abutting, add insertion.
1959  gins_len = gstart - gen_end;
1960  }
1961  }
1962  else {
1963  // Minus strand.
1964  if (gend > gen_start) {
1965  // Intersection or bad order.
1966  partial_right = true;
1967  break;
1968  }
1969  if (gend < gen_start) {
1970  // Add insertion.
1971  gins_len = gen_start - gend;
1972  }
1973  }
1974  }
1975  }
1976 
1977  // Now when we know exon is not split, it's safe to update exon extremes.
1978  if (pstart >= 0) {
1979  if (prod_start < 0 || prod_start > pstart) {
1980  prod_start = pstart;
1981  }
1982  if (prod_end < pend) {
1983  prod_end = pend;
1984  }
1985  }
1986  if (gstart >= 0) {
1987  // Update last part's start and end.
1988  if (gen_start < 0 || gen_start > gstart) {
1989  gen_start = gstart;
1990  }
1991  if (gen_end < gend) {
1992  gen_end = gend;
1993  }
1994  }
1995 
1996  // Add genomic or product insertions if any.
      // Insertions are only added when the exon already has parts.
1997  if (gins_len > 0) {
1998  if ( !exon->GetParts().empty() ) {
2000  gins_len, *exon);
2001  }
2002  }
2003  if (pins_len > 0) {
2004  if ( !exon->GetParts().empty() ) {
2006  pins_len, *exon);
2007  }
2008  }
2009 
2010  // Remember if there are any non-gap parts.
2011  bool is_gap = IsExonGap(ptype);
2012  if ( !is_gap ) {
2013  have_non_gaps = true;
2014  }
2015 
2016  // Add the mapped part except if it's a gap in the first position.
2017  if (!is_gap || !exon->GetParts().empty() || orig_ptype == ptype) {
2018  size_t old_size = exon->GetParts().size();
2019  x_PushExonPart(last_part, ptype, seg->m_Len, *exon);
2020  // Count trailing gaps resulting from mapping to remove them
2021  // when the exon is ready.
2022  if (is_gap && orig_ptype != ptype) {
2023  // Gaps can be merged, check the actual size change.
2024  mapped_gaps += exon->GetParts().size() - old_size;
2025  }
2026  else {
2027  mapped_gaps = 0;
2028  // NOTE: This may be a gap, but copied from the original exon rather than mapped.
2029  // Such gaps may be trimmed later.
2030  non_gap_prod_start = prod_start;
2031  non_gap_gen_start = gen_start;
2032  non_gap_prod_end = prod_end;
2033  non_gap_gen_end = gen_end;
2034  }
2035  }
2036  else {
      // A leading gap produced by mapping is not stored as a part.
      // The exon extremes updated above are moved past it instead.
2037  if (ptype == CSpliced_exon_chunk::e_Genomic_ins) {
2038  if ( !gen_reverse ) {
2039  gen_start += seg->m_Len;
2040  }
2041  else {
2042  gen_end -= seg->m_Len;
2043  }
2044  }
2045  else if (ptype == CSpliced_exon_chunk::e_Product_ins) {
2046  if ( !prod_reverse ) {
2047  prod_start += seg->m_Len;
2048  }
2049  else {
2050  prod_end -= seg->m_Len;
2051  }
2052  }
2053  }
2054  }
2055 
2056  // Trim trailing gaps resulting from mapping.
      // The exon extremes are restored to the values recorded when the
      // last part that is not such a gap was added.
2057  if (mapped_gaps > 0) {
2058  CSpliced_exon::TParts& parts = exon->SetParts();
2059  _ASSERT(parts.size() >= mapped_gaps);
2060  for (; mapped_gaps > 0; mapped_gaps--) {
2061  parts.pop_back();
2062  }
2063  gen_start = non_gap_gen_start;
2064  prod_start = non_gap_prod_start;
2065  gen_end = non_gap_gen_end;
2066  prod_end = non_gap_prod_end;
2067  }
2068 
2069  // The whole alignment becomes partial if any of its exons is partial.
2070  if (!have_non_gaps || exon->GetParts().empty()) {
2071  // No parts were inserted (or only gaps were found) - truncated exon.
2072  // Discard it completely.
2073  last_exon_partial = true;
2074  if (!spliced.GetExons().empty()) {
2075  // Mark previous exon partial
2076  CSpliced_exon& last_exon = *spliced.SetExons().back();
2077  last_exon.SetPartial(true);
2078  if (last_exon.IsSetGenomic_strand() &&
2079  IsReverse(last_exon.GetGenomic_strand())) {
2080  // Minus strand - reset acceptor of the last exon
2081  last_exon.ResetAcceptor_before_exon();
2082  }
2083  else {
      // Plus or unset strand - reset donor of the last exon
2084  last_exon.ResetDonor_after_exon();
2085  }
2086  }
2087  return false;
2088  }
2089 
      // Copy splice sites to the ends of the exon which are not partial.
      // Which of partial_left/partial_right guards acceptor and donor
      // depends on the genomic strand.
2090  if ( IsReverse(gen_strand) ) {
2091  if ( !partial_right && m_OrigExon &&
2093  exon->SetAcceptor_before_exon().Assign(
2095  }
2096  if ( !partial_left && m_OrigExon &&
2098  exon->SetDonor_after_exon().Assign(
2100  }
2101  }
2102  else {
2103  if ( !partial_left && m_OrigExon &&
2105  exon->SetAcceptor_before_exon().Assign(
2107  }
2108  if ( !partial_right && m_OrigExon &&
2110  exon->SetDonor_after_exon().Assign(
2112  }
2113  }
2114 
2115  // If some id was not found in this exon, use the last known one.
2116  if (!gen_id && last_gen_id) {
2117  gen_id = last_gen_id;
2118  exon->SetGenomic_id(const_cast<CSeq_id&>(*gen_id.GetSeqId()));
2119  }
2120  if (!prod_id && last_prod_id) {
2121  prod_id = last_prod_id;
2122  exon->SetProduct_id(const_cast<CSeq_id&>(*prod_id.GetSeqId()));
2123  }
2124  // Set the whole exon's coordinates.
      // The accumulated ends are exclusive, the stored ends are inclusive.
2125  exon->SetGenomic_start(gen_start);
2126  exon->SetGenomic_end(gen_end - 1);
2127  if (gen_strand != eNa_strand_unknown) {
2128  exon->SetGenomic_strand(gen_strand);
2129  }
2130  if ( aln_protein ) {
2131  // For proteins adjust coords and set frames.
      // Position / 3 gives the amino acid index, position % 3 + 1 the
      // frame (1..3). No product strand is set for proteins.
2132  exon->SetProduct_start().SetProtpos().SetAmin(prod_start/3);
2133  exon->SetProduct_start().SetProtpos().SetFrame(prod_start%3 + 1);
2134  exon->SetProduct_end().SetProtpos().SetAmin((prod_end - 1)/3);
2135  exon->SetProduct_end().SetProtpos().SetFrame((prod_end - 1)%3 + 1);
2136  }
2137  else {
2138  exon->SetProduct_start().SetNucpos(prod_start);
2139  exon->SetProduct_end().SetNucpos(prod_end - 1);
2140  if (prod_strand != eNa_strand_unknown) {
2141  exon->SetProduct_strand(prod_strand);
2142  }
2143  }
2144  // Scores should be copied from the original exon.
2145  // If the mapping was partial, the scores should have been invalidated
2146  // and cleared.
2147  if ( !m_SegsScores.empty() ) {
2148  CloneContainer<CScore, TScores, CScore_set::Tdata>(
2149  m_SegsScores, exon->SetScores().Set());
2150  }
2151  // Copy ext from the original exon.
2152  if ( m_OrigExon && m_OrigExon->IsSetExt() ) {
2153  CloneContainer<CUser_object, CSpliced_exon::TExt, CSpliced_exon::TExt>(
2154  m_OrigExon->GetExt(), exon->SetExt());
2155  }
2156  // Add the new exon to the spliced-seg.
2157  spliced.SetExons().push_back(exon);
2158  return true;
2159 }
2160 
2161 
2163  CSpliced_seg& spliced,
2164  const CSeq_align_Mapper_Base& sub_align,
2165  bool& last_exon_partial,
2166  CSeq_id_Handle& gen_id,
2167  CSeq_id_Handle& last_gen_id,
2168  bool& single_gen_id,
2169  ENa_strand& gen_strand,
2170  bool& single_gen_str,
2171  CSeq_id_Handle& prod_id,
2172  CSeq_id_Handle& last_prod_id,
2173  bool& single_prod_id,
2174  ENa_strand& prod_strand,
2175  bool& single_prod_str,
2176  bool& partial) const
2177 {
2178  TSegments::const_iterator seg = sub_align.m_Segs.begin();
2179  // Convert the current sub-mapper to an exon.
2180  // In some cases the exon can be split (e.g. if a gap is found in
2181  // both rows). In this case 'seg' iterator will not be set to
2182  // m_Segs.end() by x_GetDstExon and the next iteration will be
2183  // performed.
2184  while (seg != sub_align.m_Segs.end()) {
2185  CSeq_id_Handle ex_gen_id;
2186  CSeq_id_Handle ex_prod_id;
2187  ENa_strand ex_gen_strand = eNa_strand_unknown;
2188  ENa_strand ex_prod_strand = eNa_strand_unknown;
2189  bool added_exon = sub_align.x_GetDstExon(spliced, seg, ex_gen_id, ex_prod_id,
2190  ex_gen_strand, ex_prod_strand, last_exon_partial,
2191  last_gen_id, last_prod_id);
2192  partial = partial || last_exon_partial;
2193  if (added_exon) {
2194  // Check if all exons have the same ids in genomic and product
2195  // rows.
2196  if (ex_gen_id) {
2197  last_gen_id = ex_gen_id;
2198  if ( !gen_id ) {
2199  gen_id = ex_gen_id;
2200  }
2201  else {
2202  single_gen_id &= gen_id == ex_gen_id;
2203  }
2204  }
2205  if (ex_prod_id) {
2206  if ( !prod_id ) {
2207  prod_id = ex_prod_id;
2208  }
2209  else {
2210  single_prod_id &= prod_id == ex_prod_id;
2211  }
2212  }
2213  // Check if all exons have the same strands.
2214  if (ex_gen_strand != eNa_strand_unknown) {
2215  single_gen_str &= (gen_strand == eNa_strand_unknown) ||
2216  (gen_strand == ex_gen_strand);
2217  gen_strand = ex_gen_strand;
2218  }
2219  else {
2220  single_gen_str &= gen_strand == eNa_strand_unknown;
2221  }
2222  if (ex_prod_strand != eNa_strand_unknown) {
2223  single_prod_str &= (prod_strand == eNa_strand_unknown) ||
2224  (prod_strand == ex_prod_strand);
2225  prod_strand = ex_prod_strand;
2226  }
2227  else {
2228  single_prod_str &= prod_strand == eNa_strand_unknown;
2229  }
2230  }
2231  }
2232 }
2233 
2234 
2235 // Create spliced-seg.
//
// Fills dst->SetSegs().SetSpliced() from the mapped data:
//  1. exons are produced by x_GetDstSplicedSubAlign(), called for this
//     mapper itself when m_SubAligns is empty and for the sub-mappers
//     otherwise;
//  2. gap parts at the beginning of the first exon and at the end of the
//     last exon are removed and the exon coordinates adjusted;
//  3. ids/strands shared by all exons are set on the spliced-seg and reset
//     on the individual exons;
//  4. bounds are re-mapped through m_LocMapper; poly-a, product-length and
//     (for non-partial results) modifiers are taken from the original
//     spliced-seg.
// NOTE(review): this listing has lost several source lines (the function
// signature and a number of statements). Gaps in the embedded numbering
// are pointed out below; anything said about the missing lines is a guess.
2237 {
2238  CSpliced_seg& spliced = dst->SetSegs().SetSpliced();
2239  CSeq_id_Handle gen_id; // per-alignment genomic id
2240  CSeq_id_Handle prod_id; // per-alignment product id
2241  CSeq_id_Handle last_gen_id; // last exon's genomic id
2242  CSeq_id_Handle last_prod_id; // last exon's product id
2243  ENa_strand gen_strand = eNa_strand_unknown;
2244  ENa_strand prod_strand = eNa_strand_unknown;
2245  bool single_gen_id = true;
2246  bool single_gen_str = true;
2247  bool single_prod_id = true;
2248  bool single_prod_str = true;
2249  bool partial = false;
2250  bool last_exon_partial = false;
2251 
2252  if ( m_SubAligns.empty() ) {
// NOTE(review): listing lines 2253-2257 and 2259 are missing. What is left
// swaps gen_row and prod_row when the row types do not look right.
2258  // Check if rows have correct types. If not, try to swap.
2260  swap(gen_row, prod_row);
2261  }
2262  }
// No sub-mappers: the segments of this mapper are converted directly.
2263  x_GetDstSplicedSubAlign(spliced, *this,
2264  last_exon_partial, gen_id, last_gen_id, single_gen_id,
2265  gen_strand, single_gen_str,
2266  prod_id, last_prod_id, single_prod_id,
2267  prod_strand, single_prod_str,
2268  partial);
2269  }
2270  else {
// NOTE(review): the loop header (listing line 2271) is missing; 'it'
// presumably iterates over m_SubAligns.
2272  x_GetDstSplicedSubAlign(spliced, **it,
2273  last_exon_partial, gen_id, last_gen_id, single_gen_id,
2274  gen_strand, single_gen_str,
2275  prod_id, last_prod_id, single_prod_id,
2276  prod_strand, single_prod_str,
2277  partial);
2278  }
2279  }
2280 
// NOTE(review): listing line 2281 is missing; it opens the block closed at
// listing line 2413, presumably a check that at least one exon exists
// (front()/back() are used below) - confirm against the real source.
2282  // Make sure the first and the last parts are not gaps. By now there should
2283  // be no exons with only gaps in them, so no trimming should result in an
2284  // empty exon.
2285  CRef<CSpliced_exon> exon = spliced.SetExons().front();
2286  bool ex_gen_reverse = false;
2287  bool ex_prod_reverse = false;
// The exon-level strand has priority, the spliced-seg strand is a fallback.
2288  if ( exon->IsSetGenomic_strand() ) {
2289  ex_gen_reverse = IsReverse(exon->GetGenomic_strand());
2290  }
2291  else if ( spliced.IsSetGenomic_strand() ) {
2292  ex_gen_reverse = IsReverse(spliced.GetGenomic_strand());
2293  }
2294  if ( exon->IsSetProduct_strand() ) {
2295  ex_prod_reverse = IsReverse(exon->GetProduct_strand());
2296  }
2297  else if ( spliced.IsSetProduct_strand() ) {
2298  ex_prod_reverse = IsReverse(spliced.GetProduct_strand());
2299  }
2300  TSeqPos gen_start = exon->GetGenomic_start();
2301  TSeqPos gen_end = exon->GetGenomic_end();
2302  TSeqPos prod_start, prod_end;
// NOTE(review): the condition on listing line 2303 is missing; the branch
// below handles protein positions (amin*3 + frame - 1 gives a
// nucleotide-like coordinate), the 'else' handles nucleotide positions.
2304  _ASSERT(exon->GetProduct_start().IsProtpos());
2305  _ASSERT(exon->GetProduct_end().IsProtpos());
2306  prod_start = exon->GetProduct_start().GetProtpos().GetAmin()*3
2307  + exon->GetProduct_start().GetProtpos().GetFrame() - 1;
2308  prod_end = exon->GetProduct_end().GetProtpos().GetAmin()*3
2309  + exon->GetProduct_end().GetProtpos().GetFrame() - 1;
2310  }
2311  else {
2312  _ASSERT(exon->GetProduct_start().IsNucpos());
2313  _ASSERT(exon->GetProduct_end().IsNucpos());
2314  prod_start = exon->GetProduct_start().GetNucpos();
2315  prod_end = exon->GetProduct_end().GetNucpos();
2316  }
// Drop leading gap parts of the first exon. On a reverse strand the first
// part is adjacent to the exon's end coordinate, hence the end is moved.
// NOTE(review): front() is called without an emptiness check; this relies
// on the invariant stated above (no exon consists only of gaps).
2317  while (IsExonGap(exon->GetParts().front()->Which()) ) {
2318  const CSpliced_exon_chunk& chunk = *exon->GetParts().front();
2319  if ( chunk.IsGenomic_ins() ) {
2320  if ( ex_gen_reverse ) {
2321  gen_end -= chunk.GetGenomic_ins();
2322  }
2323  else {
2324  gen_start += chunk.GetGenomic_ins();
2325  }
2326  }
2327  else {
2328  if ( ex_prod_reverse ) {
2329  prod_end -= chunk.GetProduct_ins();
2330  }
2331  else {
2332  prod_start += chunk.GetProduct_ins();
2333  }
2334  }
2335  exon->SetParts().pop_front();
2336  }
// Store the (possibly adjusted) coordinates back into the first exon.
2337  exon->SetGenomic_start(gen_start);
2338  exon->SetGenomic_end(gen_end);
// NOTE(review): the condition on listing line 2339 is missing (protein
// versus nucleotide product, mirroring the branch above).
2340  exon->SetProduct_start().SetProtpos().SetAmin(prod_start/3);
2341  exon->SetProduct_start().SetProtpos().SetFrame(prod_start%3 + 1);
2342  exon->SetProduct_end().SetProtpos().SetAmin(prod_end/3);
2343  exon->SetProduct_end().SetProtpos().SetFrame(prod_end%3 + 1);
2344  }
2345  else {
2346  exon->SetProduct_start().SetNucpos(prod_start);
2347  exon->SetProduct_end().SetNucpos(prod_end);
2348  }
2349  // Trim the last exon.
2350  exon = spliced.SetExons().back();
2351  ex_gen_reverse = false;
2352  ex_prod_reverse = false;
2353  if ( exon->IsSetGenomic_strand() ) {
2354  ex_gen_reverse = IsReverse(exon->GetGenomic_strand());
2355  }
2356  else if ( spliced.IsSetGenomic_strand() ) {
2357  ex_gen_reverse = IsReverse(spliced.GetGenomic_strand());
2358  }
2359  if ( exon->IsSetProduct_strand() ) {
2360  ex_prod_reverse = IsReverse(exon->GetProduct_strand());
2361  }
2362  else if ( spliced.IsSetProduct_strand() ) {
2363  ex_prod_reverse = IsReverse(spliced.GetProduct_strand());
2364  }
2365  gen_start = exon->GetGenomic_start();
2366  gen_end = exon->GetGenomic_end();
// NOTE(review): the condition on listing line 2367 is missing (protein
// versus nucleotide product).
2368  _ASSERT(exon->GetProduct_start().IsProtpos());
2369  _ASSERT(exon->GetProduct_end().IsProtpos());
2370  prod_start = exon->GetProduct_start().GetProtpos().GetAmin()*3
2371  + exon->GetProduct_start().GetProtpos().GetFrame() - 1;
2372  prod_end = exon->GetProduct_end().GetProtpos().GetAmin()*3
2373  + exon->GetProduct_end().GetProtpos().GetFrame() - 1;
2374  }
2375  else {
2376  _ASSERT(exon->GetProduct_start().IsNucpos());
2377  _ASSERT(exon->GetProduct_end().IsNucpos());
2378  prod_start = exon->GetProduct_start().GetNucpos();
2379  prod_end = exon->GetProduct_end().GetNucpos();
2380  }
// Drop trailing gap parts of the last exon; the adjustments mirror the
// ones made for the first exon (start/end roles are exchanged).
2381  while (IsExonGap(exon->GetParts().back()->Which()) ) {
2382  const CSpliced_exon_chunk& chunk = *exon->GetParts().back();
2383  if ( chunk.IsGenomic_ins() ) {
2384  if ( ex_gen_reverse ) {
2385  gen_start += chunk.GetGenomic_ins();
2386  }
2387  else {
2388  gen_end -= chunk.GetGenomic_ins();
2389  }
2390  }
2391  else {
2392  if ( ex_prod_reverse ) {
2393  prod_start += chunk.GetProduct_ins();
2394  }
2395  else {
2396  prod_end -= chunk.GetProduct_ins();
2397  }
2398  }
2399  exon->SetParts().pop_back();
2400  }
2401  exon->SetGenomic_start(gen_start);
2402  exon->SetGenomic_end(gen_end);
// NOTE(review): the condition on listing line 2403 is missing (protein
// versus nucleotide product).
2404  exon->SetProduct_start().SetProtpos().SetAmin(prod_start/3);
2405  exon->SetProduct_start().SetProtpos().SetFrame(prod_start%3 + 1);
2406  exon->SetProduct_end().SetProtpos().SetAmin(prod_end/3);
2407  exon->SetProduct_end().SetProtpos().SetFrame(prod_end%3 + 1);
2408  }
2409  else {
2410  exon->SetProduct_start().SetNucpos(prod_start);
2411  exon->SetProduct_end().SetNucpos(prod_end);
2412  }
2413  }
2414 
2415  // Try to propagate some properties to the alignment level.
2416  if ( !gen_id ) {
2417  // Don't try to use genomic id if not set
2418  single_gen_id = false;
2419  }
2420  if ( !prod_id ) {
2421  // Don't try to use product id if not set
2422  single_prod_id = false;
2423  }
2424  if ( single_gen_id ) {
2425  spliced.SetGenomic_id(const_cast<CSeq_id&>(*gen_id.GetSeqId()));
2426  }
2427  if (single_gen_str && gen_strand != eNa_strand_unknown) {
2428  spliced.SetGenomic_strand(gen_strand);
2429  }
2430  if ( single_prod_id ) {
2431  spliced.SetProduct_id(const_cast<CSeq_id&>(*prod_id.GetSeqId()));
2432  }
2433  if (single_prod_str && prod_strand != eNa_strand_unknown) {
2434  spliced.SetProduct_strand(prod_strand);
2435  }
2436  // Update bounds if defined in the original alignment.
// NOTE(review): listing lines 2438-2439 and 2441 are missing (the rest of
// the condition, the declaration of 'bounds' and the loop header). The
// surviving code clears 'bounds' and refills it with each location mapped
// through m_LocMapper; a location that maps to NULL is replaced by a copy
// of the original location.
2437  if (single_prod_id && single_gen_id && m_OrigAlign->IsSetBounds() &&
2440  bounds.clear();
2442  CRef<CSeq_loc> mapped_it = m_LocMapper.Map(**it);
2443  _ASSERT(mapped_it);
2444  if ( mapped_it->IsNull() ) {
2445  // Could not map the location
2446  mapped_it->Assign(**it);
2447  }
2448  bounds.push_back(mapped_it);
2449  }
2450  }
2451 
2452  // Reset local values where possible if the global ones are set.
2453  // Fill ids in gaps.
// NOTE(review): the loop header (listing line 2454) is missing; 'it'
// presumably iterates over the exons of 'spliced'.
2455  if ( single_gen_id ) {
2456  (*it)->ResetGenomic_id();
2457  }
2458  else if ( gen_id && !(*it)->IsSetGenomic_id() ) {
2459  // Use the first known genomic id to fill gaps.
2460  (*it)->SetGenomic_id(const_cast<CSeq_id&>(*gen_id.GetSeqId()));
2461  }
2462  if ( single_prod_id ) {
2463  (*it)->ResetProduct_id();
2464  }
2465  else if ( prod_id && !(*it)->IsSetProduct_id() ) {
2466  // Use the first known product id to fill gaps.
2467  (*it)->SetProduct_id(const_cast<CSeq_id&>(*prod_id.GetSeqId()));
2468  }
2469  if ( single_gen_str ) {
2470  (*it)->ResetGenomic_strand();
2471  }
2472  if ( single_prod_str ) {
2473  (*it)->ResetProduct_strand();
2474  }
2475  }
2476 
2477  if ( m_OrigAlign->GetSegs().IsSpliced() ) {
// NOTE(review): the declaration of 'orig' (listing line 2478) is missing;
// presumably it refers to the original alignment's spliced-seg.
2479  // Copy some values from the original alignment.
2480  if ( orig.IsSetPoly_a() ) {
2481  spliced.SetPoly_a(orig.GetPoly_a());
2482  }
2483  if ( orig.IsSetProduct_length() ) {
2484  spliced.SetProduct_length(orig.GetProduct_length());
2485  }
2486  // Some properties can be copied only if the alignment was not
2487  // truncated.
2488  if (!partial && orig.IsSetModifiers()) {
// NOTE(review): the start of the copy call (listing lines 2489-2490) is
// missing; only its arguments survived.
2491  orig.GetModifiers(), spliced.SetModifiers());
2492  }
2493  }
2494 }
2495 
2496 
2497 // Create sparse-seg alignment.
2499 {
2500  CSparse_seg& sparse = dst->SetSegs().SetSparse();
2501  if ( !m_SegsScores.empty() ) {
2502  // Copy scores (each element, not just pointers).
2503  CloneContainer<CScore, TScores, CSparse_seg::TRow_scores>(
2504  m_SegsScores, sparse.SetRow_scores());
2505  }
2507  sparse.SetRows().push_back(aln);
2508  aln->SetNumseg(static_cast<CSparse_align::TNumseg>(m_Segs.size()));
2509 
2510  CSeq_id_Handle first_idh;
2511  CSeq_id_Handle second_idh;
2512  size_t s = 0;
2513  // Check if all segments are related to the same group of scores.
2514  // Need two special values: -2 indicates that the scores group is
2515  // not yet set; -1 is used if there are segments with different
2516  // groups and scores should not be copied from the original align.
2517  ssize_t scores_group = -2; // -2 -- not yet set; -1 -- already reset.
2518  ITERATE(TSegments, seg, m_Segs) {
2519  if (seg->m_Rows.size() > 2) {
2520  NCBI_THROW(CAnnotMapperException, eBadAlignment,
2521  "Can not construct sparse-seg with more than two ids");
2522  }
2523  const SAlignment_Segment::SAlignment_Row& first_row = seg->m_Rows[0];
2524  const SAlignment_Segment::SAlignment_Row& second_row = seg->m_Rows[1];
2525 
2526  // Skip gaps.
2527  int first_start = first_row.GetSegStart();
2528  int second_start = second_row.GetSegStart();
2529  if (first_start < 0 || second_start < 0) {
2530  continue; // gap in one row
2531  }
2532 
2533  // All segments must have the same seq-id.
2534  if ( first_idh ) {
2535  if (first_idh != first_row.m_Id) {
2536  NCBI_THROW(CAnnotMapperException, eBadAlignment,
2537  "Can not construct sparse-seg with multiple ids per row");
2538  }
2539  }
2540  else {
2541  first_idh = first_row.m_Id;
2542  aln->SetFirst_id(const_cast<CSeq_id&>(*first_row.m_Id.GetSeqId()));
2543  }
2544  if ( second_idh ) {
2545  if (second_idh != second_row.m_Id) {
2546  NCBI_THROW(CAnnotMapperException, eBadAlignment,
2547  "Can not construct sparse-seg with multiple ids per row");
2548  }
2549  }
2550  else {
2551  second_idh = second_row.m_Id;
2552  aln->SetSecond_id(const_cast<CSeq_id&>(*second_row.m_Id.GetSeqId()));
2553  }
2554  // Check sequence types, adjust coordinates.
2555  bool first_prot = m_LocMapper.GetSeqTypeById(first_idh) ==
2557  bool second_prot = m_LocMapper.GetSeqTypeById(second_idh) ==
2559  int first_width = first_prot ? 3 : 1;
2560  int second_width = second_prot ? 3 : 1;
2561  // If at least one row is on a protein, lengths should be
2562  // in AAs, not bases.
2563  int len_width = (first_prot || second_prot) ? 3 : 1;
2564 
2565  aln->SetFirst_starts().push_back(first_start/first_width);
2566  aln->SetSecond_starts().push_back(second_start/second_width);
2567  aln->SetLens().push_back(seg->m_Len/len_width);
2568 
2569  // Set strands.
2570  if (aln->IsSetSecond_strands() ||
2571  first_row.m_IsSetStrand || second_row.m_IsSetStrand) {
2572  // Add missing strands to the container if necessary.
2573  for (size_t i = aln->SetSecond_strands().size(); i < s; i++) {
2574  aln->SetSecond_strands().push_back(eNa_strand_unknown);
2575  }
2576  ENa_strand first_strand = first_row.m_IsSetStrand ?
2577  first_row.m_Strand : eNa_strand_unknown;
2578  ENa_strand second_strand = second_row.m_IsSetStrand ?
2579  second_row.m_Strand : eNa_strand_unknown;
2580  aln->SetSecond_strands().push_back(IsForward(first_strand)
2581  ? second_strand : Reverse(second_strand));
2582  }
2583 
2584  // Check scores for consistency.
2585  if (scores_group == -2) { // not yet set
2586  scores_group = seg->m_ScoresGroupIdx;
2587  }
2588  else if (scores_group != seg->m_ScoresGroupIdx) {
2589  scores_group = -1; // reset
2590  }
2591  }
2592  // Copy scores if possible. All segments must be assigned to the same
2593  // group of scores.
2594  if (scores_group >= 0) {
2595  CloneContainer<CScore, TScores, CSparse_align::TSeg_scores>(
2596  m_GroupScores[scores_group], aln->SetSeg_scores());
2597  }
2598 }
2599 
2600 
2601 // When the mapped alignment can not be stored using the original
2602 // alignment type (e.g. most types do not allow multiple ids per row),
2603 // the whole mapped alignment is converted to a disc-align containing
2604 // several dense-segs. The following method attempts to put as many
2605 // mapped segments as possible to the dense-seg sub-alignment.
//
// 'dst'       - receives the dense-seg; it is reset to NULL if the selected
//               segments contain only gaps or if some row has only gaps.
// 'start_seg' - zero-based index of the first segment to convert.
// Returns the index of the first segment which was NOT consumed
// (last_seg + 1), or -1 if there is no segment with index 'start_seg'.
// NOTE(review): the first line of the signature and a few statements
// (listing lines 2606, 2610, 2709-2710, 2752) are missing from this listing.
2607  size_t start_seg) const
2608 {
2609  CDense_seg& dseg = dst->SetSegs().SetDenseg();
2611  dseg.SetDim(static_cast<CDense_seg::TDim>(m_Segs.front().m_Rows.size()));
2612 
// Lengths are divided by 3 if at least one row is a protein (see below).
2613  int len_width = 1;
2614 
2615  // First, find the requested segment. Since TSegments is a list, we
2616  // have to iterate over it and skip 'start_seg' items.
2617  TSegments::const_iterator start_seg_it = m_Segs.begin();
2618  for (size_t s = 0; s < start_seg && start_seg_it != m_Segs.end();
2619  s++, start_seg_it++) {
2620  }
2621  if (start_seg_it == m_Segs.end()) {
2622  return -1; // The requested segment does not exist.
2623  }
2624  const SAlignment_Segment& start_segment = *start_seg_it;
2625  // Remember number of rows in the first segment. Break the dense-seg
2626  // when the next segment has a different number of rows.
2627  size_t num_rows = start_segment.m_Rows.size();
// Index of the last segment to be used; reduced below whenever a row
// forces the dense-seg to end earlier.
2628  auto last_seg = m_Segs.size() - 1;
2629 
2630  // Find first non-gap in each row, get its seq-id, detect the first
2631  // one which is different. Also stop if number of rows per segment
2632  // changes. Collect all seq-ids.
2633  vector<CSeq_id_Handle> ids;
2634  TStrands strands(num_rows, eNa_strand_unknown);
2635  ids.resize(num_rows);
2636  for (size_t r = 0; r < num_rows; r++) {
2637  CSeq_id_Handle last_id;
2638  TSegments::const_iterator seg_it = start_seg_it;
2639  auto seg_idx = start_seg;
// [left, right) is the range covered so far by the non-gap segments of
// this row; -1 means nothing has been seen yet.
2640  int left = -1;
2641  int right = -1;
2642  for ( ; seg_idx <= last_seg && seg_it != m_Segs.end();
2643  seg_idx++, seg_it++) {
2644  // Check number of rows.
2645  if (seg_it->m_Rows.size() != num_rows) {
2646  // Adjust the last segment index.
2647  last_seg = seg_idx - 1;
2648  break;
2649  }
2650  const SAlignment_Segment::SAlignment_Row& row = seg_it->m_Rows[r];
2651  // Check ids.
2652  if (last_id && last_id != row.m_Id) {
2653  last_seg = seg_idx - 1;
2654  break;
2655  }
2656  if ( !last_id ) {
2657  last_id = row.m_Id;
2658  ids[r] = row.m_Id;
2659  }
2660  // Check strands and overlaps for non-gaps
2661  int seg_start = row.GetSegStart();
2662  int seg_stop = seg_start == -1 ? -1 : seg_start + seg_it->m_Len;
2663  if (seg_start != -1) {
2664  // Check strands
2665  if (strands[r] == eNa_strand_unknown) {
2666  if ( row.m_IsSetStrand ) {
2667  strands[r] = row.m_Strand;
2668  }
2669  }
2670  else {
2671  if ( !SameOrientation(strands[r], row.m_Strand) ) {
2672  last_seg = seg_idx - 1;
2673  break;
2674  }
2675  }
2676  // Check overlaps
2677  if (left == -1) {
2678  left = seg_start;
2679  right = seg_stop;
2680  }
2681  else {
// On the reverse strand the next segment must end at or before 'left';
// otherwise it must start at or after 'right'.
2682  if (row.m_IsSetStrand && IsReverse(row.m_Strand)) {
2683  if (seg_stop > left) {
2684  last_seg = seg_idx - 1;
2685  break;
2686  }
2687  left = seg_start;
2688  }
2689  else {
2690  if (seg_start < right) {
2691  last_seg = seg_idx - 1;
2692  break;
2693  }
2694  right = seg_stop;
2695  }
2696  }
2697  }
2698  }
2699  }
2700  // At least one segment may be used.
2701  _ASSERT(last_seg >= start_seg);
2702 
2703  // Now when number of rows is known, fill the ids.
2704  for (size_t i = 0; i < num_rows; i++) {
2705  CRef<CSeq_id> id(new CSeq_id);
2706  id->Assign(*ids[i].GetSeqId());
2707  dseg.SetIds().push_back(id);
2708  // Check sequence type and adjust length width.
// NOTE(review): the declaration of 'seq_type' (listing lines 2709-2710)
// is missing; presumably it is the sequence type of ids[i].
2711  if (seq_type == CSeq_loc_Mapper_Base::eSeq_prot) {
2712  len_width = 3;
2713  }
2714  }
2715 
2716  // Detect strands for all rows, they will be used for gaps.
2717  x_FillKnownStrands(strands);
2718  // Count number of non-gap segments in each row.
2719  // If a row has only gaps, the whole sub-alignment should be
2720  // discarded.
2721  vector<size_t> segs_per_row(num_rows, 0);
2722  // Count total number of segments added to the alignment
2723  // where at least one row is non-gap.
2724  int non_empty_segs = 0;
2725  auto cur_seg = start_seg;
2726  for (TSegments::const_iterator it = start_seg_it; it != m_Segs.end();
2727  ++it, ++cur_seg) {
2728  if (cur_seg > last_seg) {
2729  break;
2730  }
2731  // Check if at least one row in the current segment is non-gap.
2732  bool only_gaps = true;
2733  for (size_t row = 0; row < it->m_Rows.size(); row++) {
2734  if (it->m_Rows[row].m_Start != kInvalidSeqPos) {
2735  segs_per_row[row]++;
2736  only_gaps = false;
2737  }
2738  }
2739  if (only_gaps) continue; // ignore empty rows
2740 
2741  // Set segment length.
2742  dseg.SetLens().push_back(it->m_Len/len_width);
2743 
2744  size_t str_idx = 0;
2745  non_empty_segs++; // count segments added to the dense-seg
2746  // Now iterate all rows and add them to the dense-seg.
2747  ITERATE(SAlignment_Segment::TRows, row, it->m_Rows) {
2748  int width = 1;
2749  // Don't check sequence type if there are no proteins in the
2750  // used segments (len_width == 1).
// NOTE(review): the right-hand side of the comparison (listing line 2752)
// is missing; given 'width = 3' it presumably tests for a protein row.
2751  if (len_width == 3 && m_LocMapper.GetSeqTypeById(row->m_Id) ==
2753  width = 3;
2754  }
2755  int start = row->GetSegStart();
2756  if (start >= 0) {
2757  start /= width;
2758  }
2759  dseg.SetStarts().push_back(start);
2760  if (m_HaveStrands) { // Are per-alignment strands set?
2761  // For gaps use the strand of the first mapped row
// Non-gap rows with an unknown strand are stored as plus.
2762  dseg.SetStrands().
2763  push_back((TSeqPos)row->GetSegStart() != kInvalidSeqPos ?
2764  (row->m_Strand != eNa_strand_unknown ?
2765  row->m_Strand : eNa_strand_plus): strands[str_idx]);
2766  }
2767  str_idx++;
2768  }
2769  }
// NOTE(review): 'dseg' refers to an object owned through 'dst'; after
// dst.Reset() it must not be used, which is why it is only touched again
// under 'if ( dst )' below.
2770  if (non_empty_segs == 0) {
2771  // The sub-align contains only gaps in all rows, ignore it
2772  dst.Reset();
2773  }
2774  else {
2775  ITERATE(vector<size_t>, row, segs_per_row) {
2776  if (*row == 0) {
2777  // The row contains only gaps. Discard the sub-alignment.
2778  dst.Reset();
2779  break;
2780  }
2781  }
2782  }
2783  if ( dst ) {
2784  dseg.SetNumseg(non_empty_segs);
2785  }
// Index of the next segment to be processed by the caller.
2786  return last_seg + 1;
2787 }
2788 
2789 
2790 // If the original alignment type does not support some features of
2791 // the mapped alignment (multi-id rows, segments with different number
2792 // of rows etc.), convert it to disc-align with multiple dense-segs.
2794 {
2795  // Ignore m_SegsScores -- if we are here, they are probably not valid.
2796  // Anyway, there's no place to put them in. The same about m_AlignScores.
2797  CSeq_align_set::Tdata& data = dst->SetSegs().SetDisc().Set();
2798  ssize_t seg = 0;
2799  // The iteration stops when the last segment is converted or
2800  // when an error occurs and x_GetPartialDenseg returns -1.
2801  while (seg >= 0 && seg < (ssize_t)m_Segs.size()) {
2802  // Convert as many segments as possible to a single dense-seg.
2803  CRef<CSeq_align> dseg(new CSeq_align);
2804  seg = x_GetPartialDenseg(dseg, seg);
2805  if (!dseg) continue; // The sub-align had only gaps
2806  data.push_back(dseg);
2807  }
2808 }
2809 
2810 
2811 // Check if the mapped alignment contains different sequence types.
2813 {
2814  bool have_prot = false;
2815  bool have_nuc = false;
2816  ITERATE(TSegments, seg, m_Segs) {
2817  ITERATE(SAlignment_Segment::TRows, row, seg->m_Rows) {
2820  if (seqtype == CSeq_loc_Mapper_Base::eSeq_prot) {
2821  have_prot = true;
2822  }
2823  else /*if (seqtype == CSeq_loc_Mapper_Base::eSeq_nuc)*/ {
2824  // unknown == nuc
2825  have_nuc = true;
2826  }
2827  if (have_prot && have_nuc) return true;
2828  }
2829  }
2830  return false;
2831 }
2832 
2833 
2834  // Check if each row contains only one strand.
2836 {
2837  if ( m_Segs.empty() ) {
2838  return false;
2839  }
2840  vector<ENa_strand> strands(m_Segs.front().m_Rows.size(), eNa_strand_unknown);
2841  ITERATE(TSegments, seg, m_Segs) {
2842  for (size_t r = 0; r < seg->m_Rows.size(); ++r) {
2843  if (r >= strands.size()) {
2844  strands.resize(r, eNa_strand_unknown);
2845  }
2846  const SAlignment_Segment::SAlignment_Row& row = seg->m_Rows[r];
2847  // Skip gaps - they may have wrong strands.
2848  if (row.GetSegStart() == -1) {
2849  continue;
2850  }
2851  if (strands[r] == eNa_strand_unknown) {
2852  if ( row.m_IsSetStrand ) {
2853  strands[r] = row.m_Strand;
2854  }
2855  }
2856  else {
2857  if ( !SameOrientation(strands[r], row.m_Strand) ) {
2858  return true;
2859  }
2860  }
2861  }
2862  }
2863  return false;
2864 }
2865 
2866 
2868 {
2869  if ( !m_Segs.empty() ) {
2870  // Check if there's at least one segment with at least two rows.
2871  ITERATE(TSegments, seg, m_Segs) {
2872  if (seg->m_Rows.size() < 2) continue;
2873  int non_empty = 0;
2874  ITERATE(SAlignment_Segment::TRows, row, seg->m_Rows) {
2875  if (row->m_Start == kInvalidSeqPos) continue;
2876  if (++non_empty >= 2) {
2877  return false; // Found non-empty segment.
2878  }
2879  }
2880  }
2881  }
2882  // No non-empty segments. Check sub-mappers.
2883  ITERATE(TSubAligns, sub, m_SubAligns) {
2884  if ( !(*sub)->x_IsEmpty() ) return false;
2885  }
2886  // No non-empty segments or sub-mappers.
2887  return true;
2888 }
2889 
2890 
2891 // Get mapped alignment. In most cases the mapper tries to
2892 // preserve the original alignment type and copy as much
2893 // information as possible (scores, bounds etc.).
//
// The result is built on the first call and cached in m_DstAlign; later
// calls return the cached object.
// Throws CAnnotMapperException(eBadAlignment) if x_IsEmpty() reports that
// the mapping produced nothing.
// NOTE(review): the signature, the declaration of 'orig_choice', part of
// one condition and all 'case' labels are missing from this listing.
2895 {
2896  if (m_DstAlign) {
2897  // The mapped alignment has been created, just use it.
2898  return m_DstAlign;
2899  }
2900 
2901  if ( x_IsEmpty() ) {
2902  NCBI_THROW(CAnnotMapperException, eBadAlignment,
2903  "Mapping resulted in an empty alignment, "
2904  "can not initialize Seq-align.");
2905  }
2906 
2907  // Find first non-gap in each row, get its seq-id.
// Gap rows get the id of the closest known non-gap segment of the same
// row: the preceding one if any, otherwise the first one found ahead.
2908  TSegments::iterator seg = m_Segs.begin();
2909  vector<CSeq_id_Handle> row_ids;
2910  for ( ; seg != m_Segs.end(); ++seg) {
2911  if (row_ids.size() < seg->m_Rows.size()) {
2912  row_ids.resize(seg->m_Rows.size());
2913  }
2914  for (size_t r = 0; r < seg->m_Rows.size(); r++) {
2915  SAlignment_Segment::SAlignment_Row& row = seg->m_Rows[r];
2916  if (row.m_Start != kInvalidSeqPos) {
2917  // Remember seq-id used in the last non-gap segment
2918  row_ids[r] = row.m_Id;
2919  continue;
2920  }
2921  // Check if an id for this row is known
2922  if ( !row_ids[r] ) {
2923  // Try to look forward - find non-gap
2924  TSegments::iterator fwd = seg;
2925  ++fwd;
2926  for ( ; fwd != m_Segs.end(); ++fwd) {
2927  if (fwd->m_Rows.size() <= r) continue;
2928  SAlignment_Segment::SAlignment_Row& fwd_row = fwd->m_Rows[r];
2929  if (fwd_row.m_Start != kInvalidSeqPos) {
2930  row_ids[r] = fwd_row.m_Id;
2931  break;
2932  }
2933  }
2934  }
// If no id could be found at all the gap row keeps whatever id it had.
2935  if ( row_ids[r] ) {
2936  row.m_Id = row_ids[r];
2937  }
2938  }
2939  }
2940 
// NOTE(review): listing line 2941 is missing; 'orig_choice', used by the
// switch below, is presumably declared there.
2942 
2943  CRef<CSeq_align> dst(new CSeq_align);
2944  // Copy some information from the original alignment.
2945  dst->SetType(m_OrigAlign->GetType());
2946  if (m_OrigAlign->IsSetDim()) {
2947  dst->SetDim(m_OrigAlign->GetDim());
2948  }
2949  if ( !m_AlignScores.empty() ) {
2950  CloneContainer<CScore, TScores, CSeq_align::TScore>(
2951  m_AlignScores, dst->SetScore());
2952  }
2953  if (m_OrigAlign->IsSetBounds()) {
2954  CloneContainer<CSeq_loc, CSeq_align::TBounds, CSeq_align::TBounds>(
2955  m_OrigAlign->GetBounds(), dst->SetBounds());
2956  }
2957  if (m_OrigAlign->IsSetId()) {
2958  CloneContainer<CObject_id, CSeq_align::TId, CSeq_align::TId>(
2959  m_OrigAlign->GetId(), dst->SetId());
2960  }
2961  if (m_OrigAlign->IsSetExt()) {
2962  CloneContainer<CUser_object, CSeq_align::TExt, CSeq_align::TExt>(
2963  m_OrigAlign->GetExt(), dst->SetExt());
2964  }
2965  if ( x_HaveMixedSeqTypes() ) {
2966  // Only std and spliced can support mixed sequence types.
2967  // Since spliced-segs are mapped in a different way (through
2968  // sub-mappers which return mapped exons rather than whole alignments),
2969  // here we should always use std-seg.
// NOTE(review): the comment above says "always use std-seg", but the code
// below does pick spliced-seg for two-row alignments whose rows have
// different sequence types. The first part of the condition (listing line
// 2970) is missing.
2971  row_ids.size() == 2 &&
2972  m_LocMapper.GetSeqTypeById(row_ids[0]) != m_LocMapper.GetSeqTypeById(row_ids[1])) {
2973  // Try to use spliced-seg for mixed-type pairwise alignments.
2974  x_GetDstSpliced(dst);
2975  }
2976  else {
2977  x_GetDstStd(dst);
2978  }
2979  }
2980  /*
2981  // Commented out as it looks to be wrong approach - it discards scores and
2982  // changes seq-align type.
2983 
2984  // Even with mixed strand, do not convert std-segs - they can hold mixed
2985  // strands without any problems.
2986  else if (x_HaveMixedStrand() && orig_choice != CSeq_align::TSegs::e_Std) {
2987  x_ConvToDstDisc(dst);
2988  }
2989 
2990  */
2991  else {
2992  // Get the proper mapped alignment. Some types still may need
2993  // to be converted to disc-seg.
// NOTE(review): every 'case' label is missing from this listing. Which
// original segment type each branch serves can only be guessed from the
// x_GetDst*() method it calls.
2994  switch ( orig_choice ) {
2996  {
2997  x_GetDstDendiag(dst);
2998  break;
2999  }
3001  {
// Any flags other than eAlign_Normal force the disc-align fallback.
3002  if (m_AlignFlags == eAlign_Normal) {
3003  x_GetDstDenseg(dst);
3004  }
3005  else {
3006  x_ConvToDstDisc(dst);
3007  }
3008  break;
3009  }
3011  {
3012  x_GetDstStd(dst);
3013  break;
3014  }
3016  {
3017  if (m_AlignFlags == eAlign_Normal) {
3018  x_GetDstPacked(dst);
3019  }
3020  else {
3021  x_ConvToDstDisc(dst);
3022  }
3023  break;
3024  }
3026  {
3027  x_GetDstDisc(dst);
3028  break;
3029  }
3031  {
3032  x_GetDstSpliced(dst);
3033  break;
3034  }
3036  {
3037  x_GetDstSparse(dst);
3038  break;
3039  }
3040  default:
3041  {
3042  // Unknown original type, just copy the original alignment.
3043  dst->Assign(*m_OrigAlign);
3044  break;
3045  }
3046  }
3047  }
// Cache the result for subsequent calls.
3048  return m_DstAlign = dst;
3049 }
3050 
3051 
3054 {
// NOTE(review): the signature lines (listing 3052-3053) are missing here.
// The returned object is allocated with a plain 'new'; how the caller takes
// ownership is not visible in this function.
3055  // Create a sub-mapper instance for the given sub-alignment.
3056  return new CSeq_align_Mapper_Base(align, m_LocMapper);
3057 }
3058 
3059 
3062  const CSpliced_exon& exon)
3063 {
// NOTE(review): the first signature lines (listing 3060-3061) and the
// constructor call that initializes 'sub' (listing 3066) are missing.
3064  // Create a sub-mapper instance for the exon.
// The unique_ptr keeps the new mapper owned while InitExon() runs;
// release() then hands the raw pointer to the caller.
3065  unique_ptr<CSeq_align_Mapper_Base> sub(
3067  sub->InitExon(spliced, exon);
3068  return sub.release();
3069 }
3070 
3071 
3073 {
3074  if ( m_Segs.empty() || idx >= m_Segs.begin()->m_Rows.size() ) {
3075  NCBI_THROW(CAnnotMapperException, eOtherError,
3076  "Invalid row index");
3077  }
3078  return m_Segs.begin()->m_Rows[idx].m_Id;
3079 }
3080 
3081 
3084 {
3085  // Reset all scores which are related to the segment including
3086  // all higher-level scores. This is done when a segment is truncated
3087  // and scores become invalid.
3088  m_ScoresInvalidated = true;
3089  // Invalidate all global scores
3090  m_AlignScores.clear();
3091  m_SegsScores.clear();
3092  if ( seg ) {
3093  // Invalidate segment-related scores
3094  seg->m_Scores.clear();
3095  seg->m_ScoresGroupIdx = -1;
3096  }
3097 }
3098 
3099 
3102 
bool IsForward(ENa_strand s)
Definition: Na_strand.hpp:68
bool IsReverse(ENa_strand s)
Definition: Na_strand.hpp:75
ENa_strand Reverse(ENa_strand s)
Definition: Na_strand.hpp:90
bool SameOrientation(ENa_strand a, ENa_strand b)
Definition: Na_strand.hpp:83
Seq-loc and seq-align mapper exceptions.
CPacked_seg –.
Definition: Packed_seg.hpp:66
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
Class used to map seq-alignments.
CSeq_loc_Mapper_Base & m_LocMapper
void x_PushExonPart(CRef< CSpliced_exon_chunk > &last_part, CSpliced_exon_chunk::E_Choice part_type, int part_len, CSpliced_exon &exon) const
void x_ConvToDstDisc(CRef< CSeq_align > &dst) const
CSeq_align::C_Segs::TDendiag TDendiag
void x_GetDstSparse(CRef< CSeq_align > &dst) const
CRef< CSeq_align > GetDstAlign(void) const
Create mapped alignment.
CSeq_align_Mapper_Base(const CSeq_align &align, CSeq_loc_Mapper_Base &loc_mapper)
void Convert(void)
Map the whole alignment through the linked seq-loc mapper.
void x_InvalidateScores(SAlignment_Segment *seg=NULL)
void x_GetDstDenseg(CRef< CSeq_align > &dst) const
void x_GetDstDisc(CRef< CSeq_align > &dst) const
vector< CRef< CSeq_align_Mapper_Base > > TSubAligns
void x_FillKnownStrands(TStrands &strands) const
bool x_GetDstExon(CSpliced_seg &spliced, TSegments::const_iterator &seg, CSeq_id_Handle &gen_id, CSeq_id_Handle &prod_id, ENa_strand &gen_strand, ENa_strand &prod_strand, bool &last_exon_partial, const CSeq_id_Handle &last_gen_id, const CSeq_id_Handle &last_prod_id) const
SAlignment_Segment & x_InsertSeg(TSegments::iterator &where, int len, size_t dim, bool reverse)
void InitExon(const CSpliced_seg &spliced, const CSpliced_exon &exon)
void x_GetDstDendiag(CRef< CSeq_align > &dst) const
CConstRef< CSpliced_exon > m_OrigExon
vector< ENa_strand > TStrands
void x_GetDstSplicedSubAlign(CSpliced_seg &spliced, const CSeq_align_Mapper_Base &sub_align, bool &last_exon_partial, CSeq_id_Handle &gen_id, CSeq_id_Handle &last_gen_id, bool &single_gen_id, ENa_strand &gen_strand, bool &single_gen_str, CSeq_id_Handle &prod_id, CSeq_id_Handle &last_prod_id, bool &single_prod_id, ENa_strand &prod_strand, bool &single_prod_str, bool &partial) const
void x_GetDstStd(CRef< CSeq_align > &dst) const
CConstRef< CSeq_align > m_OrigAlign
CSeq_align::C_Segs::TStd TStd
ssize_t x_GetPartialDenseg(CRef< CSeq_align > &dst, size_t start_seg) const
void x_Init(const CSeq_align &align)
const CSeq_id_Handle & GetRowId(size_t idx) const
Get seq-id for the given row.
virtual CSeq_align_Mapper_Base * CreateSubAlign(const CSeq_align &align)
void x_GetDstPacked(CRef< CSeq_align > &dst) const
void x_GetDstSpliced(CRef< CSeq_align > &dst) const
CSeq_id_Handle x_ConvertSegment(TSegments::iterator &seg_it, size_t row)
SAlignment_Segment & x_PushSeg(int len, size_t dim, ENa_strand strand=eNa_strand_unknown)
list< SAlignment_Segment > TSegments
CSeq_loc_Mapper_Base –.
CSpliced_exon_chunk –.
CSpliced_seg_modifier –.
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
bool empty() const
Definition: set.hpp:133
static const char * bounds[]
#define T(s)
Definition: common.h:230
#define false
Definition: bool.h:36
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
CConstRef< CSeq_id > GetSeqId(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
void SetEmpty(TEmpty &v)
Definition: Seq_loc.hpp:981
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
ESeqType GetSeqTypeById(const CSeq_id_Handle &idh) const
Methods for getting sequence types, use cached types (m_SeqTypes) if possible.
const TIdMap & GetIdMap() const
void SetSeqTypeById(const CSeq_id_Handle &idh, ESeqType seqtype) const
Methods for setting sequence types.
bool x_IsSetMiscFlag(EMiscFlags flag) const
static TSeqPos sx_GetExonPartLength(const CSpliced_exon_chunk &part)
CRef< CMappingRanges > m_Mappings
const CSeq_id_Handle & x_GetPrimaryId(const CSeq_id_Handle &synonym) const
TIdMap::const_iterator TIdIterator
void x_AdjustSeqTypesToProt(const CSeq_id_Handle &idh)
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
position_type GetLength(void) const
Definition: range.hpp:158
const_iterator begin(void) const
Definition: rangemap.hpp:451
bool empty(void) const
Definition: rangemap.hpp:428
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TScores & GetScores(void) const
Get the Scores member data.
Definition: Std_seg_.hpp:382
bool IsSetProduct_strand(void) const
should be 'plus' or 'minus' Check if a value has been assigned to Product_strand data member.
const TDonor_after_exon & GetDonor_after_exon(void) const
Get the Donor_after_exon member data.
const TExt & GetExt(void) const
Get the Ext member data.
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_align_.hpp:976
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
TModifiers & SetModifiers(void)
Assign a value to Modifiers data member.
TId & SetId(void)
Assign a value to Id data member.
Definition: Seq_align_.hpp:982
bool IsSetProduct_type(void) const
Check if a value has been assigned to Product_type data member.
TScore & SetScore(void)
Assign a value to Score data member.
Definition: Seq_align_.hpp:902
vector< CRef< CSeq_loc > > TLoc
Definition: Std_seg_.hpp:93
const TScores & GetScores(void) const
Get the Scores member data.
TRow_scores & SetRow_scores(void)
Assign a value to Row_scores data member.
bool IsProtpos(void) const
Check if variant Protpos is selected.
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
bool IsSetBounds(void) const
regions of sequence over which align was computed. Check if a value has been assigned to Bounds data member.
Definition: Seq_align_.hpp:939
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_align_.hpp:691
bool IsSetParts(void) const
basic segments always are in biologic order. Check if a value has been assigned to Parts data member.
list< CRef< CStd_seg > > TStd
Definition: Seq_align_.hpp:196
TPresent & SetPresent(void)
Assign a value to Present data member.
bool IsSetId(void) const
alignment id Check if a value has been assigned to Id data member.
Definition: Seq_align_.hpp:964
bool IsSetStrands(void) const
Check if a value has been assigned to Strands data member.
Definition: Dense_seg_.hpp:568
TProduct_ins & SetProduct_ins(void)
Select the variant.
void SetProduct_id(TProduct_id &value)
Assign a value to Product_id data member.
E_Choice
Choice variants.
Definition: Seq_align_.hpp:131
bool IsSetExt(void) const
extra info Check if a value has been assigned to Ext data member.
Definition: Seq_align_.hpp:989
bool IsSetProduct_strand(void) const
should be 'plus' or 'minus' Check if a value has been assigned to Product_strand data member.
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TStarts & GetStarts(void) const
Get the Starts member data.
TStarts & SetStarts(void)
Assign a value to Starts data member.
TDim GetDim(void) const
Get the Dim member data.
Definition: Seq_align_.hpp:856
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
bool IsSetScores(void) const
score for each seg Check if a value has been assigned to Scores data member.
Definition: Dense_seg_.hpp:593
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
const TAcceptor_before_exon & GetAcceptor_before_exon(void) const
Get the Acceptor_before_exon member data.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TDiag & SetDiag(void)
Select the variant.
bool IsSetAcceptor_before_exon(void) const
splice sites Check if a value has been assigned to Acceptor_before_exon data member.
TExons & SetExons(void)
Assign a value to Exons data member.
void SetProduct_strand(TProduct_strand value)
Assign a value to Product_strand data member.
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Std_seg_.hpp:332
TScores & SetScores(void)
Assign a value to Scores data member.
TScores & SetScores(void)
Assign a value to Scores data member.
Definition: Dense_seg_.hpp:611
bool IsSetScores(void) const
score for each segment Check if a value has been assigned to Scores data member.
TNumseg GetNumseg(void) const
Get the Numseg member data.
void SetProduct_length(TProduct_length value)
Assign a value to Product_length data member.
TLen GetLen(void) const
Get the Len member data.
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TRows & SetRows(void)
Assign a value to Rows data member.
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Dense_seg_.hpp:427
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
bool IsSetStrands(void) const
Check if a value has been assigned to Strands data member.
void SetDim(TDim value)
Assign a value to Dim data member.
bool IsSetGenomic_strand(void) const
genomic-strand represents the strand of translation. Check if a value has been assigned to Genomic_strand data member.
const TScores & GetScores(void) const
Get the Scores member data.
bool IsSetProduct_id(void) const
product is either protein or transcript (cDNA) Check if a value has been assigned to Product_id data ...
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
void ResetDonor_after_exon(void)
Reset Donor_after_exon data member.
TMismatch & SetMismatch(void)
Select the variant.
TDim GetDim(void) const
Get the Dim member data.
const TIds & GetIds(void) const
Get the Ids member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
bool IsSetProduct_id(void) const
product is either protein or transcript (cDNA) Check if a value has been assigned to Product_id data ...
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
bool IsSetIds(void) const
Check if a value has been assigned to Ids data member.
Definition: Std_seg_.hpp:320
TIds & SetIds(void)
Assign a value to Ids data member.
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
list< CRef< CSpliced_seg_modifier > > TModifiers
const TLens & GetLens(void) const
Get the Lens member data.
list< CRef< CSeq_loc > > TBounds
Definition: Seq_align_.hpp:400
bool IsGenomic_ins(void) const
Check if variant Genomic_ins is selected.
const TPacked & GetPacked(void) const
Get the variant data.
Definition: Seq_align_.cpp:175
bool IsSetExt(void) const
extra info Check if a value has been assigned to Ext data member.
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
TExt & SetExt(void)
Assign a value to Ext data member.
bool IsSetScores(void) const
Check if a value has been assigned to Scores data member.
Definition: Std_seg_.hpp:370
bool IsSetGenomic_strand(void) const
Check if a value has been assigned to Genomic_strand data member.
const TStd & GetStd(void) const
Get the variant data.
Definition: Seq_align_.hpp:752
const TScores & GetScores(void) const
Get the Scores member data.
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
void SetPartial(TPartial value)
Assign a value to Partial data member.
const TIds & GetIds(void) const
Get the Ids member data.
void SetProduct_type(TProduct_type value)
Assign a value to Product_type data member.
TStrands & SetStrands(void)
Assign a value to Strands data member.
Definition: Dense_seg_.hpp:586
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
void SetPoly_a(TPoly_a value)
Assign a value to Poly_a data member.
TDim GetDim(void) const
Get the Dim member data.
Definition: Std_seg_.hpp:295
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
const TDendiag & GetDendiag(void) const
Get the variant data.
Definition: Seq_align_.hpp:726
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
bool IsSetStrands(void) const
Check if a value has been assigned to Strands data member.
const TPresent & GetPresent(void) const
Get the Present member data.
TGenomic_ins & SetGenomic_ins(void)
Select the variant.
void ResetAcceptor_before_exon(void)
Reset Acceptor_before_exon data member.
TType GetType(void) const
Get the Type member data.
Definition: Seq_align_.hpp:809
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
TParts & SetParts(void)
Assign a value to Parts data member.
void SetGenomic_id(TGenomic_id &value)
Assign a value to Genomic_id data member.
bool IsSetDim(void) const
dimensionality Check if a value has been assigned to Dim data member.
Definition: Seq_align_.hpp:837
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
const TStrands & GetStrands(void) const
Get the Strands member data.
const TRow_scores & GetRow_scores(void) const
Get the Row_scores member data.
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
const TExt & GetExt(void) const
Get the Ext member data.
const TStrands & GetStrands(void) const
Get the Strands member data.
list< CRef< CSpliced_exon_chunk > > TParts
TMatch & SetMatch(void)
Select the variant.
bool IsSetScore(void) const
for whole alignment Check if a value has been assigned to Score data member.
Definition: Seq_align_.hpp:884
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
const TStarts & GetStarts(void) const
Get the Starts member data.
void SetGenomic_strand(TGenomic_strand value)
Assign a value to Genomic_strand data member.
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
list< CRef< CSeq_align > > Tdata
const TSparse & GetSparse(void) const
Get the variant data.
Definition: Seq_align_.cpp:241
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
bool IsSetGenomic_id(void) const
Check if a value has been assigned to Genomic_id data member.
bool IsProduct_ins(void) const
Check if variant Product_ins is selected.
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:896
TBounds & SetBounds(void)
Assign a value to Bounds data member.
Definition: Seq_align_.hpp:957
TDim GetDim(void) const
Get the Dim member data.
const TScores & GetScores(void) const
Get the Scores member data.
Definition: Dense_seg_.hpp:605
const TRows & GetRows(void) const
Get the Rows member data.
TProduct_ins GetProduct_ins(void) const
Get the variant data.
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const Tdata & Get(void) const
Get the member data.
TStrands & SetStrands(void)
Assign a value to Strands data member.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsSetScores(void) const
Check if a value has been assigned to Scores data member.
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
bool IsSetDonor_after_exon(void) const
Check if a value has been assigned to Donor_after_exon data member.
bool IsSetRow_scores(void) const
per-row scores Check if a value has been assigned to Row_scores data member.
TLens & SetLens(void)
Assign a value to Lens data member.
const TBounds & GetBounds(void) const
Get the Bounds member data.
Definition: Seq_align_.hpp:951
bool IsSetGenomic_id(void) const
Check if a value has been assigned to Genomic_id data member.
bool IsSetScores(void) const
scores for this exon Check if a value has been assigned to Scores data member.
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Diag
both sequences are represented, there is sufficient similarity between product and genomic sequences....
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_not_set
No variant selected.
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const TPnt & GetPnt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:238
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
Definition: Seq_point_.hpp:331
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_Empty
to NULL one Seq-id in a collection
Definition: Seq_loc_.hpp:99
@ e_Int
from to
Definition: Seq_loc_.hpp:101
@ e_Whole
whole sequence
Definition: Seq_loc_.hpp:100
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
Definition of all error codes used in objects libraries.
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
@ e_not_set
int i
int len
constexpr auto sort(_Init &&init)
int ssize_t
Definition: ncbiconf_msvc.h:93
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static bool GetSeqId(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
bool IsExonGap(CSpliced_exon_chunk::E_Choice chunk_type)
void CopyContainer(const C1 &src, C2 &dst)
void SetPartLength(CSpliced_exon_chunk &part, CSpliced_exon_chunk::E_Choice ptype, TSeqPos len)
void CloneContainer(const C1 &src, C2 &dst)
#define row(bind, expected)
Definition: string_bind.c:73
Single row of a single alignment segment.
TSeqPos m_Start
kInvalidSeqPos means gap
int GetSegStart(void) const
Get segment start or -1 if it's a gap.
bool m_IsSetStrand
Is strand set for the row?
bool SameStrand(const SAlignment_Row &r) const
Check if the query row has the same strand orientation.
bool m_Mapped
Flag indicating mapped rows.
void SetMapped(void)
Mark the row as mapped.
Structure to hold information about a single alignment segment.
vector< SAlignment_Row > TRows
TRows m_Rows
Segment rows.
SAlignment_Row & CopyRow(size_t idx, const SAlignment_Row &src_row)
Create a copy of the given row, store it in this segment as row number 'idx'.
TScores m_Scores
Scores for this segment.
ssize_t m_ScoresGroupIdx
Group of scores.
bool m_HaveStrands
Do at least some rows have strand set?
int m_Len
Segment length.
SAlignment_Row & GetRow(size_t idx)
Get row data with the given index.
SAlignment_Row & AddRow(size_t idx, const CSeq_id &id, int start, bool is_set_strand, ENa_strand strand)
Add new row.
int m_GroupIdx
Group of segments (e.g.
#define _ASSERT
Modified on Fri Apr 26 16:25:23 2024 by modify_doxy.py rev. 669887