NCBI C++ ToolKit
project_exons.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: project_exons.cpp 77281 2017-04-07 16:21:08Z chetvern $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alex Astashyn
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
34 #include <corelib/ncbitime.hpp>
36 #include <objmgr/scope.hpp>
37 #include <objmgr/bioseq_handle.hpp>
38 #include <objmgr/annot_ci.hpp>
39 #include <objmgr/feat_ci.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include <objmgr/util/feature.hpp>
43 #include <objmgr/seq_vector.hpp>
44 
65 #include <objects/seq/seq__.hpp>
68 
71 
72 
73 /// Recursively convert empty container-locs to null-locs,
74 /// drop null sublocs from containers, and unwrap singleton containers
76 {
77  if(loc.IsMix()) {
79  Canonicalize(**it);
80  }
81  //erase NULL sublocs
82  CSeq_loc::TMix::Tdata::iterator dest = loc.SetMix().Set().begin();
84  CRef<CSeq_loc> subloc = *it;
85  if(!subloc->IsNull()) {
86  *dest = subloc;
87  dest++;
88  }
89  }
90  loc.SetMix().Set().erase(dest, loc.SetMix().Set().end());
91 
92  if(loc.GetMix().Get().size() == 1) {
93  CRef<CSeq_loc> content_loc = loc.SetMix().Set().front();
94  loc.Assign(*content_loc);
95  } else if(loc.GetMix().Get().size() == 0) {
96  loc.SetNull();
97  }
98  } else if(loc.IsPacked_int()) {
99  if(loc.GetPacked_int().Get().size() == 1) {
100  CRef<CSeq_interval> seq_int = loc.GetPacked_int().Get().front();
101  loc.SetInt(*seq_int);
102  } else if(loc.GetPacked_int().Get().size() == 0) {
103  loc.SetNull();
104  }
105  }
106 }
107 
108 
109 /// Return true iff the two exons abut on the query (in nucpos-coords)
111  const CSpliced_exon& exon2)
112 {
113  TSeqPos max_start = max(exon1.GetProduct_start().AsSeqPos(),
114  exon2.GetProduct_start().AsSeqPos());
115 
116  TSeqPos min_stop = min(exon1.GetProduct_end().AsSeqPos(),
117  exon2.GetProduct_end().AsSeqPos());
118 
119  return max_start == min_stop + 1;
120 }
121 
122 /// ::first and ::second indicate partialness
123 /// of a loc or an exon, at the 5' and 3' end respectively.
124 typedef pair<bool, bool> T53Partialness;
125 
126 /// Return whether 5' and/or 3' end of exon is partial based on
127 /// consensus splicing with upstream/downstream exons.
129  const CSpliced_exon& target_exon)
130 {
131  bool is_5p_partial(false);
132  bool is_3p_partial(false);
133 
134  CConstRef<CSpliced_exon> prev_exon;
135  ITERATE(CSpliced_seg::TExons, it, spliced_aln.GetSegs().GetSpliced().GetExons()) {
136  CConstRef<CSpliced_exon> current_exon = *it;
137  if( !prev_exon
138  || AreAbuttingOnProduct(*prev_exon, *current_exon))
139  {
140  prev_exon = current_exon;
141  continue;
142  }
143 
144  //gap between exons. Determine which exon is partial based on consensus splice
145  const bool is_consensus_donor =
146  prev_exon->IsSetDonor_after_exon()
147  && prev_exon->GetDonor_after_exon().GetBases() == "GT";
148 
149  const bool is_consensus_acceptor =
150  current_exon->IsSetAcceptor_before_exon()
151  && current_exon->GetAcceptor_before_exon().GetBases() == "AG";
152 
153  if(current_exon == CConstRef<CSpliced_exon>(&target_exon)
154  && (!is_consensus_acceptor || is_consensus_donor))
155  {
156  is_5p_partial = true;
157  }
158 
159  if(prev_exon == CConstRef<CSpliced_exon>(&target_exon)
160  && (!is_consensus_donor || is_consensus_acceptor))
161  {
162  is_3p_partial = true;
163  }
164 
165  prev_exon = current_exon;
166  }
167 
168  return T53Partialness(is_5p_partial, is_3p_partial);
169 }
170 
171 size_t GetUnalignedLength_3p(const CSeq_align& spliced_aln)
172 {
173  return spliced_aln.GetSegs().GetSpliced().IsSetPoly_a()
174  ? 0
175 
176  : spliced_aln.GetSeqStrand(0) == eNa_strand_minus
177  ? spliced_aln.GetSeqStart(0)
178 
179  : spliced_aln.GetSegs().GetSpliced().IsSetProduct_length()
180  ? spliced_aln.GetSegs().GetSpliced().GetProduct_length()
181  - spliced_aln.GetSeqStop(0) - 1
182 
183  : 0;
184 }
185 
186 size_t GetUnalignedLength_5p(const CSeq_align& spliced_aln)
187 {
188  return spliced_aln.GetSeqStrand(0) != eNa_strand_minus
189  ? spliced_aln.GetSeqStart(0)
190 
191  : spliced_aln.GetSegs().GetSpliced().IsSetProduct_length()
192  ? spliced_aln.GetSegs().GetSpliced().GetProduct_length()
193  - spliced_aln.GetSeqStop(0) - 1
194 
195  : 0;
196 }
197 
198 /// Return whether 5' and/or 3' end of exons-loc is partial
199 /// based on unaligned tails in case of RNA,
200 /// or overlap of product-cds-loc with unaligned tails, in case of CDS.
202  const CSeq_align& spliced_aln,
203  CConstRef<CSeq_loc> product_cds_loc,
204  size_t unaligned_ends_partialness_thr)
205 {
206  T53Partialness partialness(false, false);
207  if(!product_cds_loc) {
208  // For RNA set partialness based on whether the unaligned
209  // ends are longer than unaligned_ends_partialness_thr
210  partialness.first = GetUnalignedLength_5p(spliced_aln)
211  > unaligned_ends_partialness_thr;
212 
213  partialness.second = GetUnalignedLength_3p(spliced_aln)
214  > unaligned_ends_partialness_thr;
215  } else {
216  // cds-exons 5p/3p-terminal partialness is based on whether the
217  // product-cds-loc terminals are covered by the alignment.
218  TSeqPos cds_start = product_cds_loc->GetStart(eExtreme_Positional);
219  TSeqPos cds_stop = product_cds_loc->GetStop(eExtreme_Positional);
220  if (spliced_aln.GetSegs().GetSpliced().GetProduct_type()
222  cds_start *= 3;
223  cds_stop = cds_stop*3+2;
224  }
225 
226  bool start_covered = false;
227  bool stop_covered = false;
229  spliced_aln.GetSegs().GetSpliced().GetExons())
230  {
231  const CSpliced_exon& exon = **it;
232  start_covered |= cds_start >= exon.GetProduct_start().AsSeqPos()
233  && cds_start <= exon.GetProduct_end ().AsSeqPos();
234 
235  stop_covered |= cds_stop >= exon.GetProduct_start().AsSeqPos()
236  && cds_stop <= exon.GetProduct_end ().AsSeqPos();
237  }
238 
239  partialness.first = !start_covered
240  || product_cds_loc->IsPartialStart(eExtreme_Positional);
241 
242  partialness.second = !stop_covered
243  || product_cds_loc->IsPartialStop(eExtreme_Positional);
244 
245  if(spliced_aln.GetSeqStrand(0) == eNa_strand_minus) {
246  swap(partialness.first, partialness.second);
247  }
248  }
249  return partialness;
250 }
251 
253 {
254  if(partialness.first) {
256  }
257  if(partialness.second) {
259  }
260 }
261 
262 
263 #if 1
264 
265 /*
266  * GP-13080
267  *
268  * Tweak the biostops of projected exon's subintervals
269  * while preserving the reading frame.
270  *
271  * Stitch all non-frameshifting indels.
272  *
273  * Truncate all overlaps and close all gaps down to 1 or 2 bp gap
274  * as to preserve the frame.
275  *
276  * Note:
277  * This approach suffers from some problems - we could be truncating
278  * bases in perfectly good codons instead of truncating
279  * bases strictly from affected codons that contain product-ins.
280  * An alternative approach is possible, but will require rewriting
281  * the whole projection code from scratch:
282  *
283  * Alternative approach:
284  * Implement a function (spliced-seg, product-cds-loc) -> tweaked-product-cds
285  * where the output CDS has only non-frameshifting gaps and has lossless
286  * mapping through the spliced-seg.
287  * (i.e. project gaps onto query and tweak them to codon boundaries;
288  * pay special attention to ribosomal-slippage case).
289  *
290  * When projecting CDS, use tweaked-product-cds to map to genome, and
291  * collapse mod-3 exonic gaps.
292  *
293  * When projecting mRNA, replace the cds-range on mRNA with
294  * tweaked-product-cds, and map like CDS. Afterwards, collapse all
295  * indels UTRs.
296  *
297  * When projecting RNA, map the product-range and collase all indels.
298  *
299  */
301 {
302 public:
303  static CRef<CSeq_loc> TweakExon(const CSeq_loc& orig_loc, bool convert_overlaps)
304  {
305  if(!orig_loc.IsPacked_int()) {
306  NCBI_THROW(CException, eUnknown, "Expected packed-int");
307  }
308 
309  CRef<CSeq_loc> loc = Clone(orig_loc);
310 
311  // Note: Adjusting-biostops is done twice: first time
312  // before subsuming micro-intervals (as they're not
313  // 'micro' prior to this), and after subsuming, which
314  // may have elongated some intervals, while the
315  // next step expects upper bound of 2bp on all overlaps.
316 
317  if(false /*orig_loc.GetStart(eExtreme_Positional) == 1988953*/) {
318  NcbiCerr << "Orig: " << AsString(loc->GetPacked_int()) << "\n";
319  AdjustBiostops (loc->SetPacked_int());
320  NcbiCerr << "Adj1: " << AsString(loc->GetPacked_int()) << "\n";
322  NcbiCerr << "Subs: " << AsString(loc->GetPacked_int()) << "\n";
323  AdjustBiostops (loc->SetPacked_int());
324  NcbiCerr << "Adj2: " << AsString(loc->GetPacked_int()) << "\n";
325  if (convert_overlaps) {
327  NcbiCerr << "Ovlp: " << AsString(loc->GetPacked_int()) << "\n";
328  }
330  NcbiCerr << "Final: " << AsString(loc->GetPacked_int()) << "\n";
331  NcbiCerr << "Tweaked: "
332  << (orig_loc.Equals(*loc) ? "equal" : "not-equal")
333  << "\n\n";
334  } else {
335  AdjustBiostops (loc->SetPacked_int());
337  AdjustBiostops (loc->SetPacked_int());
338  if (convert_overlaps) {
340  }
342  }
343 
344  Validate(orig_loc, *loc);
345  return loc;
346  }
347 
348  template<typename T>
349  static CRef<T> Clone(const T& x)
350  {
351  return CRef<T>(SerialClone<T>(x));
352  }
353 
354 
355  // advance iterator by d (can be negative);
356  // if out of bounds, assign end.
357  template<typename iterator_t>
358  static void safe_advance(
359  iterator_t begin,
360  iterator_t end,
361  iterator_t& it,
362  Int8 d)
363  {
364  if( (d < 0 && distance(begin, it) < abs(d))
365  || (d > 0 && distance(it, end) < abs(d)))
366  {
367  it = end;
368  } else {
369  std::advance(it, d);
370  }
371  }
372 
373 
374  // return element at iterator advanced by d.
375  // if out of bounds, return sentinel value
376  template<typename container_t>
378  const container_t& container,
379  typename container_t::const_iterator it,
380  Int8 delta,
381  typename container_t::value_type default_value)
382  {
383  safe_advance(container.begin(),
384  container.end(),
385  it, delta);
386 
387  return it == container.end() ? default_value : *it;
388  }
389 
390 
392  const CPacked_seqint::Tdata& seqints,
393  const CPacked_seqint::Tdata::const_iterator it)
394  {
395  return rel_at(seqints, it, 1, CRef<CSeq_interval>(NULL));
396  }
397 
398 
400  const CPacked_seqint::Tdata& seqints,
401  const CPacked_seqint::Tdata::const_iterator it)
402  {
403  return rel_at(seqints, it, -1, CRef<CSeq_interval>(NULL));
404  }
405 
406 
407  // Will subsume micro-intervals smaller than this into upstream
408  // neighboring intervals; This number can't be smaller than 4,
409  // because an interval must be able to survive truncation by
410  // 3 bases when changing overlaps to gaps
411  static const TSeqPos k_min_len = 4;
412 
413 
414  // Will not subsume intervals of this length or longer.
415  // Otherwise can subsume if this creates a nonframeshifting gap
416  static const TSeqPos k_keep_len = 6;
417 
418 
420  const CSeq_interval& upst,
421  const CSeq_interval& downst)
422  {
425 
426  return d * (MinusStrand(upst) ? -1 : 1);
427  }
428 
429 
431  const CSeq_interval& a,
432  const CSeq_interval& b)
433  {
435  c->SetFrom( min(a.GetFrom(), b.GetFrom()));
436  c->SetTo( max(a.GetTo(), b.GetTo()));
437  return c;
438  }
439 
440 
441  static bool MinusStrand(const CSeq_interval& seqint)
442  {
443  return seqint.IsSetStrand()
444  && seqint.GetStrand() == eNa_strand_minus;
445  }
446 
447 
448  static bool SameIdAndStrand(
449  const CSeq_interval& a,
450  const CSeq_interval& b)
451  {
452  return a.GetId().Equals(b.GetId())
453  && MinusStrand(a) == MinusStrand(b);
454  }
455 
456 
457  // return true iff subsuming curr into prev will result
458  // in a non-frameshifting indel between prev and next.
460  const CSeq_interval& prev,
461  const CSeq_interval& curr,
462  const CSeq_interval& next)
463  {
465  - prev.GetLength()
466  - curr.GetLength();
467  return (g % 3) == 0;
468  }
469 
470  // can be negative in case of overlap
472  const CSeq_interval& prev,
473  const CSeq_interval& curr)
474  {
475  return GetBiostartsDelta(prev, curr)
476  - prev.GetLength();
477  }
478 
479 
480  static void SetBioStop(CSeq_interval& seqint, TSeqPos pos)
481  {
482  (MinusStrand(seqint) ? seqint.SetFrom(pos)
483  : seqint.SetTo(pos));
484  }
485 
486 
487  static void SetBioStart(CSeq_interval& seqint, TSeqPos pos)
488  {
489  (MinusStrand(seqint) ? seqint.SetTo(pos)
490  : seqint.SetFrom(pos));
491  }
492 
493 
494  // amount < 0 -> truncate 3'end upstream; else extend downstream
495  static void AdjustBioStop(
496  CSeq_interval& seqint,
497  TSignedSeqPos amt)
498  {
499  amt *= MinusStrand(seqint) ? -1 : 1;
500  amt += seqint.GetStop(eExtreme_Biological);
501  SetBioStop(seqint, amt);
502  }
503 
504 
505  // amount < 0 -> extend 5'end upstream; else truncate downstream
506  static void AdjustBioStart(
507  CSeq_interval& seqint,
508  TSignedSeqPos amt)
509  {
510  amt *= MinusStrand(seqint) ? -1 : 1;
511  amt += seqint.GetStart(eExtreme_Biological);
512  SetBioStart(seqint, amt);
513  }
514 
515 
516  static void CheckIdAndStrand(const CPacked_seqint& ps)
517  {
518  ITERATE(CPacked_seqint::Tdata, it, ps.Get()) {
519  CRef<CSeq_interval> prev = Prev(ps.Get(), it);
520  if(prev && !SameIdAndStrand(*prev, **it)) {
521  NcbiCerr << MSerial_AsnText << ps;
523  eUnknown,
524  "Expected same seq-id and strand");
525  }
526  }
527  }
528 
529 
530  // maximally close gaps and truncate overlaps in multiples of three.
532  {
534  CRef<CSeq_interval> prev = Prev(ps.Set(), it);
535 
536  if(!prev) {
537  continue;
538  }
539 
540  const TSignedSeqPos indel = GetGapLength(*prev, **it);
541  AdjustBioStop(*prev, indel / 3 * 3);
542  }
543  }
544 
545 
546  // Subsume micro-intervals into upstream predecessors, as long as
547  // it does not affect the terminals.
549  {
550  CPacked_seqint::Tdata& seqints = ps.Set();
551  CPacked_seqint::Tdata::iterator dest = seqints.begin();
552 
554  CRef<CSeq_interval> current = *it;
555  CRef<CSeq_interval> next = Next(seqints, it);
556 
557  if(it == dest) {
558  continue; // no predecessor
559  }
560 
561  const bool can_subsume =
562  !next ? false
563  : current->GetLength() < k_min_len ? true
564  : current->GetLength() >= k_keep_len ? false
566  **dest, *current, *next) ? true
567  : false;
568 
569  // Note: if we don't have next, that means that this
570  // interval is last. We are not allowed to subsume it
571  // because dropping it and adjusting the upstream
572  // interval's length to compensate will affect the
573  // packed_seqint's biostop. GP-15942
574 
575  if(can_subsume) {
576  // drop current and extend prev to compensate
577  AdjustBioStop(**dest, current->GetLength());
578  } else {
579  // keep current
580  ++dest;
581  *dest = current;
582  }
583  }
584 
585  seqints.erase(seqints.end() == dest ? dest : ++dest,
586  seqints.end());
587  }
588 
589 
590  // Precondition: overlaps are at most 2bp
592  {
594  CRef<CSeq_interval> current = *it;
595  CRef<CSeq_interval> prev = Prev(ps.Set(), it);
596 
597  if(!prev) {
598  continue;
599  }
600 
601  const TSignedSeqPos overlap = -1 * GetGapLength(*prev, *current);
602 
603  if(overlap <= 0) {
604  continue; //gap
605  } else if(overlap > 2) {
606  NcbiCerr << MSerial_AsnText << ps;
607  NCBI_THROW(CException, eUnknown, "Unexpected overlap");
608  } else if(prev->GetLength() > 3) {
609  AdjustBioStop(*prev, -3);
610  } else if(current->GetLength() > 3) {
611  AdjustBioStart(*current, 3);
612  }
613  }
614  }
615 
616 
618  {
619  CPacked_seqint::Tdata& seqints = ps.Set();
620  CPacked_seqint::Tdata::iterator dest = seqints.begin();
621 
623  if(it == dest) {
624  continue;
625  }
626 
627  if(GetGapLength(**dest, **it) % 3 == 0) {
628  *dest = Collapse(**dest, **it);
629  } else {
630  ++dest;
631  *dest = *it;
632  }
633  }
634 
635  seqints.erase(seqints.end() == dest ? dest : ++dest,
636  seqints.end());
637  }
638 
639 public:
640  // It's difficult to make sense of a printed seq-loc ASN
641  // or label the way it is done in the toolkit, especially
642  // when the coordinates are large.
643  //
644  // This prints intervals as comma-delimited sequence of tuples
645  // "header,length" in bio-order, where header is either gap
646  // length to previous interval iff seq-id and strands are the same,
647  // and otherwise the header is seq-id@signed-biostart.
648  //
649  // E.g. NC_000001.11@-12304058:100,+1,150,-2,300
650  // ^strand
651  // ^biostart
652  // ^ ^ ^ lengths (unsigned)
653  // ^gap ^overlap (signed)
654  static string AsString(const CPacked_seqint& packed_seqint)
655  {
656  if(packed_seqint.Get().empty()) {
657  return "Empty-Packed-seqint";
658  }
659 
660  CNcbiOstrstream ostr;
661  CConstRef<CSeq_interval> prev_seqint;
662  ITERATE(CPacked_seqint::Tdata, it, packed_seqint.Get()) {
663  CConstRef<CSeq_interval> seqint = *it;
664 
665  if(prev_seqint && SameIdAndStrand(*prev_seqint, *seqint)) {
666  TSignedSeqPos d =
667  (seqint->GetStart(eExtreme_Biological))
668  - ( prev_seqint->GetStop(eExtreme_Biological)
669  + (MinusStrand(*seqint) ? -1 : 1));
670 
671  d *= MinusStrand(*seqint) ? -1 : 1;
672 
673  ostr << ","
674  << (d < 0 ? "-" : "+") << abs(d)
675  << "," << seqint->GetLength();
676  } else {
677  ostr << (!prev_seqint ? "" : ",")
678  << seqint->GetId().GetSeqIdString()
679  << "@" << (MinusStrand(*seqint) ? "-" : "+")
680  << seqint->GetStart(eExtreme_Biological) + 1
681  << ":" << seqint->GetLength();
682  }
683 
684  prev_seqint = seqint;
685  }
686 
687  return CNcbiOstrstreamToString(ostr);
688  }
689 
690 
691  static void Validate(const CSeq_loc& orig_loc, const CSeq_loc& final_loc)
692  {
695  {
696  NCBI_USER_THROW("Change in positional-starts");
697  }
698 
699 
701  != sequence::GetStop(final_loc, NULL, eExtreme_Positional))
702  {
703  NCBI_USER_THROW("Change in positional-stops");
704  }
705 
706 
707  if( sequence::GetLength(final_loc, NULL) % 3
708  != sequence::GetLength(orig_loc, NULL) % 3)
709  {
711  "Logic error - frame not preserved");
712  }
713 
714  string problem_str;
715  CConstRef<CSeq_interval> prev_seqint(NULL);
716  ITERATE(CPacked_seqint::Tdata, it, final_loc.GetPacked_int().Get()) {
717 
718  const CSeq_interval& seqint = **it;
719 
720  if(seqint.GetFrom() > seqint.GetTo()) {
721  problem_str += "invalid seqint";
722  }
723 
724  const TSignedSeqPos d =
725  !prev_seqint ? 1 : GetGapLength(*prev_seqint, seqint);
726 
727  if(d != 1 && d != 2) {
728  //problem_str += "Gap length is not 1 or 2";
729 
730  // this can happen if we couldn't subsume a terminal micro-interval
731  // and it was too short to be truncatable, so it was left as-is.
732  }
733 
734  if(!problem_str.empty()) {
735  NcbiCerr << "orig_loc: "
736  << AsString(orig_loc.GetPacked_int())
737  << "\nfinal_loc: "
738  << AsString(final_loc.GetPacked_int())
739  << "\ndownstream-int: "
740  << MSerial_AsnText << seqint;
741 
742  if(prev_seqint) {
743  NcbiCerr << "upstream-int: "
744  << MSerial_AsnText << *prev_seqint;
745  }
746  NCBI_THROW(CException, eUnknown, problem_str);
747  }
748 
749  prev_seqint = *it;
750  }
751  }
752 };
753 
754 
755 
756 
757 // Project exon to genomic coordinates, preserving discontinuities.
759  const CSpliced_exon& spliced_exon,
760  const CSeq_id& aln_genomic_id, //of the parent alignment (if not specified in spliced_exon)
761  ENa_strand aln_genomic_strand, //of the parent alignment (if not specified in spliced_exon)
762  bool convert_overlaps)
763 {
765 
766  const CSeq_id& genomic_id =
767  spliced_exon.IsSetGenomic_id() ? spliced_exon.GetGenomic_id()
768  : aln_genomic_id;
769  const ENa_strand genomic_strand =
770  spliced_exon.IsSetGenomic_strand() ? spliced_exon.GetGenomic_strand()
771  : aln_genomic_strand;
772 
773  //Don't have exon details - create based on exon boundaries and return.
774  if(!spliced_exon.IsSetParts()) {
775  exon_loc->SetInt().SetId().Assign( genomic_id);
776  exon_loc->SetInt().SetStrand( genomic_strand);
777  exon_loc->SetInt().SetFrom( spliced_exon.GetGenomic_start());
778  exon_loc->SetInt().SetTo( spliced_exon.GetGenomic_end());
779  return exon_loc;
780  }
781 
782  typedef vector<pair<int, int> > TExonStructure;
783  // Each element is an exon chunk comprised of alignment diag
784  // (match or mismatch run) and abutting downstream gaps.
785  // ::first is diag+query_gap ,
786  // corresponding to the transcribed chunk length.
787  //
788  // ::second is diag+subject_gap,
789  // corresponding to distance to the start of the next chunk.
790 
791  TExonStructure exon_structure;
792  bool last_is_diag = false;
793  ITERATE(CSpliced_exon::TParts, it, spliced_exon.GetParts()) {
794  const CSpliced_exon_chunk& chunk = **it;
795  int len = chunk.IsMatch() ? chunk.GetMatch()
796  : chunk.IsMismatch() ? chunk.GetMismatch()
797  : chunk.IsDiag() ? chunk.GetDiag()
798  : chunk.IsGenomic_ins() ? chunk.GetGenomic_ins()
799  : chunk.IsProduct_ins() ? chunk.GetProduct_ins()
800  : 0;
801  bool is_diag = chunk.IsMatch() || chunk.IsMismatch() || chunk.IsDiag();
802 
803  if(is_diag && last_is_diag) { //alternating match/mismatch runs go into the same chunk
804  exon_structure.back().first += len;
805  exon_structure.back().second += len;
806  } else if(is_diag) {
807  exon_structure.push_back(TExonStructure::value_type(len, len));
808  } else {
809  if(exon_structure.empty()) {
810  exon_structure.push_back(TExonStructure::value_type(0, 0));
811  }
812  (chunk.IsProduct_ins() ? exon_structure.back().first
813  : exon_structure.back().second) += len;
814  }
815  last_is_diag = is_diag;
816  }
817 
818  // make the subject values cumulative
819  // (i.e. relative to the exon boundary,
820  // rather than neighboring chunk)
821  // After this, the biological start of
822  // a chunk relative to the exon-start
823  // is ::second of the previous element
824  NON_CONST_ITERATE(TExonStructure, it, exon_structure) {
825  if(it != exon_structure.begin()) {
826  it->second += (it-1)->second;
827  }
828  }
829 
830  int genomic_sign = genomic_strand == eNa_strand_minus ? -1 : 1;
831  TSeqPos exon_bio_start_pos =
832  genomic_sign > 0 ? spliced_exon.GetGenomic_start()
833  : spliced_exon.GetGenomic_end();
834  exon_loc->SetPacked_int();
835  ITERATE(TExonStructure, it, exon_structure) {
836  int chunk_length = it->first;
837  int chunk_offset = it == exon_structure.begin() ? 0 : (it-1)->second;
838 
839  if(chunk_length == 0) {
840  // can happen if we have a gap-only chunk
841  // (e.g. arising from truncating alignment to CDS)
842  continue;
843  }
844 
845  TSeqPos bio_start = exon_bio_start_pos + (chunk_offset * genomic_sign);
846  TSeqPos bio_stop = bio_start + (chunk_length - 1) * genomic_sign;
847  // -1 because stop is inclusive
848 
850  chunk->SetId().Assign(genomic_id);
851  chunk->SetStrand(genomic_strand);
852  chunk->SetFrom(genomic_sign > 0 ? bio_start : bio_stop);
853  chunk->SetTo(genomic_sign > 0 ? bio_stop : bio_start);
854  exon_loc->SetPacked_int().Set().push_back(chunk);
855  }
856 
857  exon_loc = NTweakExon::TweakExon(*exon_loc, convert_overlaps);
858  return exon_loc;
859 }
860 
861 #endif
862 
863 
864 // Each block represents a diag followed by downstream gap(s).
865 // Gap-only or diag-only blocks are possible.
866 //
867 // The block is frameshifting iff the difference in
868 // projection-on-query vs projection-on-subject is not
869 // a multiple of 3, or, equivalently, the projections
870 // are not congruent mod-3.
871 //
872 // (diag + q_ins) % 3 != (diag + s_ins ) % 3
873 // q_ins % 3 != s_ins % 3;
/// One alignment block: a diag (run of matches and/or mismatches)
/// followed by the gap(s) immediately downstream of it.
struct SBlock
{
    int diag;   // matches and/or mismatches

    int q_ins;  // gap(s) following the diag,
    int s_ins;  // on query and subject respectively.

    /// Construct an empty block (no diag, no gaps).
    SBlock() :
        diag(0),
        q_ins(0),
        s_ins(0)
    {}

    /// True iff the block carries a gap on the query or on the subject.
    bool HasGap() const
    {
        return q_ins > 0 || s_ins > 0;
    }

    /// True iff the query and subject gap lengths are not
    /// congruent mod-3.
    bool IsFrameshifting() const
    {
        return (q_ins - s_ins) % 3 != 0;
    }

    /// Accumulate the other block's diag and gap lengths into this one.
    void Add(const SBlock& other)
    {
        diag  += other.diag;
        q_ins += other.q_ins;
        s_ins += other.s_ins;
    }

    /// Return length of block's projection on
    /// subject minimally truncated as to be
    /// congruent mod-3 with projection on query.
    ///
    /// Note: in very rare cases where we need
    /// to truncate by more bases than we have
    /// available, we'll extend by 1 or 2 bases
    /// instead as to preserve frame.
    int GetFramePreservingSubjLen() const
    {
        const int q_len = diag + q_ins;
        int s_len = diag + s_ins;

        // Now truncate s_len by 0, 1, or 2 bases
        // until the length is congruent mod-3 with q_len.
        //
        // Note: s_len can become 0 or negative, e.g.
        // q_len = 2, and s_len = 1, (arising
        // from a degenerate case where we have an
        // exon consisting of a single-base diag and
        // query-ins of 1)

        while( (q_len - s_len) % 3 ) {
            s_len--;
        }

        // Adding 3 keeps the congruence while making the
        // length non-negative.
        while(s_len < 0) {
            s_len += 3;
        }

        return s_len;
    }
};
937 
938 
939 struct SBlocks : public vector<SBlock>
940 {
941 public:
942 
944  {
945  SBlocks& blocks = *this;
946  blocks.clear();
947 
948  bool last_is_diag = false;
949  ITERATE(CSpliced_exon::TParts, it, spliced_exon.GetParts()) {
950  const CSpliced_exon_chunk& chunk = **it;
951 
952  const int len =
953  chunk.IsMatch() ? chunk.GetMatch()
954  : chunk.IsMismatch() ? chunk.GetMismatch()
955  : chunk.IsDiag() ? chunk.GetDiag()
956  : chunk.IsGenomic_ins() ? chunk.GetGenomic_ins()
957  : chunk.IsProduct_ins() ? chunk.GetProduct_ins()
958  : 0;
959 
960  bool is_diag = chunk.IsMatch()
961  || chunk.IsMismatch()
962  || chunk.IsDiag();
963 
964  const bool start_new_block =
965  blocks.empty() ? true // need non-empty blocks
966  : len == 0 ? false // will be a no-op
967  : !is_diag ? false // gaps go into current block
968  : last_is_diag ? false // abutting diags get merged
969  : true; // new non-abutting diag
970 
971  blocks.resize(blocks.size() + start_new_block);
972 
973  int& block_len =
974  chunk.IsProduct_ins() ? blocks.back().q_ins
975  : chunk.IsGenomic_ins() ? blocks.back().s_ins
976  : blocks.back().diag;
977 
978  block_len += len;
979 
980  last_is_diag = is_diag;
981  }
982 
983  // If spliced-exon has terminal subject-ins, then exon
984  // projected with details will have missing bases at
985  // the terminal. We need to modify the alignment
986  // (i.e. the blocks-based representations thereof)
987  // to insert anchor diags at the beginning and/or
988  // end of the alignment, and truncate the alignment
989  // inward accordingly, such that exon terminals are
990  // exactly preserved despite terminal gaps.
991  //
992  // Note: this wouldn't be a problem if seq-locs with
993  // 0-base intervals were allowed, so we would have a
994  // 0-base interval in the beginning of packed-int,
995  // followed by int-loc separated by a gap, thus capturing
996  // the original terminal gap while preserving extremes.
997 
998  while( blocks.size() > 1
999  && blocks.front().diag < 3)
1000  {
1001  // The first block may be followed by a gap, and so
1002  // may need to be truncated by 1 or 2 bases to make
1003  // it frame-preserving, so we need it to be at least 3.
1004  if(blocks[1].diag > 0) {
1005  blocks[0].diag++;
1006  blocks[1].diag--;
1007  } else {
1008  // next block ran out
1009  blocks[1].Add(blocks[0]);
1010  blocks.erase(blocks.begin());
1011  }
1012  }
1013 
1014  if( !blocks.empty()
1015  && blocks.back().diag > 0
1016  && blocks.back().HasGap())
1017  {
1018  // Handling the last block is similar, except
1019  // since the artificial diag will not have
1020  // gaps following it, we know that it will never
1021  // need truncation, and so we only need to borrow
1022  // one base from the alignment.
1023  blocks.back().diag--;
1024  blocks.push_back(SBlock());
1025  blocks.back().diag = 1;
1026  }
1027 
1028  // Fold abutting blocks if either of:
1029  // * one of them does not have a diag
1030  // (essentially, absorbing the neighbor's gaps),
1031  //
1032  // * gap between them (i.e. that of upstream's block)
1033  // is non-frameshifting.
1034  //
1035  // * the downstream block's diag is 3 bases or less, except
1036  // for the case of final diag that must be preserved.
1037  SBlocks::iterator dest = this->begin();
1038  ITERATE(SBlocks, it, *this) {
1039  if(it == dest) {
1040  ;
1041  } else if( !it->diag
1042  || !dest->diag
1043  || !dest->IsFrameshifting()
1044  || ( it->diag <= 3
1045  && &(*it) != &back()
1046  && &(*it) != &front() ) )
1047  {
1048  dest->Add(*it);
1049  } else {
1050  ++dest;
1051  *dest = *it;
1052  }
1053  }
1054  EraseAfter(dest);
1055  }
1056 
1057 private:
1058  void EraseAfter(SBlocks::iterator it)
1059  {
1060  if(it != this->end()) {
1061  ++it;
1062  }
1063  this->erase(it, this->end());
1064  }
1065 };
1066 
1067 
1068 // Project exon to genomic coordinates, preserving discontinuities.
1069 //
1070 // aln_genomic* params are only used if not specified within exon
1072  const CSpliced_exon& exon,
1073  const CSeq_id& aln_genomic_id,
1074  ENa_strand aln_genomic_strand)
1075 {
1077 
1078  const CSeq_id& genomic_id =
1079  exon.IsSetGenomic_id() ? exon.GetGenomic_id()
1080  : aln_genomic_id;
1081 
1082  const ENa_strand genomic_strand =
1083  exon.IsSetGenomic_strand() ? exon.GetGenomic_strand()
1084  : aln_genomic_strand;
1085 
1086  //Don't have exon details - create based on exon boundaries and return.
1087  if(!exon.IsSetParts()) {
1088  exon_loc->SetInt().SetId().Assign( genomic_id);
1089  exon_loc->SetInt().SetStrand( genomic_strand);
1090  exon_loc->SetInt().SetFrom( exon.GetGenomic_start());
1091  exon_loc->SetInt().SetTo( exon.GetGenomic_end());
1092  return exon_loc;
1093  } else {
1094  exon_loc->SetPacked_int();
1095  }
1096 
1097  const int genomic_sign = (genomic_strand == eNa_strand_minus) ? -1 : 1;
1098 
1099  const TSeqPos exon_bio_start_pos =
1100  genomic_sign > 0 ? exon.GetGenomic_start()
1101  : exon.GetGenomic_end();
1102 
1103  // chunk's start in exon-local coords
1104  size_t exon_relative_subj_start = 0;
1105 
1106  const SBlocks blocks(exon);
1107  ITERATE(SBlocks, it, blocks) {
1108 
1109  const TSeqPos bio_start = exon_bio_start_pos
1110  + (exon_relative_subj_start * genomic_sign);
1111 
1112  const size_t len = it->GetFramePreservingSubjLen();
1113 
1114  const TSeqPos bio_stop = bio_start + ((len - 1) * genomic_sign);
1115  // -1 because stop is inclusive
1116 
1118  chunk->SetId().Assign(genomic_id);
1119  chunk->SetStrand( genomic_strand);
1120  chunk->SetFrom( genomic_sign > 0 ? bio_start : bio_stop);
1121  chunk->SetTo( genomic_sign > 0 ? bio_stop : bio_start);
1122 
1123  if(len > 0) {
1124  exon_loc->SetPacked_int().Set().push_back(chunk);
1125  }
1126 
1127  exon_relative_subj_start += it->diag + it->s_ins;
1128  }
1129 
1132 
1133  try {
1134  // Validate
1135  const size_t exon_product_len =
1136  exon.GetProduct_end().GetNucpos()
1137  - exon.GetProduct_start().GetNucpos() + 1;
1138 
1139  const size_t loc_len = sequence::GetLength(*exon_loc, NULL);
1140 
1141  if(loc_len % 3 != exon_product_len % 3) {
1143  "Logic error - frame not preserved");
1144  }
1145 
1146  if( exon.GetGenomic_start()
1147  != exon_loc->GetStart(eExtreme_Positional))
1148  {
1149  NCBI_USER_THROW("Change in positional-starts");
1150  }
1151 
1152  if(exon.GetGenomic_end()
1153  != exon_loc->GetStop(eExtreme_Positional))
1154  {
1155  NCBI_USER_THROW("Change in positional-stops");
1156  }
1157 
1158  } catch (CException& e) {
1159  NcbiCerr << MSerial_AsnText << exon;
1160  NcbiCerr << MSerial_AsnText << *exon_loc;
1161  NCBI_RETHROW_SAME(e, "Invalid result");
1162  }
1163 
1164 
1165  return exon_loc;
1166 }
1167 
1168 // GP-17626
1170  const CSpliced_exon& exon,
1171  const CSeq_id& aln_genomic_id,
1172  ENa_strand aln_genomic_strand,
1173  bool convert_overlaps)
1174 {
1175  CRef<CSeq_loc> exon_loc;
1176 
1177  try {
1178  exon_loc = ProjectExon_oldlogic(exon, aln_genomic_id, aln_genomic_strand, convert_overlaps);
1179  } catch(CException&) {
1180  exon_loc = ProjectExon_newlogic(exon, aln_genomic_id, aln_genomic_strand);
1181 
1182  ERR_POST(Warning << " Project-exon logic failed. Using new logic: "
1183  << NTweakExon::AsString(exon_loc->GetPacked_int()) << "\n");
1184  }
1185 
1186  return exon_loc;
1187 }
1188 
1189 
1190 /// Create an exon with the structure consisting of
1191 /// two diags extending inwards from the exon terminals
1192 /// with a single gap of required length in the middle.
1193 /// This is used for projecting cds-exons consisting
1194 /// entirely of gaps (see ProjectExons)
1196 {
1197  CRef<CSpliced_exon> exon(SerialClone(orig_exon));
1198  TSeqPos query_range = exon->GetProduct_end().GetNucpos()
1199  - exon->GetProduct_start().GetNucpos() + 1;
1200  TSeqPos subject_range = exon->GetGenomic_end() - exon->GetGenomic_start() + 1;
1201  TSeqPos min_range = min(query_range, subject_range);
1202  TSeqPos max_range = max(query_range, subject_range);
1203 
1207  diag1->SetDiag(min_range / 2);
1208  diag2->SetDiag(min_range - diag1->GetDiag());
1209  if(max_range == subject_range) {
1210  gap_chunk->SetGenomic_ins(max_range - min_range);
1211  } else {
1212  gap_chunk->SetProduct_ins(max_range - min_range);
1213  }
1214  exon->SetParts().clear();
1215  exon->SetParts().push_back(diag1);
1216  exon->SetParts().push_back(gap_chunk);
1217  exon->SetParts().push_back(diag2);
1218  return exon;
1219 }
1220 
1221 #if 0
1222 /*
1223 This logic is now disabled, and CDS-exons are computed based on
1224 genomic-cds overlap with projected mRNA feature.
1225 see: TruncateToCDS(...)
1226 
1227 Caveat-1 is no longer relevant because overlaps are converted to
1228 gaps in TweakExon.
1229 
1230 Caveat-2 is dealt with by projecting and collapsing each product-cds-range
1231 individually, and then intersecting rna-loc with each range.
1232 see: ProjectAndCollapseCDS(...)
1233 
1234 Caveat-3 is not applicable in the new logic.
1235 
1236 Caveat-4 is no longer relevant because the rules were restricted so
1237 that we're no longer obligated to preserve the product length during
1238 mapping, only the frame. According to the current rule it would be
1239 unexpected for the projected product or cds loc to extend past the
1240 terminals of "naively-mapped" exons.
1241 */
1242 
1243 
1244 // Creating exon-loc for CDS:
1245 //
1246 // Caveat-1:
1247 // A naive way to project CDS would be to take the genomic cds-range
1248 // and intersect with projected RNA.
1249 // Such approach is clear, but will not work when the genomic cds boundary
1250 // is in the overlap of the exon chunks (in product-ins).
1251 // Instead, we'll truncate the original spliced-seg down to product-CDS
1252 // and will generate exons-loc the same way as for RNA.
1253 //
1254 // Caveat-2: a CDS on product can itself have discontinuities
1255 // (e.g. ribosomal slippage), and simply projecting truncated-to-cds alignment
1256 // will not capture these. Instead, we'll take each product-cds chunk,
1257 // truncate alignment to that, project, and combine the results.
1258 // During the combination of results we'll have to combine sublocs pertaining
1259 // to same exon in the same subloc of the container mix.
1260 //
1261 // Caveat-3: currently Seq-loc-Mapper has a bug, such that truncating
1262 // an alignment to CDS yields wrong result for multi-exon cases (CXX-3724),
1263 // so to work-around that we'll create a single-exon alignment for each exon.
1264 // Additionally, doing it exon-by-exon makes it easy to
1265 // combine projected result for each exon (from multiple cds sublocs).
1266 //
1267 // Caveat-4: Remapping may produce gap-only exons, which would ordinarily yield
1268 // no genomic projection counterpart, but in the context of
1269 // discontinuity-preservation we'll need to calculate genomic projection "manually".
1270 CRef<CSeq_loc> ProjectCDSExon(
1271  const CSeq_align& spliced_aln,
1272  const CSpliced_exon& spliced_exon,
1273  const CSeq_loc& product_cds_loc,
1274  bool convert_overlaps)
1275 {
1276  CRef<CSeq_align> exon_aln(SerialClone(spliced_aln));
1277  exon_aln->ResetScore();
1278  exon_aln->ResetExt();
1279 
1280  //Create alignment to represent only the current exon
1281  exon_aln->SetSegs().SetSpliced().SetExons().clear();
1282  exon_aln->SetSegs().SetSpliced().SetExons().push_back(
1284 
1285  CRef<CSeq_loc> query_exon_loc = exon_aln->CreateRowSeq_loc(0);
1286 
1288 
1289  for(CSeq_loc_CI ci(product_cds_loc,
1291  CSeq_loc_CI::eOrder_Biological); ci; ++ci)
1292  {
1293  CConstRef<CSeq_loc> cds_subloc = ci.GetRangeAsSeq_loc();
1294 
1296  *query_exon_loc,
1297  *cds_subloc,
1298  NULL,
1300  {
1301  // exon does not overlap the CDS interval
1302  // (i.e. UTR-only, or, in rare case of translational-frameshifts,
1303  // not specific to this cds-chunk)
1304  continue;
1305  }
1306 
1307 
1308  // truncate the exon-alignment to the query-cds-subloc
1309  CRef<CSeq_loc_Mapper> mapper(
1310  new CSeq_loc_Mapper(*cds_subloc, *cds_subloc, NULL));
1311  mapper->SetTrimSplicedSeg(false);
1312 
1313  CRef<CSeq_align> truncated_exon_aln;
1314  try {
1315  truncated_exon_aln = mapper->Map(*exon_aln);
1316  } catch (CAnnotMapperException& e) {
1317  // It used to be the case that the mapper would return an empty alignment,
1318  // but in GP-11467 it was discovered that it can also throw
1319  // "Mapping resulted in an empty alignment, can not initialize Seq-align."
1321  truncated_exon_aln.Reset(new CSeq_align);
1322  truncated_exon_aln->Assign(*exon_aln);
1323  truncated_exon_aln->SetSegs().SetSpliced().SetExons().clear();
1324  } else {
1325  NcbiCerr << MSerial_AsnText << *cds_subloc;
1326  NcbiCerr << MSerial_AsnText << *exon_aln;
1327  NCBI_RETHROW_SAME(e, "Can't truncate alignment to CDS");
1328  }
1329  }
1330 
1331 #if 0
1332  NcbiCerr << MSerial_AsnText << *cds_subloc;
1333  NcbiCerr << MSerial_AsnText << *exon_aln;
1334  NcbiCerr << MSerial_AsnText << *truncated_exon_aln;
1335  NcbiCerr << "\n";
1336 #endif
1337 
1338  if(truncated_exon_aln->GetSegs().GetSpliced()
1339  .GetExons().empty())
1340  {
1341  // NcbiCerr << "gap-only cds-exon: "
1342  // << MSerial_AsnText <<spliced_aln;
1343  //
1344  // This is a rare case where the exon overlaps the CDS,
1345  // but truncating the alignment to the CDS
1346  // produced empty alignment - how can this happen?
1347  // This is the case where an exon has a product-ins
1348  // abutting the exon terminal, and the CDS part does
1349  // not extend past the gap, such that the result of
1350  // truncation is a gap-only alignment.
1351  // To deal with this we'll take a chunk of required length
1352  // starting at genomic exon boundary (i.e. as if the exon
1353  // structure abutted a diag rather than a gap).
1354  //
1355  // We'll do this by ignoring the exon structure, and
1356  // instead create a dummy exon consisting of two diags
1357  // extending from the exon terminals with a gap of
1358  // necessary length in the middle.
1359  //
1360  // Note: The result is the same as if the seq-loc-mapper
1361  // preserved the gap-only alignment instead of
1362  // throwing away the exon, which would result in
1363  // |product-ins| nucleotides being translated from the
1364  // genomic exon boundary.
1365  CRef<CSpliced_exon> collapsed_exon =
1367  *exon_aln->SetSegs().SetSpliced().SetExons().front());
1368 
1369  exon_aln->SetSegs().SetSpliced().SetExons().front() = collapsed_exon;
1370  truncated_exon_aln = mapper->Map(*exon_aln);
1371 
1372  if(truncated_exon_aln->GetSegs().GetSpliced()
1373  .GetExons().empty())
1374  {
1375  continue;
1376  //theoretically this shouldn't happen,
1377  //but we can't proceed otherwise
1378  }
1379  }
1380 
1381  CRef<CSeq_loc> exon_subloc = ProjectExon(
1382  *truncated_exon_aln->GetSegs().GetSpliced().GetExons().front(),
1383  spliced_aln.GetSeq_id(1),
1384  spliced_aln.GetSeqStrand(1));
1385 
1386 
1387 #if 0
1388  // GP-15635
1389  // This is wrong, because this will add partialness to every cds-exon.
1390  // Instead, the caller will make sure that the partialness is
1391  // properly inherited from cds-loc for the aggregate exons-loc
1392  // (see GetTerminalPartialness(...)
1394  *exon_subloc,
1396  cds_subloc->IsPartialStart(eExtreme_Biological),
1397  cds_subloc->IsPartialStop(eExtreme_Biological)));
1398 #endif
1399 
1400  exon_loc->SetPacked_int().Set().insert(
1401  exon_loc->SetPacked_int().Set().end(),
1402  exon_subloc->SetPacked_int().Set().begin(),
1403  exon_subloc->SetPacked_int().Set().end());
1404 
1405  }
1406  return exon_loc;
1407 }
1408 #endif
1409 
1410 
1411 // for each product-cds-subloc:
1412 // project to genome coords and collapse to single-range.
1413 // Output: translational-frameshift-preserving-collapsed-CDS
1415  const CSeq_align& spliced_aln,
1416  CConstRef<CSeq_loc> product_cds_loc)
1417 {
1418  if(!product_cds_loc) {
1419  return CRef<CPacked_seqint>(NULL);
1420  }
1421 
1423 
1424  CRef<CSeq_loc_Mapper> mapper(
1425  new CSeq_loc_Mapper(spliced_aln, 1, NULL));
1426 
1427  for(CSeq_loc_CI ci(*product_cds_loc,
1429  CSeq_loc_CI::eOrder_Biological); ci; ++ci)
1430  {
1431  CConstRef<CSeq_loc> cds_subloc = ci.GetRangeAsSeq_loc();
1432  CConstRef<CSeq_loc> mapped_cds_subloc = mapper->Map(*cds_subloc);
1433  CRef<CSeq_loc> mapped_collapsed_cds_subloc =
1435  *mapped_cds_subloc,
1437  NULL);
1438 
1439  if(mapped_collapsed_cds_subloc->IsNull()) {
1440  ;
1441  } else if(mapped_collapsed_cds_subloc->IsInt()) {
1442  CRef<CSeq_interval> seqint(
1443  &mapped_collapsed_cds_subloc->SetInt());
1444  out->Set().push_back(seqint);
1445  } else {
1446  NCBI_USER_THROW("Expected seqint or null-loc");
1447  }
1448  }
1449  return out;
1450 }
1451 
1453  const CSeq_loc& src,
1454  CSeq_loc& dest)
1455 {
1456  if( !src.GetId()
1457  || !dest.GetId()
1458  || !src.GetId()->Equals(*dest.GetId())
1459  || src.GetStrand() != dest.GetStrand() )
1460  {
1461  return;
1462  }
1463 
1465  const bool same_start = src.GetStart(ext) == dest.GetStart(ext);
1466  const bool same_stop = src.GetStop(ext) == dest.GetStop(ext);
1467 
1468  if(same_start && src.IsPartialStart(ext))
1469  dest.SetPartialStart(true, ext);
1470 
1471  if(same_start && src.IsTruncatedStart(ext))
1472  dest.SetTruncatedStart(true, ext);
1473 
1474  if(same_stop && src.IsPartialStop(ext))
1475  dest.SetPartialStop(true, ext);
1476 
1477  if(same_stop && src.IsTruncatedStop(ext))
1478  dest.SetTruncatedStop(true, ext);
1479 }
1480 
1481 
1483  CRef<CSeq_loc> detailed_rna_exon, //mix-of-(int-or-packed-seqint)
1484  CConstRef<CPacked_seqint> collapsed_genomic_cds)
1485 {
1486  if(!collapsed_genomic_cds) {
1487  return detailed_rna_exon;
1488  }
1489 
1490  CRef<CSeq_loc> out_loc(new CSeq_loc(CSeq_loc::e_Mix));
1491 
1492  ITERATE(CPacked_seqint::Tdata, it, collapsed_genomic_cds->Get()) {
1493  CSeq_loc cds_subrange;
1494  cds_subrange.SetInt(const_cast<CSeq_interval&>(**it));
1495 
1496  CRef<CSeq_loc> cds_exon =
1497  detailed_rna_exon->Intersect(cds_subrange, 0, NULL);
1498 
1499  if(cds_exon->IsNull()) {
1500  continue;
1501  }
1502 
1503  TransferPartialness(cds_subrange, *cds_exon);
1504 
1505  out_loc->SetMix().Set().push_back(cds_exon);
1506  }
1507 
1508  out_loc->ChangeToPackedInt();
1509  return out_loc;
1510 }
1511 
1512 
1513 
1515  CConstRef<CSeq_loc> product_cds_loc,
1516  bool convert_overlaps,
1517  size_t unaligned_ends_partialness_thr = 0)
1518 {
1519  CRef<CSeq_loc> exons_loc(new CSeq_loc(CSeq_loc::e_Mix));
1520 
1521  CConstRef<CPacked_seqint> genomic_collapsed_cds =
1522  ProjectAndCollapseCDS(spliced_aln, product_cds_loc);
1523 
1525  spliced_aln.GetSegs().GetSpliced().GetExons())
1526  {
1527  const CSpliced_exon& spliced_exon = **it;
1528 
1529  CRef<CSeq_loc> exon_loc =
1530  TruncateToCDS(
1531  ProjectExon(
1532  spliced_exon,
1533  spliced_aln.GetSeq_id(1),
1534  spliced_aln.GetSeqStrand(1),
1535  convert_overlaps),
1536  genomic_collapsed_cds);
1537 
1538  const T53Partialness partialness =
1539  GetExonPartialness(spliced_aln, spliced_exon);
1540 
1541  if(!product_cds_loc) {
1542  AugmentPartialness(*exon_loc, partialness);
1543  } else if(exon_loc->GetId()) {
1544  // note: if no seq-id, GetStart/GetStop will throw - GP-15887
1545  //
1546  // GP-15635/case-(3,4):
1547  // Inherit partialness only if the CDS
1548  // mapped up to the exon's terminal
1549  bool start_partial = partialness.first;
1550  bool stop_partial = partialness.second;
1551 
1552  // convert to positional
1553  if(spliced_aln.GetSeqStrand(1) == eNa_strand_minus) {
1554  swap(start_partial, stop_partial);
1555  }
1556 
1557  if(start_partial
1558  && sequence::GetStart(*exon_loc, NULL)
1559  == spliced_exon.GetGenomic_start())
1560  {
1561  exon_loc->SetPartialStart(true, eExtreme_Positional);
1562  }
1563 
1564  if(stop_partial
1565  && sequence::GetStop(*exon_loc, NULL)
1566  == spliced_exon.GetGenomic_end())
1567  {
1568  exon_loc->SetPartialStop(true, eExtreme_Positional);
1569  }
1570  }
1571 
1572  exons_loc->SetMix().Set().push_back(exon_loc);
1573  }
1574 
1575  Canonicalize(*exons_loc);
1576 
1578  *exons_loc,
1580  spliced_aln,
1581  product_cds_loc,
1582  unaligned_ends_partialness_thr));
1583 
1584  return exons_loc;
1585 }
1586 
1587 
1588 /// Precondition: input loc is discontinuity-preserving RNA loc
1589 /// Postcondition: adjacent packed-ints having the discontinuity
1590 /// between them entirely outside of cds-range are merged into single interval.
1592  const CSeq_loc& loc,
1593  TSeqPos cds_start,
1594  TSeqPos cds_stop)
1595 {
1596  CRef<CSeq_loc> collapsed_loc(new CSeq_loc(CSeq_loc::e_Null));
1597  if(loc.IsMix()) {
1598  //each subloc is an exon - recurse on each.
1599  collapsed_loc->SetMix();
1600  ITERATE(CSeq_loc::TMix::Tdata, it, loc.GetMix().Get()) {
1601 
1602  CRef<CSeq_loc> collapsed_exon_loc =
1603  CollapseDiscontinuitiesInUTR(**it, cds_start, cds_stop);
1604 
1605  collapsed_loc->SetMix().Set().push_back(collapsed_exon_loc);
1606  }
1607 
1608  } else if(loc.IsPacked_int()) {
1609 
1610  //each subloc is a chunk in an exon
1611  //- will merge compatible adjacent chunks iff outside of CDS
1612  collapsed_loc->SetPacked_int();
1614  const CSeq_interval& interval = **it;
1615 
1616  if(collapsed_loc->GetPacked_int().Get().empty()) {
1617  collapsed_loc->SetPacked_int().Set().push_back(
1618  CRef<CSeq_interval>(SerialClone(interval)));
1619  continue;
1620  }
1621 
1622  CSeq_interval& last_interval =
1623  *collapsed_loc->SetPacked_int().Set().back();
1624 
1625  // We can collapse intervals iff the discontinuity
1626  // (overlap or gap) between them lies outside of the CDS.
1627  // Equivalently, the count of interval terminals overlapping CDS
1628  // being at most 1 is necessary and sufficient
1629  // (allowing for one of the intervals to be partially in the CDS)
1630 
1631  size_t count_terminals_within_cds =
1632  (last_interval.GetFrom() >= cds_start && last_interval.GetFrom() <= cds_stop ? 1 : 0)
1633  + ( interval.GetFrom() >= cds_start && interval.GetFrom() <= cds_stop ? 1 : 0)
1634  + (last_interval.GetTo() >= cds_start && last_interval.GetTo() <= cds_stop ? 1 : 0)
1635  + ( interval.GetTo() >= cds_start && interval.GetTo() <= cds_stop ? 1 : 0);
1636 
1637  if( count_terminals_within_cds <= 1
1638  && last_interval.GetStrand() == interval.GetStrand()
1639  && last_interval.GetId().Equals(interval.GetId()))
1640  {
1641  CSeq_loc loc1, loc2;
1642  loc1.SetInt(last_interval);
1643  loc2.SetInt(const_cast<CSeq_interval&>(interval));
1645  loc1, loc2, CSeq_loc::fMerge_SingleRange, NULL);
1646  last_interval.Assign(loc3->GetInt());
1647  } else {
1648 #if 0
1649  NcbiCerr << "Retaining UTR indel: "
1650  << "cds: " << cds_start << ".." << cds_stop << "; "
1651  << "terminals_in_cds: " << count_terminals_within_cds << "; "
1652  << "last: " << MSerial_AsnText << last_interval
1653  << "this: " << MSerial_AsnText << interval;
1654 #endif
1655  collapsed_loc->SetPacked_int().Set().push_back(
1656  CRef<CSeq_interval>(SerialClone(interval)));
1657  }
1658  }
1659 
1660  // even if the original was canonicalized,
1661  // we may have collapsed packed-int sublocs such
1662  // that there's only one remaining,
1663  // so need to recanonicalize
1664  Canonicalize(*collapsed_loc);
1665  } else {
1666  collapsed_loc->Assign(loc);
1667  }
1668 
1669  return collapsed_loc;
1670 }
1671 
1672 
1673 ///////////////////////////////////////////////////////////////////////////////
1674 
1676  const CSeq_align& spliced_aln,
1677  CConstRef<CSeq_loc> product_cds_loc,
1678  size_t unaligned_ends_partialness_thr)
1679 {
1680  CRef<CSeq_loc> projected_rna_loc =
1681  ProjectExons(
1682  spliced_aln,
1684  true,
1685  unaligned_ends_partialness_thr);
1686 
1687  TSeqPos cds_start(kInvalidSeqPos),
1688  cds_stop(kInvalidSeqPos);
1689 
1690  if(product_cds_loc) {
1691  CRef<CSeq_loc_Mapper> mapper(
1692  new CSeq_loc_Mapper(spliced_aln, 1, NULL));
1693  mapper->SetTrimSplicedSeg(false);
1694 
1695  CRef<CSeq_loc> genomic_cds_range = mapper->Map(*product_cds_loc);
1696 
1697  genomic_cds_range =
1699  *genomic_cds_range,
1701  NULL);
1702 
1703  cds_start = genomic_cds_range->GetStart(eExtreme_Positional);
1704  cds_stop = genomic_cds_range->GetStop(eExtreme_Positional);
1705  }
1706 
1707  // note, if there's no product-cds-loc,
1708  // this will collapse discontinuities in every exon
1710  *projected_rna_loc,
1711  cds_start,
1712  cds_stop);
1713 }
1714 
1716  const CSeq_align& spliced_aln,
1717  const CSeq_loc& product_cds_loc,
1718  bool convert_overlaps)
1719 {
1720  return ProjectExons(
1721  spliced_aln,
1722  CConstRef<CSeq_loc>(&product_cds_loc),
1723  convert_overlaps);
1724 }
1725 
1726 
1727 ///////////////////////////////////////////////////////////////////////////////
1728 
1729 #if 0
1730 
1731 todo: move to unit-test
1732 
1733 void CollapseMatches(CSeq_align& spliced_aln)
1734 {
1735  NON_CONST_ITERATE(CSpliced_seg::TExons, it, spliced_aln.SetSegs().SetSpliced().SetExons()) {
1736  CSpliced_exon& spliced_exon = **it;
1738  se->Assign(spliced_exon);
1739 
1740  spliced_exon.SetParts().clear();
1741  ITERATE(CSpliced_exon::TParts, it2, se->GetParts()) {
1742  const CSpliced_exon_chunk& chunk = **it2;
1743 
1744  int len = chunk.IsMatch() ? chunk.GetMatch()
1745  : chunk.IsMismatch() ? chunk.GetMismatch()
1746  : chunk.IsDiag() ? chunk.GetDiag()
1747  : chunk.IsGenomic_ins() ? chunk.GetGenomic_ins()
1748  : chunk.IsProduct_ins() ? chunk.GetProduct_ins()
1749  : 0;
1750 
1751  bool current_is_diag = chunk.IsMatch() || chunk.IsDiag() || chunk.IsMismatch();
1752 
1753  if(spliced_exon.GetParts().size() > 0 && spliced_exon.GetParts().back()->IsDiag() && current_is_diag) {
1754  spliced_exon.SetParts().back()->SetDiag() += len;
1755  } else {
1757  chunk2->Assign(chunk);
1758  if(current_is_diag) {
1759  chunk2->SetDiag(len);
1760  }
1761  spliced_exon.SetParts().push_back(chunk2);
1762  }
1763  }
1764  }
1765 }
1766 
1767 /*
1768  fp_cds := Create Frame-preserving cds loc.
1769  covered_cds := Intersect CDS on product with the alignment's query-loc.
1770  query_seq := Instantiate covered_cds sequence.
1771  genomic_seq := Instantiate fp_cds sequence.
1772  ASSERT: query_seq and genomic_seq are of the same length and the count of matches is at least
1773  as in original alignment truncated to query-cds
1774 */
1775 bool CFeatureGenerator::TestProjectExons(const CSeq_align& aln2)
1776 {
1777  CScope& scope = *m_impl->m_scope;
1778  CRef<CSeq_align> aln_ref(new CSeq_align);
1779  aln_ref->Assign(aln2);
1780  CollapseMatches(*aln_ref);
1781  const CSeq_align& aln = *aln_ref;
1782 
1783  CBioseq_Handle product_bsh = scope.GetBioseqHandle(aln.GetSeq_id(0));
1784  CRef<CSeq_loc> query_loc = aln.CreateRowSeq_loc(0);
1785 
1786  bool all_ok = false; //for every CDS on query (normally just one)
1787  for(CFeat_CI ci(product_bsh, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) {
1788  bool this_ok = true;
1789  const CMappedFeat& mf= *ci;
1790 
1791  //CRef<CSeq_loc> covered_cds = query_loc->Intersect(mf.GetLocation(), 0, NULL);
1792  //Note: this intersect is incorrect, at it will not represent overlaps within cds-loc
1793  //The intersect below is correct.
1794  CRef<CSeq_loc> covered_cds = mf.GetLocation().Intersect(*query_loc, 0, NULL);
1795 
1796  if(covered_cds->IsNull()) {
1797  continue;
1798  }
1799 
1800  static const size_t allowed_unaligned_ends_len = 6;
1801  CRef<CSeq_loc> rna_loc = ProjectRNA(aln, CConstRef<CSeq_loc>(&mf.GetLocation()), allowed_unaligned_ends_len);
1802  CRef<CSeq_loc> cds_loc = ProjectCDS(aln, mf.GetLocation());
1803 
1804  CSeqVector query_sv(*covered_cds, scope, CBioseq_Handle::eCoding_Iupac);
1805  CSeqVector subject_sv(*cds_loc, scope, CBioseq_Handle::eCoding_Iupac);
1806 
1807  if(query_sv.size() != subject_sv.size()) {
1808  ERR_POST(Error << "In alignment of " << aln.GetSeq_id(0).AsFastaString()
1809  << "->" << aln.GetSeq_id(1).AsFastaString() << ": "
1810  << "|query-cds truncated to aln|=" << query_sv.size()
1811  << "; |projected-cds|=" << subject_sv.size());
1812  this_ok = false;
1813  } else {
1814  //we expect the count of matches in seq-vectors to be equal or greater to the
1815  //count of matches in the alignment truncated to CDS
1816  //(accounting for matches in the alignment, plus random matches in overlaps
1817  //corresponding to product-insertions)
1818 
1819  size_t aln_cds_matches(0);
1820  {{
1822  CRef<CSeq_align> cds_aln = mapper->Map(aln2);
1823  for(CTypeConstIterator<CSpliced_exon_chunk> it(Begin(*cds_aln)); it; ++it) {
1824  const CSpliced_exon_chunk& chunk = *it;
1825  if(chunk.IsMatch()) {
1826  aln_cds_matches += chunk.GetMatch();
1827  }
1828  }
1829  }}
1830 
1831  size_t seq_cds_matches(0);
1832  for(size_t i = 0; i < query_sv.size(); i++) {
1833  seq_cds_matches += (query_sv[i] == subject_sv[i] ? 1 : 0);
1834  }
1835 
1836  if(seq_cds_matches < aln_cds_matches) {
1837  ERR_POST(Error << "In alignment of " << aln.GetSeq_id(0).AsFastaString()
1838  << "->" << aln.GetSeq_id(1).AsFastaString() << ": "
1839  << aln_cds_matches << " matches in alignment truncated to CDS, but only "
1840  << seq_cds_matches << " matches in seq-vector");
1841  this_ok = false;
1842  }
1843  }
1844 #if 0
1845  //for debugging
1846  if(!ok) {
1847  NcbiCerr << MSerial_AsnText << aln << "aln(0): " << MSerial_AsnText << *query_loc << "cds(0): " << MSerial_AsnText << mf.GetLocation() << "aln-cds(0): " << MSerial_AsnText << *covered_cds << MSerial_AsnText << *rna_loc << sequence::GetLength(*rna_loc, NULL) << "\n" << MSerial_AsnText << *cds_loc;
1848  }
1849 #endif
1850  all_ok = all_ok & this_ok;
1851  }
1852 
1853  return all_ok;
1854 }
1855 #endif
1856 
ESeqLocExtremes
Used to determine the meaning of a location's Start/Stop positions.
Definition: Na_strand.hpp:61
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CRef< CSpliced_exon > spliced_exon(const CModelExon &e, EStrand strand)
Definition: asn1.cpp:701
Seq-loc and seq-align mapper exceptions.
CBioseq_Handle –.
CFeat_CI –.
Definition: feat_ci.hpp:64
unique_ptr< SImplementation > m_impl
Definition: gene_model.hpp:232
static CRef< objects::CSeq_loc > s_ProjectCDS(const objects::CSeq_align &spliced_aln, const objects::CSeq_loc &product_cds_loc, bool convert_overlaps=true)
Similar to s_ProjectRNA(...) Postcondition: seq-vector of the returned loc is of exact same length an...
static CRef< objects::CSeq_loc > s_ProjectRNA(const objects::CSeq_align &spliced_aln, CConstRef< objects::CSeq_loc > product_cds_loc=CConstRef< objects::CSeq_loc >(NULL), size_t unaligned_ends_partialness_thr=kDefaultAllowedUnaligned)
Project RNA, preserving discontinuities in the CDS.
CMappedFeat –.
Definition: mapped_feat.hpp:59
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
CRef< CSeq_loc > CreateRowSeq_loc(TDim row) const
Definition: Seq_align.cpp:2028
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
TSeqPos GetStart(ESeqLocExtremes ext) const
TSeqPos GetLength(void) const
TSeqPos GetStop(ESeqLocExtremes ext) const
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CSeq_loc_Mapper –.
CSpliced_exon_chunk –.
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
#define T(s)
Definition: common.h:230
std::ofstream out("events_result.xml")
main entry point for tests
#define test(a, b, c, d, e)
Definition: numeric.c:170
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_USER_THROW(message)
Throw a quick-and-dirty runtime exception of type 'CException' with the given error message and error...
Definition: ncbiexpt.hpp:715
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define NCBI_RETHROW_SAME(prev_exception, message)
Generic macro to re-throw the same exception.
Definition: ncbiexpt.hpp:749
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
@ eUnknown
Definition: app_popup.hpp:72
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
void SetTruncatedStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (tl/tr - indicating removed parts of the seq-loc)
Definition: Seq_loc.cpp:3398
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
void ChangeToPackedInt(void)
Works only if location is currently an interval, point, packed-int (handled trivially),...
Definition: Seq_loc.cpp:3670
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
bool IsTruncatedStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3372
bool IsTruncatedStart(ESeqLocExtremes ext) const
check if parts of the seq-loc are missing
Definition: Seq_loc.cpp:3346
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
Definition: Seq_loc.cpp:3280
CRef< CSeq_loc > Intersect(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper) const
Find the intersection with the seq-loc, merge/sort resulting ranges depending on flags.
Definition: Seq_loc.cpp:5183
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3313
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
void SetTruncatedStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3431
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ eOrder_Biological
Iterate sub-locations in biological order.
Definition: Seq_loc.hpp:462
@ fMerge_SingleRange
Definition: Seq_loc.hpp:332
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
TSeqPos GetStop(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the stop of the location.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
TSeqPos GetStart(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the start of the location.
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
CRef< CSeq_loc > Seq_loc_Merge(const CSeq_loc &loc, CSeq_loc::TOpFlags flags, CScope *scope)
Merge ranges in the seq-loc.
CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Add two seq-locs.
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eNoOverlap
CSeq_locs do not overlap or abut.
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeq_loc_Mapper_Base & SetTrimSplicedSeg(bool trim)
For mapping spliced-segs only: preserve or trim starting/ending indels.
@ eBadAlignment
Unsupported or invalid alignment.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
const CSeq_loc & GetLocation(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NcbiCerr
Definition: ncbistre.hpp:544
const TDonor_after_exon & GetDonor_after_exon(void) const
Get the Donor_after_exon member data.
bool IsSetParts(void) const
Basic segments are always in biologic order. Check if a value has been assigned to Parts data member.
TMatch GetMatch(void) const
Get the variant data.
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
const TAcceptor_before_exon & GetAcceptor_before_exon(void) const
Get the Acceptor_before_exon member data.
bool IsMismatch(void) const
Check if variant Mismatch is selected.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
bool IsSetAcceptor_before_exon(void) const
Splice sites. Check if a value has been assigned to Acceptor_before_exon data member.
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
bool IsSetPoly_a(void) const
start of poly(A) tail on the transcript For sense transcripts: aligned product positions < poly-a <= ...
TDiag GetDiag(void) const
Get the variant data.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
bool IsSetGenomic_strand(void) const
genomic-strand represents the strand of translation Check if a value has been assigned to Genomic_str...
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
bool IsGenomic_ins(void) const
Check if variant Genomic_ins is selected.
bool IsMatch(void) const
Check if variant Match is selected.
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
bool IsDiag(void) const
Check if variant Diag is selected.
const TBases & GetBases(void) const
Get the Bases member data.
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSetProduct_length(void) const
length of the product, in bases/residues from this (or from poly-a if present), a 3' unaligned length...
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool IsSetGenomic_id(void) const
Check if a value has been assigned to Genomic_id data member.
bool IsProduct_ins(void) const
Check if variant Product_ins is selected.
TProduct_ins GetProduct_ins(void) const
Get the variant data.
TNucpos GetNucpos(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
bool IsSetDonor_after_exon(void) const
Check if a value has been assigned to Donor_after_exon data member.
void SetTo(TTo value)
Assign a value to To data member.
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
list< CRef< CSeq_interval > > Tdata
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const Tdata & Get(void) const
Get the member data.
const TId & GetId(void) const
Get the Id member data.
Tdata & Set(void)
Assign a value to data member.
void SetId(TId &value)
Assign a value to Id data member.
TFrom GetFrom(void) const
Get the From member data.
list< CRef< CSeq_loc > > Tdata
void SetFrom(TFrom value)
Assign a value to From data member.
const Tdata & Get(void) const
Get the member data.
bool IsPacked_int(void) const
Check if variant Packed_int is selected.
Definition: Seq_loc_.hpp:534
bool IsSetStrand(void) const
Check if a value has been assigned to Strand data member.
TStrand GetStrand(void) const
Get the Strand member data.
TTo GetTo(void) const
Get the To member data.
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
void SetStrand(TStrand value)
Assign a value to Strand data member.
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
const TPacked_int & GetPacked_int(void) const
Get the variant data.
Definition: Seq_loc_.cpp:216
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ e_Null
not placed
Definition: Seq_loc_.hpp:98
int i
int len
constexpr auto front(list< Head, As... >, T=T()) noexcept -> Head
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
#define abs(a)
Definition: ncbi_heapmgr.c:130
unsigned int a
Definition: ncbi_localip.c:102
Defines: CTimeFormat - storage class for time format.
T max(T x_, T y_)
T min(T x_, T y_)
Int4 delta(size_t dimension_, const Int4 *score_)
The Object manager core.
USING_SCOPE(objects)
CRef< CPacked_seqint > ProjectAndCollapseCDS(const CSeq_align &spliced_aln, CConstRef< CSeq_loc > product_cds_loc)
static CRef< CSeq_loc > ProjectExon_newlogic(const CSpliced_exon &exon, const CSeq_id &aln_genomic_id, ENa_strand aln_genomic_strand)
void Canonicalize(CSeq_loc &loc)
Recursively convert empty container-locs to null-locs, drop null sublocs from containers,...
T53Partialness GetTerminalPartialness(const CSeq_align &spliced_aln, CConstRef< CSeq_loc > product_cds_loc, size_t unaligned_ends_partialness_thr)
Return whether 5' and/or 3' end of exons-loc is partial based on unaligned tails in case of RNA,...
bool AreAbuttingOnProduct(const CSpliced_exon &exon1, const CSpliced_exon &exon2)
Return true iff abutting on query (in nucpos-coords)
CRef< CSeq_loc > ProjectExons(const CSeq_align &spliced_aln, CConstRef< CSeq_loc > product_cds_loc, bool convert_overlaps, size_t unaligned_ends_partialness_thr=0)
size_t GetUnalignedLength_3p(const CSeq_align &spliced_aln)
T53Partialness GetExonPartialness(const CSeq_align &spliced_aln, const CSpliced_exon &target_exon)
Return whether 5' and/or 3' end of exon is partial based on consensus splicing with upstream/downstre...
CRef< CSeq_loc > TruncateToCDS(CRef< CSeq_loc > detailed_rna_exon, CConstRef< CPacked_seqint > collapsed_genomic_cds)
static CRef< CSeq_loc > ProjectExon(const CSpliced_exon &exon, const CSeq_id &aln_genomic_id, ENa_strand aln_genomic_strand, bool convert_overlaps)
static CRef< CSeq_loc > ProjectExon_oldlogic(const CSpliced_exon &spliced_exon, const CSeq_id &aln_genomic_id, ENa_strand aln_genomic_strand, bool convert_overlaps)
pair< bool, bool > T53Partialness
::first and ::second indicate partialness of a loc or an exon, at the 5' and 3' end respectively.
void TransferPartialness(const CSeq_loc &src, CSeq_loc &dest)
CRef< CSpliced_exon > CollapseExonStructure(const CSpliced_exon &orig_exon)
Create an exon with the structure consisting of two diags extending inwards from the exon terminals w...
void AugmentPartialness(CSeq_loc &loc, T53Partialness partialness)
size_t GetUnalignedLength_5p(const CSeq_align &spliced_aln)
CRef< CSeq_loc > CollapseDiscontinuitiesInUTR(const CSeq_loc &loc, TSeqPos cds_start, TSeqPos cds_stop)
Precondition: input loc is discontinuity-preserving RNA loc. Postcondition: adjacent packed-ints havin...
static bool SameIdAndStrand(const CSeq_interval &a, const CSeq_interval &b)
static void SetBioStart(CSeq_interval &seqint, TSeqPos pos)
static void Validate(const CSeq_loc &orig_loc, const CSeq_loc &final_loc)
static void AdjustBioStop(CSeq_interval &seqint, TSignedSeqPos amt)
static void CheckIdAndStrand(const CPacked_seqint &ps)
static TSignedSeqPos GetGapLength(const CSeq_interval &prev, const CSeq_interval &curr)
static void SubsumeMicroIntervals(CPacked_seqint &ps)
static bool MinusStrand(const CSeq_interval &seqint)
static void ConvertOverlapsToGaps(CPacked_seqint &ps)
static bool CanCreateNonframeshiftingGap(const CSeq_interval &prev, const CSeq_interval &curr, const CSeq_interval &next)
static void CollapseNonframeshiftting(CPacked_seqint &ps)
static CRef< T > Clone(const T &x)
static CRef< CSeq_loc > TweakExon(const CSeq_loc &orig_loc, bool convert_overlaps)
static CRef< CSeq_interval > Collapse(const CSeq_interval &a, const CSeq_interval &b)
static container_t::value_type rel_at(const container_t &container, typename container_t::const_iterator it, Int8 delta, typename container_t::value_type default_value)
static void AdjustBioStart(CSeq_interval &seqint, TSignedSeqPos amt)
static const TSeqPos k_keep_len
static void AdjustBiostops(CPacked_seqint &ps)
static string AsString(const CPacked_seqint &packed_seqint)
static void SetBioStop(CSeq_interval &seqint, TSeqPos pos)
static const TSeqPos k_min_len
static void safe_advance(iterator_t begin, iterator_t end, iterator_t &it, Int8 d)
static TSeqPos GetBiostartsDelta(const CSeq_interval &upst, const CSeq_interval &downst)
static CRef< CSeq_interval > Prev(const CPacked_seqint::Tdata &seqints, const CPacked_seqint::Tdata::const_iterator it)
static CRef< CSeq_interval > Next(const CPacked_seqint::Tdata &seqints, const CPacked_seqint::Tdata::const_iterator it)
SAnnotSelector –.
bool IsFrameshifting() const
size_t GetFramePreservingSubjLen() const
void Add(const SBlock &other)
bool HasGap() const
void EraseAfter(SBlocks::iterator it)
SBlocks(const CSpliced_exon &spliced_exon)
static DP_BlockInfo * blocks
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
Modified on Tue Apr 16 20:10:02 2024 by modify_doxy.py rev. 669887