NCBI C++ ToolKit
transform_align.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: transform_align.cpp 101884 2024-02-28 15:27:16Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Vyacheslav Chetvernin
27  *
28  * File Description: Alignment transformations
29  *
30  */
31 #include <ncbi_pch.hpp>
37 
38 #include <objmgr/bioseq_handle.hpp>
39 #include <objmgr/scope.hpp>
40 #include <objmgr/feat_ci.hpp>
41 #include <objmgr/util/sequence.hpp>
42 
44 
45 #include "feature_generator.hpp"
46 
48 
50 
51 
52 namespace {
53 
// Determine the (product, genomic) strand pair of a spliced segment.
// For each strand the segment-level field is used when it is set; otherwise
// the first exon is consulted. GetExons().front() is dereferenced without an
// emptiness check, so in that case the segment must hold at least one exon.
// NOTE(review): the embedded line numbers jump 60 -> 62 and 66 -> 68, so the
// final fallback operand of each conditional expression is missing from this
// listing. Confirm the default strand value against the repository copy.
54 pair <ENa_strand, ENa_strand> GetSplicedStrands(const CSpliced_seg& spliced_seg)
55 {
56  ENa_strand product_strand =
57  spliced_seg.IsSetProduct_strand() ?
58  spliced_seg.GetProduct_strand() :
59  (spliced_seg.GetExons().front()->IsSetProduct_strand() ?
60  spliced_seg.GetExons().front()->GetProduct_strand() :
62  ENa_strand genomic_strand =
63  spliced_seg.IsSetGenomic_strand() ?
64  spliced_seg.GetGenomic_strand() :
65  (spliced_seg.GetExons().front()->IsSetGenomic_strand()?
66  spliced_seg.GetExons().front()->GetGenomic_strand():
68 
// Returned as (product strand, genomic strand).
69  return make_pair(product_strand, genomic_strand);
70 }
71 
72 void SetProtpos(CProduct_pos &pos, int value)
73 {
74  pos.SetProtpos().SetAmin(value/3);
75  pos.SetProtpos().SetFrame((value % 3) +1);
76 }
77 
78 }
79 
80 
81 void CFeatureGenerator::SImplementation::GetExonStructure(const CSpliced_seg& spliced_seg, vector<SExon>& exons, CScope* scope)
82 {
83  pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
84  ENa_strand product_strand = strands.first;
85  ENa_strand genomic_strand = strands.second;
86 
87  exons.resize(spliced_seg.GetExons().size());
88  int i = 0;
89  TSignedSeqPos prev_genomic_pos = 0;
91  ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
92  const CSpliced_exon& exon = **it;
93  SExon& exon_struct = exons[i++];
94 
95  const CProduct_pos& prod_from = exon.GetProduct_start();
96  const CProduct_pos& prod_to = exon.GetProduct_end();
97 
98  exon_struct.prod_from = prod_from.AsSeqPos();
99  exon_struct.prod_to = prod_to.AsSeqPos();
100  if (product_strand == eNa_strand_minus) {
101  swap(exon_struct.prod_from, exon_struct.prod_to);
102  exon_struct.prod_from = -exon_struct.prod_from;
103  exon_struct.prod_to = -exon_struct.prod_to;
104  }
105 
106  exon_struct.genomic_from = exon.GetGenomic_start();
107  exon_struct.genomic_to = exon.GetGenomic_end();
108 
109  bool cross_the_origin = i > 1 && (
110  genomic_strand != eNa_strand_minus
111  ? (exon_struct.genomic_from < prev_genomic_pos)
112  : (exon_struct.genomic_from > prev_genomic_pos));
113 
114  if (cross_the_origin && scope) {
115  offset = scope->GetSequenceLength(spliced_seg.GetGenomic_id());
116  }
117 
118  prev_genomic_pos = exon_struct.genomic_from;
119 
120  if (genomic_strand == eNa_strand_minus) {
121  swap(exon_struct.genomic_from, exon_struct.genomic_to);
122  exon_struct.genomic_from = -exon_struct.genomic_from;
123  exon_struct.genomic_to = -exon_struct.genomic_to;
124  }
125 
126  if (offset) {
127  exon_struct.genomic_from += offset;
128  exon_struct.genomic_to += offset;
129  }
130 
131  }
132 
133  _ASSERT( exons.size() == spliced_seg.GetExons().size() );
134 }
135 
136 
// Body of a routine that closes small gaps ("holes") between consecutive
// exons of the spliced alignment `align`. A hole is filled with mismatch and
// insertion chunks; the two exons are then either merged into one or, when the
// pair wraps around the origin, made to abut at the origin.
// NOTE(review): the signature (original line 137) and several interior lines
// (e.g. 218, 223, 235, 279, 293, 311, 344, 350, 360, 381, 386, 395 - mostly the
// declarations of `part`) are missing from this listing, as shown by the jumps
// in the embedded line numbers. Presumably this is
// CFeatureGenerator::SImplementation::StitchSmallHoles(CSeq_align& align) - confirm.
138 {
139  CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
140 
// A hole needs two exons; with fewer there is nothing to do.
141  if (!spliced_seg.CanGetExons() || spliced_seg.GetExons().size() < 2)
142  return;
143 
// Strand-normalized exon table; exons[i] parallels the i-th exon of the list.
144  vector<SExon> exons;
145  GetExonStructure(spliced_seg, exons, m_scope);
146 
147  bool is_protein = (spliced_seg.GetProduct_type()==CSpliced_seg::eProduct_type_protein);
148 
149  pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
150  ENa_strand product_strand = strands.first;
151  ENa_strand genomic_strand = strands.second;
152 
// Product extent in the same normalized coordinates as `exons` (negated for a
// minus-strand product). It is used below only to decide the 'partial' flag.
153  int product_min_pos;
154  int product_max_pos;
155  if (product_strand != eNa_strand_minus) {
156  product_min_pos = 0;
157  if (spliced_seg.IsSetPoly_a()) {
158  product_max_pos = spliced_seg.GetPoly_a()-1;
159  } else if (spliced_seg.IsSetProduct_length()) {
160  product_max_pos = spliced_seg.GetProduct_length()-1;
// Last amino-acid index -> nucleotide-scale index of its third base.
161  if (is_protein)
162  product_max_pos = product_max_pos*3+2;
163  } else {
164  product_max_pos = exons.back().prod_to;
165  }
166  } else {
167  if (spliced_seg.IsSetProduct_length()) {
168  product_min_pos = -int(spliced_seg.GetProduct_length())+1;
169  if (is_protein)
170  product_min_pos = product_min_pos*3-2;
171  } else {
172  product_min_pos = exons[0].prod_from;
173  }
174  if (spliced_seg.IsSetPoly_a()) {
175  product_max_pos = -int(spliced_seg.GetPoly_a())+1;
176  } else {
177  product_max_pos = 0;
178  }
179  }
180 
181  CSpliced_seg::TExons::iterator it = spliced_seg.SetExons().begin();
182  CRef<CSpliced_exon> prev_exon = *it;
183  size_t i = 1;
// Optional mapper from the transcript to its CDS product. When available it is
// used to split the filler mismatch according to the codon phase at the hole.
184  CRef<CSeq_loc_Mapper> mapper_to_cds;
185  CRef<CSeq_id> transcript_id(new CSeq_id);
186  try {
187  transcript_id->Assign(align.GetSeq_id(0));
188  CMappedFeat cds = GetCdsOnMrna(*transcript_id, *m_scope);
189  if (cds && cds.IsSetProduct()) {
190  mapper_to_cds.Reset(new CSeq_loc_Mapper(*cds.GetSeq_feat(),
191  CSeq_loc_Mapper::eLocationToProduct, m_scope.GetPointer()));
192  }
193  } catch (CSeqalignException &) {
// Best effort: without a mapper the codon-aware split is simply skipped.
194  ERR_POST(Warning << "Can't create mapper to CDS");
195  }
// Walk adjacent exon pairs (prev_exon, *it); exons[i-1] / exons[i] are their
// normalized coordinates. After a merge `it` is reset to the surviving exon,
// so the increment expression keeps prev_exon on it and advances `it`.
196  for (++it; it != spliced_seg.SetExons().end(); ++i, prev_exon = *it++) {
197  CSpliced_exon& exon = **it;
198 
// NOTE(review): acceptor_set tests prev_exon->GetGenomic_start(), exactly as
// donor_set does - confirm this is intended rather than a check on `exon`.
199  bool donor_set = prev_exon->IsSetDonor_after_exon() || (genomic_strand ==eNa_strand_minus && prev_exon->GetGenomic_start()==0);
200  bool acceptor_set = exon.IsSetAcceptor_before_exon() || (genomic_strand ==eNa_strand_minus && prev_exon->GetGenomic_start()==0);
201 
// Both splice sites present and the product is contiguous: not a hole.
202  if(donor_set && acceptor_set && exons[i-1].prod_to + 1 == exons[i].prod_from) {
203  continue;
204  }
205 
206  _ASSERT( exons[i].prod_from > exons[i-1].prod_to );
207  int prod_hole_len = exons[i].prod_from - exons[i-1].prod_to -1;
208  _ASSERT( exons[i].genomic_from > exons[i-1].genomic_to );
209  int genomic_hole_len = exons[i].genomic_from - exons[i-1].genomic_to -1;
210 
// Leave the gap alone when it reaches m_min_intron on a side selected by the
// threshold flags.
211  if (((m_intron_stitch_threshold_flags & fProduct) &&
212  prod_hole_len >= (int)m_min_intron) ||
213  ((m_intron_stitch_threshold_flags & fGenomic) &&
214  genomic_hole_len >= (int)m_min_intron))
215  continue;
216 
// Exons without explicit parts get a single match chunk covering the whole
// exon, so that filler chunks can be attached next to it.
217  if (!prev_exon->IsSetParts() || prev_exon->GetParts().empty()) {
219  part->SetMatch(exons[i-1].prod_to-exons[i-1].prod_from+1);
220  prev_exon->SetParts().push_back(part);
221  }
222  if (!exon.IsSetParts() || exon.GetParts().empty()) {
224  part->SetMatch(exons[i].prod_to-exons[i].prod_from+1);
225  exon.SetParts().push_back(part);
226  }
227 
// The hole is represented as: left mismatch, insertion (the length difference
// between the two sides), right mismatch. By default the whole common length
// goes to the right mismatch.
228  int max_hole_len = max(prod_hole_len, genomic_hole_len);
229  int min_hole_len = min(prod_hole_len, genomic_hole_len);
230  int left_mismatch_len = 0;
231  int right_mismatch_len = min_hole_len;
232  if (prod_hole_len != genomic_hole_len && mapper_to_cds) {
233  CSeq_loc end_pos(*transcript_id, exons[i-1].prod_to);
234  TSeqPos end_pos_on_cds = mapper_to_cds->Map(end_pos)
236  int bases_needed_to_complete_codon = 2 - (end_pos_on_cds % 3);
237 
// Move enough bases to the left to finish the current codon, plus a whole
// number of codons taken from half of what remains.
238  if (right_mismatch_len >= bases_needed_to_complete_codon) {
239  left_mismatch_len = bases_needed_to_complete_codon + ((right_mismatch_len-bases_needed_to_complete_codon)/2/3)*3;
240  right_mismatch_len -= left_mismatch_len;
241  }
242  }
243 
244  bool no_acceptor_before = i > 1 && !prev_exon->IsSetAcceptor_before_exon();
245  bool no_donor_after = i < exons.size()-1 && !exon.IsSetDonor_after_exon();
246 
247 
// Exon starts running against the strand direction mean the pair wraps around
// the origin.
248  bool cross_the_origin =
249  genomic_strand != eNa_strand_minus
250  ? (prev_exon->GetGenomic_start() > exon.GetGenomic_start())
251  : (prev_exon->GetGenomic_start() < exon.GetGenomic_start());
252 
253  if (cross_the_origin) {
// Origin-crossing pair: both exons are kept. The previous exon is extended up
// to the origin and the next exon is made to start right after it.
254  int genomic_size = m_scope->GetSequenceLength(spliced_seg.GetGenomic_id());
255 
256  prev_exon->SetPartial(product_min_pos < exons[i-1].prod_from &&
257  no_acceptor_before);
258 
259  exon.SetPartial(exons[i].prod_to < product_max_pos &&
260  no_donor_after);
261 
262  if (genomic_strand != eNa_strand_minus) {
263  prev_exon->SetGenomic_end(genomic_size-1);
264  exon.SetGenomic_start(0);
265  } else {
266  prev_exon->SetGenomic_start(0);
267  exon.SetGenomic_end(genomic_size-1);
268  }
269 
// `origin` is expressed in normalized coordinates; to_origin is the number of
// bases between the previous exon's end and the origin.
270  int origin = genomic_strand != eNa_strand_minus ? genomic_size : 1;
271  int to_origin = origin - exons[i-1].genomic_to -1;
272  if (prod_hole_len == genomic_hole_len) {
273  left_mismatch_len = to_origin;
274  right_mismatch_len -= left_mismatch_len;
275  }
276 
// Step 1: spend the left mismatch on the previous exon, up to the origin.
277  if (left_mismatch_len > 0 && to_origin > 0) {
278  int mismatch_len = min(left_mismatch_len, to_origin);
280  part->SetMismatch(mismatch_len);
281  prev_exon->SetParts().push_back(part);
282  prod_hole_len -= mismatch_len;
283  genomic_hole_len -= mismatch_len;
284  to_origin -= mismatch_len;
285  exons[i-1].genomic_to += mismatch_len;
286  exons[i-1].prod_to += mismatch_len;
287  left_mismatch_len -= mismatch_len;
288  }
289 
// Step 2: if the origin is still not reached, append the insertion (or the
// part of a genomic insertion that fits) to the previous exon.
290  if (to_origin > 0) {
291  _ASSERT(left_mismatch_len == 0);
292  _ASSERT(prod_hole_len != genomic_hole_len);
294  if (prod_hole_len < genomic_hole_len) {
295  int genomic_ins = min(genomic_hole_len-prod_hole_len, to_origin);
296  part->SetGenomic_ins(genomic_ins);
297  genomic_hole_len -= genomic_ins;
298  to_origin -= genomic_ins;
299  exons[i-1].genomic_to += genomic_ins;
300  } else {
301  part->SetProduct_ins(prod_hole_len-genomic_hole_len);
302  exons[i-1].prod_to += prod_hole_len-genomic_hole_len;
303  prod_hole_len = genomic_hole_len;
304  }
305  prev_exon->SetParts().push_back(part);
306  }
// Step 3: cover whatever is left before the origin with right-mismatch bases.
307  if (to_origin > 0) {
308  _ASSERT(prod_hole_len == genomic_hole_len);
309  _ASSERT(right_mismatch_len >= to_origin);
310  int mismatch_len = to_origin;
312  part->SetMismatch(mismatch_len);
313  prev_exon->SetParts().push_back(part);
314  prod_hole_len -= mismatch_len;
315  genomic_hole_len -= mismatch_len;
316  to_origin = 0;
317  exons[i-1].genomic_to += mismatch_len;
318  exons[i-1].prod_to += mismatch_len;
319  right_mismatch_len -= mismatch_len;
320  }
321 
322  _ASSERT(to_origin == 0);
323  _ASSERT(exons[i-1].genomic_to == origin-1);
324 
// The next exon now begins immediately after the extended previous exon.
325  exons[i].prod_from = exons[i-1].prod_to+1;
326  exons[i].genomic_from = exons[i-1].genomic_to+1;
327 
// Write the moved product boundaries back into the ASN.1 exons.
328  if (is_protein) {
329  prev_exon->SetProduct_end().SetProtpos().SetAmin() = exons[i-1].prod_to/3;
330  prev_exon->SetProduct_end().SetProtpos().SetFrame() = (exons[i-1].prod_to %3) +1;
331  exon.SetProduct_start().SetProtpos().SetAmin() = exons[i].prod_from/3;
332  exon.SetProduct_start().SetProtpos().SetFrame() = (exons[i].prod_from %3) +1;
333  } else if (product_strand != eNa_strand_minus) {
334  prev_exon->SetProduct_end().SetNucpos( exons[i-1].prod_to );
335  exon.SetProduct_start().SetNucpos( exons[i].prod_from );
336  } else {
337  prev_exon->SetProduct_start().SetNucpos( -exons[i-1].prod_to );
338  exon.SetProduct_end().SetNucpos( -exons[i].prod_from );
339  }
340 
// The rest of the hole is prepended to the next exon's parts, in order:
// left mismatch, insertion, right mismatch.
341  list <CRef< CSpliced_exon_chunk > >::iterator insertion_point = exon.SetParts().begin();
342 
343  if (left_mismatch_len > 0) {
345  part->SetMismatch(left_mismatch_len);
346  insertion_point = exon.SetParts().insert(insertion_point, part);
347  ++insertion_point;
348  }
349  if (prod_hole_len != genomic_hole_len) {
351  if (prod_hole_len < genomic_hole_len) {
352  part->SetGenomic_ins(genomic_hole_len - prod_hole_len);
353  } else {
354  part->SetProduct_ins(prod_hole_len - genomic_hole_len);
355  }
356  insertion_point = exon.SetParts().insert(insertion_point, part);
357  ++insertion_point;
358  }
359  if (right_mismatch_len > 0) {
361  part->SetMismatch(right_mismatch_len);
362  exon.SetParts().insert(insertion_point, part);
363 
364  }
365 
366  } else {
367 
// Ordinary pair: merge `exon` into prev_exon. prev_exon takes over the far
// boundary of `exon` on both the product and the genomic side.
368  if (is_protein || product_strand != eNa_strand_minus) {
369  prev_exon->SetProduct_end().Assign( exon.GetProduct_end() );
370  } else {
371  prev_exon->SetProduct_start().Assign( exon.GetProduct_start() );
372  }
373 
374  if (genomic_strand != eNa_strand_minus) {
375  prev_exon->SetGenomic_end() = exon.GetGenomic_end();
376  } else {
377  prev_exon->SetGenomic_start() = exon.GetGenomic_start();
378  }
379 
// Filler chunks for the hole, then the parts of the absorbed exon.
380  if (left_mismatch_len > 0) {
382  part->SetMismatch(left_mismatch_len);
383  prev_exon->SetParts().push_back(part);
384  }
385  if (prod_hole_len != genomic_hole_len) {
387  if (prod_hole_len < genomic_hole_len) {
388  part->SetGenomic_ins(max_hole_len - min_hole_len);
389  } else {
390  part->SetProduct_ins(max_hole_len - min_hole_len);
391  }
392  prev_exon->SetParts().push_back(part);
393  }
394  if (right_mismatch_len > 0) {
396  part->SetMismatch(right_mismatch_len);
397  prev_exon->SetParts().push_back(part);
398 
399  }
400  prev_exon->SetParts().splice(prev_exon->SetParts().end(), exon.SetParts());
401 
// The merged exon ends where `exon` ended, so it inherits that donor site
// (or loses its own when `exon` had none).
402  if (exon.IsSetDonor_after_exon()) {
403  prev_exon->SetDonor_after_exon().Assign( exon.GetDonor_after_exon() );
404  } else {
405  prev_exon->ResetDonor_after_exon();
406  }
407 
// exons[i] becomes the record of the merged exon for the next iteration.
408  exons[i].prod_from = exons[i-1].prod_from;
409  exons[i].genomic_from = exons[i-1].genomic_from;
410 
411  prev_exon->SetPartial(
412  (product_min_pos < exons[i-1].prod_from && no_acceptor_before) ||
413  (exons[i].prod_to < product_max_pos && no_donor_after));
414 
415  if (exon.IsSetExt()) {
416  prev_exon->SetExt().splice(prev_exon->SetExt().end(), exon.SetExt());
417  }
418 
// Remove the absorbed exon and step `it` back onto prev_exon; the loop's
// increment expression then moves on to the exon after it.
419  CSpliced_seg::TExons::iterator save_it = it;
420  --save_it;
421  spliced_seg.SetExons().erase(it);
422  it = save_it;
423  }
424  }
425 }
426 
427 vector<CFeatureGenerator::SImplementation::SExon> CFeatureGenerator::SImplementation::
428 GetExons(const CSeq_align &align)
429 {
430  vector<SExon> exons;
431  GetExonStructure(align.GetSegs().GetSpliced(), exons, NULL);
432  return exons;
433 }
434 
444 };
445 
// Remove scores from a spliced alignment: every exon's scores are reset, and
// on the alignment itself a list of named scores plus "weighted_identity" is
// reset.
// NOTE(review): the return-type line (446) and the headers of both loops
// (449, 456) are missing from this listing; the list of score names that the
// second loop walks is therefore not visible here.
447 ClearScores(CSeq_align &align)
448 {
450  align.SetSegs().SetSpliced().SetExons())
451  {
452  (*exon_it)->ResetScores();
453  }
454  if (align.IsSetScore()) {
455  CScoreBuilderBase score_builder;
// The loop stops at the first element for which *score is false.
457  *score; ++score)
458  {
459  align.ResetNamedScore(*score);
460  }
461  align.ResetNamedScore("weighted_identity");
462 
// Drop the score container entirely once nothing is left in it.
463  if (align.SetScore().empty()) {
464  align.ResetScore();
465  }
466  }
467 }
468 
469 
// Body of a routine that refreshes the scores of a spliced alignment after it
// has been modified: per-exon identity first, then the alignment-level scores.
// NOTE(review): the signature (lines 470-471), both loop headers (473, 481)
// and the right-hand side of the product-type comparison (491) are missing
// from this listing. Presumably this is
// CFeatureGenerator::SImplementation::RecalculateScores(CSeq_align& align) - confirm.
472 {
474  align.SetSegs().SetSpliced().SetExons())
475  {
476  RecalculateExonIdty(**exon_it);
477  }
478 
479  if (align.IsSetScore()) {
480  CScoreBuilderBase score_builder;
482  *score; ++score)
483  {
// Only scores that are currently present are recomputed; `sink` merely
// receives the old value, which is discarded.
484  int sink;
485  if (align.GetNamedScore(*score, sink)) {
486  align.ResetNamedScore(*score);
487  score_builder.AddScore(*m_scope, align, *score);
488  }
489  }
490  if (align.GetSegs().GetSpliced().GetProduct_type() ==
492  {
493  score_builder.AddSplignScores(align);
494  }
// "weighted_identity" is removed after the scores above have been added.
495  align.ResetNamedScore("weighted_identity");
496  }
497 }
498 
// Body of the per-exon identity update (called as RecalculateExonIdty(exon)
// elsewhere in this file). The "idty" score is recomputed from the exon's
// parts as matches / total length, and every other exon score is erased.
// NOTE(review): the signature (lines 499-500) and the `case` labels of the
// switch (511, 516, 520, 524) are missing from this listing.
501 {
// Exons that carry no scores are left untouched.
502  if (!exon.IsSetScores())
503  return;
504 
// idty is a fixed-point value scaled by 10^10; -1 means "cannot be computed".
505  Int8 idty = -1;
506  if (exon.IsSetParts()) {
507  int matches = 0;
508  int total = 0;
509  ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
510  switch ((*part_it)->Which()) {
512  matches += (*part_it)->GetMatch();
513  total += (*part_it)->GetMatch();
514  break;
515 
517  total += (*part_it)->GetMismatch();
518  break;
519 
521  total += (*part_it)->GetProduct_ins();
522  break;
523 
525  total += (*part_it)->GetGenomic_ins();
526  break;
527 
// Any other chunk kind makes the identity unknown.
528  default:
529  matches = INT_MIN; // to ensure negative identity
530  total += 1; // to prevent division by zero
531  break;
532  }
533  }
534  if (total) {
// NOTE(review): when matches is near INT_MIN the product below is about
// -2.1e19, which is outside the signed 64-bit range - looks like a signed
// overflow on the `default` path; confirm.
535  idty = matches * NCBI_CONST_INT8(10000000000) / total;
536  }
537  else {
538  idty = 0;
539  }
540  }
541 
// Keep only the string-id score "idty", updated with the new value, and only
// when the identity is non-negative. Every other score is erased, and so is
// "idty" itself when the identity could not be computed.
542  CScore_set::Tdata& exon_scores = exon.SetScores().Set();
543  ERASE_ITERATE (CScore_set::Tdata, score_it, exon_scores) {
544  if (idty >= 0 && (*score_it)->IsSetId() && (*score_it)->GetId().IsStr() &&
545  (*score_it)->GetId().GetStr() == "idty") {
546  (*score_it)->SetValue().SetReal(idty / 10000000000.);
547  } else {
548  exon_scores.erase(score_it);
549  }
550  }
551 }
552 
// Body of TrimHolesToCodons (name taken from the error text below): inside the
// CDS, exon ends that border a hole are trimmed on the product side until they
// fall on codon boundaries. The outer ends of the alignment are trimmed as
// well when fTrimEnds is set in m_flags.
// NOTE(review): the signature (line 553) and the throw statement heading the
// message at lines 577-578 (line 576) are missing from this listing.
554 {
555  CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
556 
557  if (!spliced_seg.CanGetExons())
558  return;
559 
560  bool is_protein = (spliced_seg.GetProduct_type()==CSpliced_seg::eProduct_type_protein);
561 
562  pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
563  ENa_strand product_strand = strands.first;
564  ENa_strand genomic_strand = strands.second;
565 
// CDS range in nucleotide-scale product coordinates. For a protein product it
// is the whole product; otherwise it comes from GetCds, and the routine gives
// up silently when there is no product id or the range is empty.
566  TSignedSeqRange cds;
567  if (is_protein) {
568  cds = TSignedSeqRange(0, spliced_seg.GetProduct_length()*3 - 1);
569  } else {
570  if (!spliced_seg.CanGetProduct_id())
571  return;
572  cds = GetCds(spliced_seg.GetProduct_id());
573  if (cds.Empty())
574  return;
575  if (product_strand == eNa_strand_minus) {
577  "TrimHolesToCodons(): "
578  "Reversed mRNA with CDS");
579  }
580  }
581 
582  vector<SExon> exons;
583  GetExonStructure(spliced_seg, exons, m_scope);
584 
585  int frame_offset = (exons.back().prod_to/3+1)*3+cds.GetFrom(); // to make modulo operands always positive
586 
// right_* iterators point at the exon to the right of the boundary being
// examined; the reverse iterators built from them point at the exon to its
// left. The loop also visits the positions before the first and after the
// last exon.
587  vector<SExon>::iterator right_exon_it = exons.begin();
588  CSpliced_seg::TExons::iterator right_spl_exon_it = spliced_seg.SetExons().begin();
589 
590  for(;;++right_exon_it, ++right_spl_exon_it) {
591 
592  vector<SExon>::reverse_iterator left_exon_it(right_exon_it);
593  CSpliced_seg::TExons::reverse_iterator left_spl_exon_it(right_spl_exon_it);
594 
// Interior boundary: skip it when the product is contiguous and either both
// splice sites are set or the genomic side is contiguous as well.
595  if (right_exon_it != exons.begin() && right_exon_it != exons.end()) {
596  bool donor_set = left_spl_exon_it != spliced_seg.SetExons().rend() && (*left_spl_exon_it)->IsSetDonor_after_exon();
597  bool acceptor_set = right_spl_exon_it != spliced_seg.SetExons().end() && (*right_spl_exon_it)->IsSetAcceptor_before_exon();
598 
599  if(((donor_set && acceptor_set) || left_exon_it->genomic_to + 1 == right_exon_it->genomic_from) && left_exon_it->prod_to + 1 == right_exon_it->prod_from) {
600  continue;
601  }
602  }
603 
// Trim the right end of the left exon while it lies strictly inside the CDS
// and does not end on the last base of a codon.
604  if (right_exon_it != exons.begin() && (right_exon_it != exons.end() || (m_flags & fTrimEnds))) {
605  while (exons.rend() != left_exon_it &&
606  cds.GetFrom() < left_exon_it->prod_to && left_exon_it->prod_to < cds.GetTo() &&
607  (left_exon_it->prod_to - cds.GetFrom() + 1) % 3 > 0
608  ) {
609  TrimLeftExon(min(left_exon_it->prod_to - left_exon_it->prod_from + 1,
610  (left_exon_it->prod_to - cds.GetFrom() + 1) % 3),
611  eTrimProduct,
612  exons.rend(), left_exon_it, left_spl_exon_it,
613  product_strand, genomic_strand);
614  }
615  }
616 
// Trim the left end of the right exon while it lies strictly inside the CDS
// and does not start on the first base of a codon.
617  if (right_exon_it != exons.end() && (right_exon_it != exons.begin() || (m_flags & fTrimEnds))) {
618  while (right_exon_it != exons.end() &&
619  cds.GetFrom() < right_exon_it->prod_from && right_exon_it->prod_from < cds.GetTo() &&
620  (frame_offset-right_exon_it->prod_from) % 3 > 0
621  ) {
622  TrimRightExon(min(right_exon_it->prod_to - right_exon_it->prod_from + 1,
623  (frame_offset-right_exon_it->prod_from) % 3),
624  eTrimProduct,
625  right_exon_it, exons.end(), right_spl_exon_it,
626  product_strand, genomic_strand);
627  }
628  }
629 
// Erase the exons lying between the two iterators from both containers.
// Presumably the trim helpers step the iterators past exons they consume
// entirely - TODO confirm against TrimLeftExon/TrimRightExon.
630  if (left_exon_it.base() != right_exon_it) {
631  right_exon_it = exons.erase(left_exon_it.base(), right_exon_it);
632  right_spl_exon_it = spliced_seg.SetExons().erase(left_spl_exon_it.base(), right_spl_exon_it);
633  }
634 
635  if (right_exon_it == exons.end())
636  break;
637  }
638  _ASSERT(right_exon_it == exons.end() && right_spl_exon_it == spliced_seg.SetExons().end());
639 }
640 
// Body of a routine that rewrites the parts of a spliced alignment so that
// whole codons of genomic insertion become diagonal (aligned) chunks and whole
// codons of product insertion are dropped. Product coordinates are shifted
// accordingly and the product length is updated at the end.
// NOTE(review): the signature (line 641), the initializer of is_protein_align
// (645), the `case` labels (672, 724) and the declarations of new_chunk
// (702, 715) are missing from this listing. Presumably this is
// CFeatureGenerator::SImplementation::MaximizeTranslation(CSeq_align& align) - confirm.
642 {
643  CSpliced_seg& spliced_seg = align.SetSegs().SetSpliced();
644  bool is_protein_align =
646 
// Running shift of product coordinates, in amino acids. It starts at minus the
// first exon's product start and then grows or shrinks as insertions are
// converted.
647  int aa_offset = 0;
648 
649  CSpliced_seg::TExons::iterator prev_exon_it = spliced_seg.SetExons().end();
650 
651  NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
652  CSpliced_exon& exon = **exon_it;
653 
654  if (exon_it == spliced_seg.SetExons().begin()) {
655  if (is_protein_align)
656  aa_offset = - int(exon.GetProduct_start().GetProtpos().GetAmin());
657  else
658  aa_offset = - int(exon.GetProduct_start().GetNucpos())/3;
659  }
660 
// Apply the shift accumulated so far to this exon's product start.
661  if (aa_offset) {
662  if (is_protein_align)
663  exon.SetProduct_start().SetProtpos().SetAmin() += aa_offset;
664  else
665  exon.SetProduct_start().SetNucpos() += aa_offset*3;
666  }
667  if (exon.IsSetParts()) {
668  int part_index = 0;
669  ERASE_ITERATE (CSpliced_exon::TParts, part_it, exon.SetParts()) {
670  CSpliced_exon_chunk& chunk = **part_it;
671  switch (chunk.Which()) {
// Genomic insertion: a length that is a multiple of three becomes a diag chunk.
673  int len = chunk.GetGenomic_ins();
674  if (len % 3 == 0) {
675  chunk.SetDiag(len);
676  } else {
// The first part of the exon may complete a codon together with a genomic
// insertion that ends the previous exon.
677  if (part_index == 0 && prev_exon_it != spliced_seg.SetExons().end() &&
678  (*prev_exon_it)->IsSetParts()) {
679  CSpliced_exon_chunk& prev_chunk = **(*prev_exon_it)->SetParts().rbegin();
680  if (prev_chunk.Which()==CSpliced_exon_chunk::e_Genomic_ins) {
681  int prev_len = prev_chunk.GetGenomic_ins();
682  if (prev_len + len >= 3) {
683 
684  prev_chunk.SetDiag(prev_len);
685 
// Both sides of the exon boundary move by the bases taken over from the
// previous exon's insertion.
686  if (is_protein_align) {
687  TSeqPos product_end = (*prev_exon_it)->GetProduct_end().AsSeqPos();
688  product_end += prev_len;
689  (*prev_exon_it)->SetProduct_end().SetProtpos().SetAmin (product_end/3);
690  (*prev_exon_it)->SetProduct_end().SetProtpos().SetFrame(product_end%3+1);
691 
692  TSeqPos product_start = exon.GetProduct_start().AsSeqPos();
693  product_start += prev_len;
694  exon.SetProduct_start().SetProtpos().SetAmin (product_start/3);
695  exon.SetProduct_start().SetProtpos().SetFrame(product_start%3+1);
696  } else {
697  (*prev_exon_it)->SetProduct_end().SetNucpos() += prev_len;
698  exon.SetProduct_start().SetNucpos() += prev_len;
699  }
700 
// Split off the 3-prev_len bases that finish the shared codon; the remainder
// stays a genomic insertion.
701  if (len > 3-prev_len) {
703  new_chunk->SetDiag(3-prev_len);
704  exon.SetParts().insert(part_it, new_chunk);
705  chunk.SetGenomic_ins(len - (3-prev_len));
706  } else {
707  chunk.SetDiag(len);
708  }
709  aa_offset += 1;
710  len -= 3-prev_len;
711  }
712  }
713  }
// Convert the whole codons that remain; the leftover 1-2 bases stay inserted.
714  if (len > 3) {
716  new_chunk->SetDiag((len/3)*3);
717  exon.SetParts().insert(part_it, new_chunk);
718  chunk.SetGenomic_ins(len % 3);
719  }
720  }
721  aa_offset += len/3;
722  }
723  break;
// Product insertion: whole codons are removed, so the product gets shorter.
725  int len = chunk.GetProduct_ins();
726  if (len % 3 == 0) {
727  exon.SetParts().erase(part_it);
728  } else {
729  chunk.SetProduct_ins(len % 3);
730  }
731  aa_offset -= len/3;
732  }
733  break;
734  default:
735  break;
736  }
737  ++part_index;
738  }
739  }
// The exon's product end receives the shift including this exon's own changes.
740  if (aa_offset) {
741  if (is_protein_align)
742  exon.SetProduct_end().SetProtpos().SetAmin() += aa_offset;
743  else
744  exon.SetProduct_end().SetNucpos() += aa_offset*3;
745  }
746  prev_exon_it = exon_it;
747  }
// New product length is the last exon's product end + 1. prev_exon_it is
// dereferenced unconditionally, so this assumes at least one exon - TODO
// confirm callers guarantee it.
748  spliced_seg.SetProduct_length() = is_protein_align
749  ? (*prev_exon_it)->GetProduct_end().GetProtpos().GetAmin()+1
750  : (*prev_exon_it)->GetProduct_end().GetNucpos()+1;
751 }
752 
// Body of a forwarding wrapper: hands align_in, range and mode to
// m_impl->AdjustAlignment and returns its result unchanged.
// NOTE(review): the signature (line 753) is missing from this listing.
754 {
755  return m_impl->AdjustAlignment(align_in, range, mode);
756 }
757 
759 {
760  if (!align_in.CanGetSegs() || !align_in.GetSegs().IsSpliced())
761  return CConstRef<CSeq_align>(&align_in);
762 
763  CRef<CSeq_align> align(new CSeq_align);
764  align->Assign(align_in);
765 
766  vector<SExon> orig_exons = GetExons(*align);
767 
768  CSpliced_seg& spliced_seg = align->SetSegs().SetSpliced();
769 
770  pair <ENa_strand, ENa_strand> strands = GetSplicedStrands(spliced_seg);
771  ENa_strand product_strand = strands.first;
772  ENa_strand genomic_strand = strands.second;
773 
774  if (product_strand == eNa_strand_minus) {
776  "AdjustAlignment(): "
777  "product minus strand not supported");
778 
779  }
780 
781  bool plus_strand = !(genomic_strand == eNa_strand_minus);
782 
783  TSeqRange align_range;
784  if (plus_strand) {
785  align_range = TSeqRange(spliced_seg.GetExons().front()->GetGenomic_start(),
786  spliced_seg.GetExons().back()->GetGenomic_end());
787  } else {
788  align_range = TSeqRange(spliced_seg.GetExons().back()->GetGenomic_start(),
789  spliced_seg.GetExons().front()->GetGenomic_end());
790  }
791  bool cross_the_origin = range.GetFrom() > range.GetTo() || align_range.GetFrom() > align_range.GetTo();
792  TSeqPos genomic_size = 0;
793  if (cross_the_origin) {
794  genomic_size = m_scope->GetSequenceLength(spliced_seg.GetGenomic_id());
795 
796 
797  if (range.GetFrom() > range.GetTo()) {
798  range.SetTo(range.GetTo() + genomic_size);
799  }
800  if (align_range.GetFrom() > align_range.GetTo()) {
801  align_range.SetTo(align_range.GetTo() + genomic_size);
802  }
803 
804  if (range.GetTo() < align_range.GetFrom()) {
805  range.SetFrom(range.GetFrom() + genomic_size);
806  range.SetTo(range.GetTo() + genomic_size);
807  }
808  if (align_range.GetTo() < range.GetFrom()) {
809  align_range.SetFrom(align_range.GetFrom() + genomic_size);
810  align_range.SetTo(align_range.GetTo() + genomic_size);
811  }
812 
813  TSeqPos outside_point = min(range.GetFrom(), align_range.GetFrom());
814  NON_CONST_ITERATE(CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
815  CSpliced_exon& exon = **exon_it;
816  if (exon.GetGenomic_start() < outside_point) {
817  exon.SetGenomic_start() += genomic_size;
818  exon.SetGenomic_end() += genomic_size;
819  }
820  }
821  }
822 
823  if (!(range.GetFrom() <= range.GetTo()) ||
824  !(align_range.GetFrom() <= align_range.GetTo())) {
825  NCBI_USER_THROW("no inverted range assertion failed");
826  }
827  if (range.GetTo() < align_range.GetFrom() ||
828  align_range.GetTo() < range.GetFrom()) {
829  NCBI_USER_THROW("alignmentrange and requested range don't overlap");
830  }
831 
832  vector<SExon> exons;
833  GetExonStructure(spliced_seg, exons, m_scope);
834 
835  bool is_protein_align =
837 
838  vector<SExon>::iterator right_exon_it = exons.begin();
839  CSpliced_seg::TExons::iterator right_spl_exon_it = spliced_seg.SetExons().begin();
840 
841  int range_left = plus_strand ? int(range.GetFrom()) : -int(range.GetTo());
842  int range_right = plus_strand ? int(range.GetTo()) : -int(range.GetFrom());
843 
844  for(;;++right_exon_it, ++right_spl_exon_it) {
845 
846  vector<SExon>::reverse_iterator left_exon_it(right_exon_it);
847  CSpliced_seg::TExons::reverse_iterator left_spl_exon_it(right_spl_exon_it);
848 
849  if (right_exon_it == exons.end() &&
850  left_exon_it->genomic_to > range_right
851  )
852  CFeatureGenerator::SImplementation::TrimLeftExon(left_exon_it->genomic_to - range_right, eTrimGenomic,
853  exons.rend(), left_exon_it, left_spl_exon_it,
854  product_strand, genomic_strand);
855 
856  if (right_exon_it == exons.begin() &&
857  right_exon_it->genomic_from < range_left
858  )
859  CFeatureGenerator::SImplementation::TrimRightExon(range_left - right_exon_it->genomic_from, eTrimGenomic,
860  right_exon_it, exons.end(), right_spl_exon_it,
861  product_strand, genomic_strand);
862  bool delete_me = false;
863  if (left_exon_it.base() != right_exon_it) {
864  delete_me = true;
865  }
866  if(delete_me) {
867  right_exon_it = exons.erase(left_exon_it.base(), right_exon_it);
868  right_spl_exon_it = spliced_seg.SetExons().erase(left_spl_exon_it.base(), right_spl_exon_it);
869  }
870 
871  if (right_exon_it == exons.end())
872  break;
873  }
874 
875  CSpliced_exon& first_exon = *spliced_seg.SetExons().front();
876  CSpliced_exon& last_exon = *spliced_seg.SetExons().back();
877 
878  int first_exon_extension = 0;
879  int last_exon_extension = 0;
880 
881  if (plus_strand) {
882 
883  first_exon_extension =
884  first_exon.GetGenomic_start()
885  - ((range.GetFrom() < genomic_size && genomic_size <= first_exon.GetGenomic_start())
886  ? genomic_size
887  : range.GetFrom());
888 
889  if (first_exon_extension > 0) {
890  first_exon.SetGenomic_start() -= first_exon_extension;
891  if (first_exon.IsSetParts()) {
893  chunk->SetDiag(first_exon_extension);
894  first_exon.SetParts().insert(first_exon.SetParts().begin(), chunk);
895  }
896  }
897 
898  last_exon_extension =
899  ((last_exon.GetGenomic_end() <= genomic_size-1 && genomic_size-1 < range.GetTo())
900  ? genomic_size-1
901  : range.GetTo())
902  - last_exon.GetGenomic_end();
903 
904  if (last_exon_extension > 0) {
905  last_exon.SetGenomic_end() += last_exon_extension;
906  if (last_exon.IsSetParts()) {
908  chunk->SetDiag(last_exon_extension);
909  last_exon.SetParts().push_back(chunk);
910  }
911  }
912  } else {
913  last_exon_extension =
914  last_exon.GetGenomic_start()
915  - ((range.GetFrom() < genomic_size && genomic_size <= last_exon.GetGenomic_start())
916  ? genomic_size
917  : range.GetFrom());
918 
919  if (last_exon_extension > 0) {
920  last_exon.SetGenomic_start() -= last_exon_extension;
921  if (last_exon.IsSetParts()) {
923  chunk->SetDiag(last_exon_extension);
924  last_exon.SetParts().push_back(chunk);
925  }
926  }
927 
928  first_exon_extension =
929  ((first_exon.GetGenomic_end() <= genomic_size-1 && genomic_size-1 < range.GetTo())
930  ? genomic_size-1
931  : range.GetTo())
932  - first_exon.GetGenomic_end();
933  if (first_exon_extension > 0) {
934  first_exon.SetGenomic_end() += first_exon_extension;
935  if (first_exon.IsSetParts()) {
937  chunk->SetDiag(first_exon_extension);
938  first_exon.SetParts().insert(first_exon.SetParts().begin(), chunk);
939  }
940  }
941  }
942 
943  exons.front().prod_from -= first_exon_extension;
944  exons.front().genomic_from -= first_exon_extension;
945  exons.back().prod_to += last_exon_extension;
946  exons.back().genomic_to += last_exon_extension;
947 
948 
949  if (plus_strand) {
950  first_exon_extension = first_exon.GetGenomic_start() - range.GetFrom();
951 
952  if (first_exon_extension > 0) {
954  exon->SetGenomic_start() = range.GetFrom();
955  exon->SetGenomic_end() = genomic_size-1;
956  spliced_seg.SetExons().push_front(exon);
957 
958  SExon exon_struct;
959  exon_struct.prod_from = exons.front().prod_from - first_exon_extension;
960  exon_struct.prod_to = exons.front().prod_from - 1;
961  exon_struct.genomic_from = exons.front().genomic_from - first_exon_extension;
962  exon_struct.genomic_to = exons.front().genomic_from - 1;
963 
964  exons.insert(exons.begin(), exon_struct);
965  }
966 
967  last_exon_extension = range.GetTo() - last_exon.GetGenomic_end();
968 
969  if (last_exon_extension > 0) {
971  exon->SetGenomic_start() = 0;
972  exon->SetGenomic_end() = last_exon_extension - 1;
973  spliced_seg.SetExons().push_back(exon);
974 
975  SExon exon_struct;
976  exon_struct.prod_from = exons.back().prod_to + 1;
977  exon_struct.prod_to = exons.back().prod_to + last_exon_extension;
978  exon_struct.genomic_from = exons.back().genomic_to +1;
979  exon_struct.genomic_to = exons.back().genomic_to + last_exon_extension;
980 
981  exons.push_back(exon_struct);
982  }
983  } else {
984  last_exon_extension = last_exon.GetGenomic_start() - range.GetFrom();
985 
986  if (last_exon_extension > 0) {
988  exon->SetGenomic_start() = range.GetFrom();
989  exon->SetGenomic_end() = genomic_size-1;
990  spliced_seg.SetExons().push_back(exon);
991 
992  SExon exon_struct;
993  exon_struct.prod_from = exons.back().prod_to + 1;
994  exon_struct.prod_to = exons.back().prod_to + last_exon_extension;
995  exon_struct.genomic_from = exons.back().genomic_to +1;
996  exon_struct.genomic_to = exons.back().genomic_to + last_exon_extension;
997 
998  exons.push_back(exon_struct);
999  }
1000 
1001  first_exon_extension = range.GetTo() - first_exon.GetGenomic_end();
1002 
1003  if (first_exon_extension > 0) {
1005  exon->SetGenomic_start() = 0;
1006  exon->SetGenomic_end() = first_exon_extension - 1;
1007  spliced_seg.SetExons().push_front(exon);
1008 
1009  SExon exon_struct;
1010  exon_struct.prod_from = exons.front().prod_from - first_exon_extension;
1011  exon_struct.prod_to = exons.front().prod_from - 1;
1012  exon_struct.genomic_from = exons.front().genomic_from - first_exon_extension;
1013  exon_struct.genomic_to = exons.front().genomic_from - 1;
1014 
1015  exons.insert(exons.begin(), exon_struct);
1016  }
1017  }
1018 
1019  if (range_left != exons.front().genomic_from || range_right != exons.back().genomic_to) {
1021  "AdjustAlignment(): "
1022  "result's ends do not match the range. This is a bug in AdjustAlignment implementation");
1023  }
1024 
1025  int offset = is_protein_align ? int(exons.front().prod_from/3)*3 : exons.front().prod_from;
1026  if (offset > exons.front().prod_from) // negative division rounds toward zero
1027  offset -= 3;
1028 
1029  if (mode == eTryToPreserveProductPositions && offset > 0) {
1030  offset = 0; // do not shift product position unnecessarily
1031  }
1032 
1033  vector<SExon>::iterator exon_struct_it = exons.begin();
1034 
1035  int putative_prod_length = 0;
1036  if (is_protein_align) {
1037  NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1038  CSpliced_exon& exon = **exon_it;
1039  SetProtpos(exon.SetProduct_start(), exon_struct_it->prod_from - offset);
1040  SetProtpos(exon.SetProduct_end(), exon_struct_it->prod_to - offset);
1041  ++exon_struct_it;
1042  }
1043  putative_prod_length = (exons.back().prod_to - offset + 3)/3;
1044  } else {
1045  NON_CONST_ITERATE (CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1046  CSpliced_exon& exon = **exon_it;
1047  exon.SetProduct_start().SetNucpos() = exon_struct_it->prod_from - offset;
1048  exon.SetProduct_end().SetNucpos() = exon_struct_it->prod_to - offset;
1049  ++exon_struct_it;
1050  }
1051  putative_prod_length = exons.back().prod_to - offset + 1;
1052  }
1053  if (mode == eForceProductFrom0 || (int)spliced_seg.GetProduct_length() < putative_prod_length) {
1054  spliced_seg.SetProduct_length(putative_prod_length);
1055  }
1056 
1057  if (cross_the_origin) {
1058  NON_CONST_ITERATE(CSpliced_seg::TExons, exon_it, spliced_seg.SetExons()) {
1059  CSpliced_exon& exon = **exon_it;
1060  if (exon.GetGenomic_start() >= genomic_size)
1061  exon.SetGenomic_start() -= genomic_size;
1062  if (exon.GetGenomic_end() >= genomic_size)
1063  exon.SetGenomic_end() -= genomic_size;
1064  }
1065  }
1066  if (spliced_seg.IsSetExons()) {
1067  auto& spliced_exons = spliced_seg.SetExons();
1068  for(auto exon_it = spliced_exons.begin(); exon_it != spliced_exons.end();) {
1069  bool delete_me = false;
1070  if( (*exon_it)->IsSetParts() ) {
1071  //
1072  // check if we perchance eliminated all meaningful "parts" and nothing left from either genome or product
1073  //
1074  delete_me = true;
1075  for (auto part_it: (*exon_it)->GetParts()) {
1076  switch( part_it->Which()) {
1080  delete_me = false;
1081  break;
1082  default: break;
1083  }
1084  }
1085  }
1086  if(delete_me) {
1087  exon_it = spliced_exons.erase(exon_it);
1088  }
1089  else {
1090  exon_it++;
1091  }
1092  }
1093  }
1094  if (GetExons(*align) != orig_exons) {
1095  ClearScores(*align);
1096  }
1097 
1098  return align;
1099 }
1100 
/// Find a coding-region feature annotated on the given mRNA sequence.
///
/// @param rna_id  id of the sequence whose cdregion features are scanned
/// @param scope   scope used to resolve the bioseq handle and to compare ids
/// @return the selected feature, or a default-constructed CMappedFeat when the
///         handle cannot be obtained or no feature qualifies
///
/// NOTE(review): the loop has no `break`, so the result is decided by the
/// LAST feature that is not skipped: a later feature whose location does not
/// resolve to rna_id resets an earlier accepted one. Confirm this is intended.
1101 CMappedFeat GetCdsOnMrna(const objects::CSeq_id& rna_id, CScope& scope)
1102 {
1103  CMappedFeat cdregion_feat;
1104  CBioseq_Handle handle = scope.GetBioseqHandle(rna_id);
1105  if (handle) {
1106  for (CFeat_CI feat_iter(handle, CSeqFeatData::eSubtype_cdregion);
1107  feat_iter; ++feat_iter)
1108  {
     // Skip when the iterator reports size zero, and skip pseudo features.
1109  if (!feat_iter.GetSize() ||
1110  (feat_iter->IsSetPseudo() && feat_iter->GetPseudo()))
1111  {
1112  continue;
1113  }
     // Tentatively accept this feature, then validate its location.
1114  cdregion_feat = *feat_iter;
1115  const CSeq_loc& cds_loc = cdregion_feat.GetLocation();
1116  const CSeq_id* cds_loc_seq_id = cds_loc.GetId();
     // Reject when the location yields no id (GetId() returned NULL) or the
     // id does not denote the same bioseq as rna_id.
1117  if (cds_loc_seq_id == NULL || !sequence::IsSameBioseq(*cds_loc_seq_id, rna_id, &scope)) {
1118  cdregion_feat = CMappedFeat();
1119  }
1120  }
1121  }
1122  return cdregion_feat;
1123 }
1124 
// Returns the total range of the CDS that GetCdsOnMrna() finds for rna_id
// in *m_scope, converted to a TSignedSeqRange. When no CDS feature is found,
// a default-constructed TSignedSeqRange is returned instead.
// NOTE(review): the first line of this definition (its signature) is not
// present in this listing; only the body is documented here.
1126 {
1127  CMappedFeat cdregion = GetCdsOnMrna(rna_id, *m_scope);
1128  if (!cdregion) {
1129  return TSignedSeqRange();
1130  }
1131 
     // Collapse the CDS location to one from/to pair.
1132  TSeqRange cds = cdregion.GetLocation().GetTotalRange();
1133 
1134  return TSignedSeqRange(cds.GetFrom(), cds.GetTo());
1135 }
1136 
// Trims trim_amount positions off the "to" end of exons (prod_to / genomic_to
// are decreased), walking the exon list through reverse iterators.
//
//   trim_amount     number of positions to remove; measured on the product
//                   when side==eTrimProduct, otherwise on the genomic sequence
//   left_edge       reverse-iterator bound; the walk stops when exon_it hits it
//   exon_it         in/out: current entry of the SExon vector, advanced past
//                   exons that are consumed entirely
//   spl_exon_it     in/out: matching CSpliced_exon, advanced in lock-step
//   product_strand, genomic_strand
//                   decide which end of the CSpliced_exon is moved
//
// Exons that are consumed entirely are only stepped over here, not erased.
// NOTE(review): this listing omits the first line of the signature and the
// `case` labels inside the switch; the branch comments below identify each
// branch by the accessor it calls.
1138  vector<SExon>::reverse_iterator left_edge,
1139  vector<SExon>::reverse_iterator& exon_it,
1140  CSpliced_seg::TExons::reverse_iterator& spl_exon_it,
1141  ENa_strand product_strand,
1142  ENa_strand genomic_strand)
1143 {
     // A product-side trim is expected to be smaller than 3.
1144  _ASSERT( trim_amount < 3 || side!=eTrimProduct );
     // Product coordinate kind is taken from the exon current at entry.
1145  bool is_protein = (*spl_exon_it)->GetProduct_start().IsProtpos();
1146 
1147  while (trim_amount > 0) {
1148  int exon_len = side==eTrimProduct
1149  ? (exon_it->prod_to - exon_it->prod_from + 1)
1150  : (exon_it->genomic_to - exon_it->genomic_from + 1);
1151  if (exon_len <= trim_amount) {
     // The whole exon is consumed: step to the next one in reverse order.
1152  int next_from = exon_it->genomic_from;
1153  ++exon_it;
1154  ++spl_exon_it;
1155  trim_amount -= exon_len;
1156  _ASSERT( trim_amount==0 || side!=eTrimProduct );
1157  if (exon_it == left_edge)
1158  break;
1159  if (trim_amount > 0) { // eTrimGenomic, account for distance between exons
     // May drive trim_amount to zero or below, which ends the outer loop.
1160  trim_amount -= next_from - exon_it->genomic_to -1;
1161  }
1162  } else {
     // Only part of this exon is removed: flag it partial and drop the
     // splice-site record that follows the exon.
1163  (*spl_exon_it)->SetPartial(true);
1164  (*spl_exon_it)->ResetDonor_after_exon();
1165 
1166  int genomic_trim_amount = 0;
1167  int product_trim_amount = 0;
1168 
1169  if ((*spl_exon_it)->CanGetParts() && !(*spl_exon_it)->GetParts().empty()) {
1170  CSpliced_exon::TParts& parts = (*spl_exon_it)->SetParts();
1171  CSpliced_exon_Base::TParts::iterator chunk = parts.end();
     // Consume chunks from the back. After the requested amount is used up,
     // keep going while the trailing chunk is an insertion on the other
     // sequence (genomic_ins for a product trim, product_ins for a genomic
     // trim), so those trailing insertions are removed as well.
     // NOTE(review): `--chunk` runs before every test with no check against
     // parts.begin(); this appears to rely on the chunks covering more than
     // trim_amount - confirm parts can never be emptied here.
1172  while (--chunk, (trim_amount>0 ||
1173  (side==eTrimProduct
1174  ? (*chunk)->IsGenomic_ins()
1175  : (*chunk)->IsProduct_ins()))) {
1176  int product_chunk_len = 0;
1177  int genomic_chunk_len = 0;
1178  switch((*chunk)->Which()) {
     // Match chunk: same length on both sequences; shortened in place when
     // it is longer than the remaining trim.
1180  product_chunk_len = (*chunk)->GetMatch();
1181  genomic_chunk_len = product_chunk_len;
1182  if (product_chunk_len > trim_amount) {
1183  (*chunk)->SetMatch(product_chunk_len - trim_amount);
1184  }
1185  break;
     // Mismatch chunk: handled like a match.
1187  product_chunk_len = (*chunk)->GetMismatch();
1188  genomic_chunk_len = product_chunk_len;
1189  if (product_chunk_len > trim_amount) {
1190  (*chunk)->SetMismatch(product_chunk_len - trim_amount);
1191  }
1192  break;
     // Diag chunk: handled like a match.
1194  product_chunk_len = (*chunk)->GetDiag();
1195  genomic_chunk_len = product_chunk_len;
1196  if (product_chunk_len > trim_amount) {
1197  (*chunk)->SetDiag(product_chunk_len - trim_amount);
1198  }
1199  break;
1200 
     // Product insertion: has product length only; shortened only when
     // trimming on the product side.
1202  product_chunk_len = (*chunk)->GetProduct_ins();
1203  if (side==eTrimProduct && product_chunk_len > trim_amount) {
1204  (*chunk)->SetProduct_ins(product_chunk_len - trim_amount);
1205  }
1206  break;
     // Genomic insertion: has genomic length only; shortened only when
     // trimming on the genomic side.
1208  genomic_chunk_len = (*chunk)->GetGenomic_ins();
1209  if (side==eTrimGenomic && genomic_chunk_len > trim_amount) {
1210  (*chunk)->SetGenomic_ins(genomic_chunk_len - trim_amount);
1211  }
1212  break;
1213  default:
1214  _ASSERT(false);
1215  break;
1216  }
1217 
     // Chunk fits entirely into the remaining trim: account for it on both
     // sequences and fall through to erase it. Otherwise the chunk was
     // shortened above; record the partial amounts and stop.
1218  if (side==eTrimProduct && product_chunk_len <= trim_amount) {
1219  genomic_trim_amount += genomic_chunk_len;
1220  product_trim_amount += product_chunk_len;
1221  trim_amount -= product_chunk_len;
1222  } else if (side==eTrimGenomic && genomic_chunk_len <= trim_amount) {
1223  genomic_trim_amount += genomic_chunk_len;
1224  product_trim_amount += product_chunk_len;
1225  trim_amount -= genomic_chunk_len;
1226  } else {
1227  genomic_trim_amount += min(trim_amount, genomic_chunk_len);
1228  product_trim_amount += min(trim_amount, product_chunk_len);
1229  trim_amount = 0;
1230  break;
1231  }
     // erase() returns the position after the removed chunk; the `--chunk`
     // in the loop condition then steps back to the new last chunk.
1232  chunk = parts.erase(chunk);
1233  }
1234 
1235  } else {
     // No chunk list: remove the same amount from both sequences.
1236  genomic_trim_amount += trim_amount;
1237  product_trim_amount += trim_amount;
1238  trim_amount = 0;
1239  }
1240 
     // Apply the accumulated amounts to the SExon bookkeeping entry.
1241  exon_it->prod_to -= product_trim_amount;
1242  exon_it->genomic_to -= genomic_trim_amount;
1243 
     // Mirror the change in the CSpliced_exon. Protein positions are rebuilt
     // from prod_to; nucleotide positions are shifted, at the end or the
     // start depending on the product strand.
1244  if (is_protein) {
1245  CProduct_pos& prot_pos = (*spl_exon_it)->SetProduct_end();
1246  SetProtpos(prot_pos, exon_it->prod_to);
1247  } else {
1248  if (product_strand != eNa_strand_minus) {
1249  (*spl_exon_it)->SetProduct_end().SetNucpos() -= product_trim_amount;
1250  } else {
1251  (*spl_exon_it)->SetProduct_start().SetNucpos() += product_trim_amount;
1252  }
1253  }
1254 
     // On the minus genomic strand the trimmed end is genomic_start.
1255  if (genomic_strand != eNa_strand_minus) {
1256  (*spl_exon_it)->SetGenomic_end() -= genomic_trim_amount;
1257  } else {
1258  (*spl_exon_it)->SetGenomic_start() += genomic_trim_amount;
1259  }
1260  }
1261  }
1262 }
// Trims trim_amount positions off the "from" end of exons (prod_from /
// genomic_from are increased), walking the exon list through forward
// iterators.
//
//   trim_amount     number of positions to remove; measured on the product
//                   when side==eTrimProduct, otherwise on the genomic sequence
//   exon_it         in/out: current entry of the SExon vector, advanced past
//                   exons that are consumed entirely
//   right_edge      iterator bound; the walk stops when exon_it reaches it
//   spl_exon_it     in/out: matching CSpliced_exon, advanced in lock-step
//   product_strand, genomic_strand
//                   decide which end of the CSpliced_exon is moved
//
// Exons that are consumed entirely are only stepped over here, not erased.
// NOTE(review): this listing omits the first line of the signature and the
// `case` labels inside the switch; the branch comments below identify each
// branch by the accessor it calls.
1264  vector<SExon>::iterator& exon_it,
1265  vector<SExon>::iterator right_edge,
1266  CSpliced_seg::TExons::iterator& spl_exon_it,
1267  ENa_strand product_strand,
1268  ENa_strand genomic_strand)
1269 {
     // A product-side trim is expected to be smaller than 3.
1270  _ASSERT( trim_amount < 3 || side!=eTrimProduct );
     // Product coordinate kind is taken from the exon current at entry.
1271  bool is_protein = (*spl_exon_it)->GetProduct_start().IsProtpos();
1272 
1273  while (trim_amount > 0) {
1274  int exon_len = side==eTrimProduct
1275  ? (exon_it->prod_to - exon_it->prod_from + 1)
1276  : (exon_it->genomic_to - exon_it->genomic_from + 1);
1277  if (exon_len <= trim_amount) {
     // The whole exon is consumed: step to the following one.
1278  int prev_to = exon_it->genomic_to;
1279  ++exon_it;
1280  ++spl_exon_it;
1281  trim_amount -= exon_len;
1282  _ASSERT( trim_amount==0 || side!=eTrimProduct );
1283  if (exon_it == right_edge)
1284  break;
1285  if (trim_amount > 0) { // eTrimGenomic, account for distance between exons
     // May drive trim_amount to zero or below, which ends the outer loop.
1286  trim_amount -= exon_it->genomic_from - prev_to -1;
1287  }
1288  } else {
     // Only part of this exon is removed: flag it partial and drop the
     // splice-site record that precedes the exon.
1289  (*spl_exon_it)->SetPartial(true);
1290  (*spl_exon_it)->ResetAcceptor_before_exon();
1291 
1292  int genomic_trim_amount = 0;
1293  int product_trim_amount = 0;
1294 
1295  if ((*spl_exon_it)->CanGetParts() && !(*spl_exon_it)->GetParts().empty()) {
1296  CSpliced_exon::TParts& parts = (*spl_exon_it)->SetParts();
1297  CSpliced_exon_Base::TParts::iterator chunk = parts.begin();
     // Consume chunks from the front. After the requested amount is used up,
     // keep going while the leading chunk is an insertion on the other
     // sequence (genomic_ins for a product trim, product_ins for a genomic
     // trim), so those leading insertions are removed as well.
     // NOTE(review): `*chunk` is dereferenced with no check against
     // parts.end(); this appears to rely on the chunks covering more than
     // trim_amount - confirm parts can never be emptied here.
1298  for (; trim_amount>0 ||
1299  (side==eTrimProduct
1300  ? (*chunk)->IsGenomic_ins()
1301  : (*chunk)->IsProduct_ins());
1302  ) {
1303  int product_chunk_len = 0;
1304  int genomic_chunk_len = 0;
1305  switch((*chunk)->Which()) {
     // Match chunk: same length on both sequences; shortened in place when
     // it is longer than the remaining trim.
1307  product_chunk_len = (*chunk)->GetMatch();
1308  genomic_chunk_len = product_chunk_len;
1309  if (product_chunk_len > trim_amount) {
1310  (*chunk)->SetMatch(product_chunk_len - trim_amount);
1311  }
1312  break;
     // Mismatch chunk: handled like a match.
1314  product_chunk_len = (*chunk)->GetMismatch();
1315  genomic_chunk_len = product_chunk_len;
1316  if (product_chunk_len > trim_amount) {
1317  (*chunk)->SetMismatch(product_chunk_len - trim_amount);
1318  }
1319  break;
     // Diag chunk: handled like a match.
1321  product_chunk_len = (*chunk)->GetDiag();
1322  genomic_chunk_len = product_chunk_len;
1323  if (product_chunk_len > trim_amount) {
1324  (*chunk)->SetDiag(product_chunk_len - trim_amount);
1325  }
1326  break;
1327 
     // Product insertion: has product length only; shortened only when
     // trimming on the product side.
1329  product_chunk_len = (*chunk)->GetProduct_ins();
1330  if (side==eTrimProduct && product_chunk_len > trim_amount) {
1331  (*chunk)->SetProduct_ins(product_chunk_len - trim_amount);
1332  }
1333  break;
     // Genomic insertion: has genomic length only; shortened only when
     // trimming on the genomic side.
1335  genomic_chunk_len = (*chunk)->GetGenomic_ins();
1336  if (side==eTrimGenomic && genomic_chunk_len > trim_amount) {
1337  (*chunk)->SetGenomic_ins(genomic_chunk_len - trim_amount);
1338  }
1339  break;
1340  default:
1341  _ASSERT(false);
1342  break;
1343  }
1344 
     // Chunk fits entirely into the remaining trim: account for it on both
     // sequences and fall through to erase it. Otherwise the chunk was
     // shortened above; record the partial amounts and stop.
1345  if (side==eTrimProduct && product_chunk_len <= trim_amount) {
1346  genomic_trim_amount += genomic_chunk_len;
1347  product_trim_amount += product_chunk_len;
1348  trim_amount -= product_chunk_len;
1349  } else if (side==eTrimGenomic && genomic_chunk_len <= trim_amount) {
1350  genomic_trim_amount += genomic_chunk_len;
1351  product_trim_amount += product_chunk_len;
1352  trim_amount -= genomic_chunk_len;
1353  } else {
1354  genomic_trim_amount += min(trim_amount, genomic_chunk_len);
1355  product_trim_amount += min(trim_amount, product_chunk_len);
1356  trim_amount = 0;
1357  break;
1358  }
     // erase() returns the chunk that followed the removed one, which is the
     // new front of the list.
1359  chunk = parts.erase(chunk);
1360  }
1361 
1362  } else {
     // No chunk list: remove the same amount from both sequences.
1363  genomic_trim_amount += trim_amount;
1364  product_trim_amount += trim_amount;
1365  trim_amount = 0;
1366  }
1367 
     // Apply the accumulated amounts to the SExon bookkeeping entry.
1368  exon_it->prod_from += product_trim_amount;
1369  exon_it->genomic_from += genomic_trim_amount;
1370 
     // Mirror the change in the CSpliced_exon. Protein positions are rebuilt
     // from prod_from; nucleotide positions are shifted, at the start or the
     // end depending on the product strand.
1371  if (is_protein) {
1372  CProduct_pos& prot_pos = (*spl_exon_it)->SetProduct_start();
1373  SetProtpos(prot_pos, exon_it->prod_from);
1374  } else {
1375  if (product_strand != eNa_strand_minus) {
1376  (*spl_exon_it)->SetProduct_start().SetNucpos() += product_trim_amount;
1377  } else {
1378  (*spl_exon_it)->SetProduct_end().SetNucpos() -= product_trim_amount;
1379  }
1380  }
1381 
     // On the minus genomic strand the trimmed end is genomic_end.
1382  if (genomic_strand != eNa_strand_minus) {
1383  (*spl_exon_it)->SetGenomic_start() += genomic_trim_amount;
1384  } else {
1385  (*spl_exon_it)->SetGenomic_end() -= genomic_trim_amount;
1386  }
1387  }
1388  }
1389 }
1390 
// Helper namespace holding the genetic-code lookup.
1391 namespace fg {
// Returns the genetic code id for the sequence behind bsh: starts from the
// default 1 (presumably the standard genetic code - confirm) and, when
// sequence::GetBioSource() yields a source, replaces it with
// source->GetGenCode(gcode), passing the default as the argument.
// NOTE(review): the signature line of this function is not present in this
// listing; only the body is documented here.
1393 {
1394  int gcode = 1;
1395 
1396  auto source = sequence::GetBioSource(bsh);
1397  if (source != nullptr) {
1398  gcode = source->GetGenCode(gcode);
1399  }
1400 
1401  return gcode;
1402 }
1403 }
1404 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
CBioseq_Handle –.
CFeat_CI –.
Definition: feat_ci.hpp:64
unique_ptr< SImplementation > m_impl
Definition: gene_model.hpp:232
CConstRef< objects::CSeq_align > AdjustAlignment(const objects::CSeq_align &align, TSeqRange range, EProductPositionsMode mode=eForceProductFrom0)
EProductPositionsMode
Adjust alignment to the specified range (cross-the-origin range on circular chromosome is indicated b...
Definition: gene_model.hpp:151
CMappedFeat –.
Definition: mapped_feat.hpp:59
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
void AddSplignScores(const CSeq_align &align, CSeq_align::TScore &scores)
Compute the six splign scores.
void AddScore(CScope &scope, CSeq_align &align, CSeq_align::EScoreType score)
EScoreType
enum controlling known named scores
Definition: Seq_align.hpp:128
@ eScore_PercentIdentity_Gapped
Definition: Seq_align.hpp:163
@ eScore_PercentIdentity_Ungapped
Definition: Seq_align.hpp:164
@ eScore_PercentCoverage
Definition: Seq_align.hpp:168
@ eScore_HighQualityPercentCoverage
Definition: Seq_align.hpp:177
@ eScore_IdentityCount
Definition: Seq_align.hpp:145
@ eScore_MismatchCount
Definition: Seq_align.hpp:154
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
void ResetNamedScore(const string &name)
Definition: Seq_align.cpp:606
CSeq_loc_Mapper –.
CSpliced_exon_chunk –.
int offset
Definition: replacements.h:160
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_USER_THROW(message)
Throw a quick-and-dirty runtime exception of type 'CException' with the given error message and error...
Definition: ncbiexpt.hpp:715
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
const CBioSource * GetBioSource(const CBioseq &bioseq)
Retrieve the BioSource object for a given bioseq handle.
Definition: sequence.cpp:104
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
TSeqPos GetSequenceLength(const CSeq_id &id, TGetFlags flags=0)
Get sequence length Return kInvalidSeqPos if sequence is not found.
Definition: scope.cpp:769
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
@ eLocationToProduct
Map from the feature's location to product.
bool IsSetProduct(void) const
const CSeq_loc & GetLocation(void) const
CConstRef< CSeq_feat > GetSeq_feat(void) const
Get current seq-feat.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
bool Empty(void) const
Definition: range.hpp:148
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
const TDonor_after_exon & GetDonor_after_exon(void) const
Get the Donor_after_exon member data.
const TProtpos & GetProtpos(void) const
Get the variant data.
void SetScores(TScores &value)
Assign a value to Scores data member.
TScore & SetScore(void)
Assign a value to Score data member.
Definition: Seq_align_.hpp:902
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
void SetProduct_start(TProduct_start &value)
Assign a value to Product_start data member.
void SetAmin(TAmin value)
Assign a value to Amin data member.
Definition: Prot_pos_.hpp:229
bool IsSetParts(void) const
Basic segments are always in biological order. Check if a value has been assigned to Parts data member.
list< CRef< CScore > > Tdata
Definition: Score_set_.hpp:90
TProduct_ins & SetProduct_ins(void)
Select the variant.
void SetProduct_end(TProduct_end &value)
Assign a value to Product_end data member.
bool IsSetProduct_strand(void) const
Should be 'plus' or 'minus'. Check if a value has been assigned to Product_strand data member.
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
bool CanGetExons(void) const
Check if it is safe to call GetExons method.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
TDiag & SetDiag(void)
Select the variant.
bool IsSetAcceptor_before_exon(void) const
splice sites Check if a value has been assigned to Acceptor_before_exon data member.
TExons & SetExons(void)
Assign a value to Exons data member.
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
bool IsSetPoly_a(void) const
start of poly(A) tail on the transcript For sense transcripts: aligned product positions < poly-a <= ...
void ResetScore(void)
Reset Score data member.
Definition: Seq_align_.cpp:295
TExt & SetExt(void)
Assign a value to Ext data member.
void SetProduct_length(TProduct_length value)
Assign a value to Product_length data member.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
TAmin GetAmin(void) const
Get the Amin member data.
Definition: Prot_pos_.hpp:220
void SetGenomic_start(TGenomic_start value)
Assign a value to Genomic_start data member.
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
bool CanGetSegs(void) const
Check if it is safe to call GetSegs method.
Definition: Seq_align_.hpp:915
bool IsSetExt(void) const
extra info Check if a value has been assigned to Ext data member.
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
bool IsSetGenomic_strand(void) const
Check if a value has been assigned to Genomic_strand data member.
void SetPartial(TPartial value)
Assign a value to Partial data member.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
TProtpos & SetProtpos(void)
Select the variant.
TGenomic_ins & SetGenomic_ins(void)
Select the variant.
bool CanGetProduct_id(void) const
Check if it is safe to call GetProduct_id method.
bool IsSetExons(void) const
set of segments involved each segment corresponds to one exon exons are always in biological order Ch...
TParts & SetParts(void)
Assign a value to Parts data member.
void SetGenomic_end(TGenomic_end value)
Assign a value to Genomic_end data member.
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSetProduct_length(void) const
length of the product, in bases/residues from this (or from poly-a if present), a 3' unaligned length...
bool IsSetScore(void) const
for whole alignment Check if a value has been assigned to Score data member.
Definition: Seq_align_.hpp:884
TPoly_a GetPoly_a(void) const
Get the Poly_a member data.
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
void SetFrame(TFrame value)
Assign a value to Frame data member.
Definition: Prot_pos_.hpp:279
TProduct_ins GetProduct_ins(void) const
Get the variant data.
TNucpos GetNucpos(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsSetDonor_after_exon(void) const
Check if a value has been assigned to Donor_after_exon data member.
bool IsSetScores(void) const
scores for this exon Check if a value has been assigned to Scores data member.
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Diag
both sequences are represented, there is sufficient similarity between product and genomic sequences....
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
range(_Ty, _Ty) -> range< _Ty >
int GetGeneticCode(const CBioseq_Handle &bsh)
mdb_mode_t mode
Definition: lmdb++.h:38
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
#define NCBI_CONST_INT8(v)
64-bit integers
Definition: ncbi_std.h:195
T max(T x_, T y_)
T min(T x_, T y_)
static const GLdouble origin[]
static void TrimLeftExon(int trim_amount, ETrimSide side, vector< SExon >::reverse_iterator left_edge, vector< SExon >::reverse_iterator &exon_it, objects::CSpliced_seg::TExons::reverse_iterator &spl_exon_it, objects::ENa_strand product_strand, objects::ENa_strand genomic_strand)
void MaximizeTranslation(objects::CSeq_align &align)
vector< SExon > GetExons(const CSeq_align &align)
void RecalculateScores(CSeq_align &align)
CConstRef< objects::CSeq_align > AdjustAlignment(const objects::CSeq_align &align, TSeqRange range, EProductPositionsMode mode)
void RecalculateExonIdty(CSpliced_exon &exon)
TSignedSeqRange GetCds(const objects::CSeq_id &seqid)
void GetExonStructure(const CSpliced_seg &spliced_seg, vector< SExon > &exons, CScope *scope)
void ClearScores(CSeq_align &align)
static void TrimRightExon(int trim_amount, ETrimSide side, vector< SExon >::iterator &exon_it, vector< SExon >::iterator right_edge, objects::CSpliced_seg::TExons::iterator &spl_exon_it, objects::ENa_strand product_strand, objects::ENa_strand genomic_strand)
void TrimHolesToCodons(objects::CSeq_align &align)
void StitchSmallHoles(objects::CSeq_align &align)
#define _ASSERT
USING_SCOPE(objects)
CSeq_align::EScoreType s_ScoresToRecalculate[]
CMappedFeat GetCdsOnMrna(const objects::CSeq_id &rna_id, CScope &scope)
Modified on Thu Apr 11 15:08:34 2024 by modify_doxy.py rev. 669887