NCBI C++ ToolKit
gene_model.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gene_model.cpp 100764 2023-09-08 13:26:56Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
34 #include <corelib/ncbitime.hpp>
36 #include <objmgr/scope.hpp>
37 #include <objmgr/bioseq_handle.hpp>
38 #include <objmgr/seqdesc_ci.hpp>
39 #include <objmgr/annot_ci.hpp>
40 #include <objmgr/feat_ci.hpp>
41 #include <objmgr/align_ci.hpp>
43 #include <objmgr/util/sequence.hpp>
44 #include <objmgr/util/feature.hpp>
45 #include <objmgr/seq_vector.hpp>
46 
48 
70 #include <objects/seq/seq__.hpp>
74 #include <util/range_coll.hpp>
75 #include <util/value_convert.hpp>
77 
78 #include "feature_generator.hpp"
79 #include <serial/serial.hpp>
80 
83 USING_SCOPE(sequence);
84 
85 //////////////////////////
87  CScope& scope,
88  CSeq_annot& annot,
89  CBioseq_set& seqs,
91  TSeqPos allowed_unaligned)
92 {
93  CFeatureGenerator generator(scope);
95  generator.SetAllowedUnaligned(allowed_unaligned);
96 
97  CConstRef<CSeq_align> clean_align = generator.CleanAlignment(align_in);
98  generator.ConvertAlignToAnnot(*clean_align, annot, seqs);
99 }
100 
102  CScope& scope,
103  CSeq_annot& annot,
104  CBioseq_set& seqs,
106  TSeqPos allowed_unaligned)
107 {
108  CFeatureGenerator generator(scope);
110  generator.SetAllowedUnaligned(allowed_unaligned);
111 
112  generator.ConvertAlignToAnnot(aligns, annot, seqs);
113 }
114 
116  CScope& scope,
117  const CSeq_align* align)
118 {
119  CFeatureGenerator generator(scope);
120  generator.SetFeatureExceptions(feat, align);
121 }
122 
124  "annotated by transcript or proteomic data";
/// Free-text note saying that the model RefSeq transcript sequence was
/// modified relative to the genomic sequence to represent the inferred CDS.
/// NOTE(review): presumably used as a comment on generated RNA features;
/// the code that consumes it is not visible here - confirm.
125 const char* k_rna_comment =
126  "The sequence of the model RefSeq transcript was modified relative "
127  "to this genomic sequence to represent the inferred CDS";
/// Protein counterpart of the note above: says that the model RefSeq
/// protein sequence was modified relative to the genomic sequence.
/// NOTE(review): presumably used as a comment on generated CDS features;
/// the code that consumes it is not visible here - confirm.
128 const char* k_cds_comment =
129  "The sequence of the model RefSeq protein was modified relative "
130  "to this genomic sequence to represent the inferred CDS";
131 
133  CRef<CSeq_feat> gene_feat,
134  CRef<CSeq_feat> mrna_feat,
135  CRef<CSeq_feat> cds_feat)
136 {
137  CFeatureGenerator generator(scope);
138  generator.SetPartialFlags(gene_feat, mrna_feat, cds_feat);
139 }
140 
142  CSeq_annot& annot)
143 {
144  CFeatureGenerator generator(scope);
145  generator.RecomputePartialFlags(annot);
146 }
147 
148 
149 ///
150 /// Return the mol-info object for a given sequence
151 ///
152 static const CMolInfo* s_GetMolInfo(const CBioseq_Handle& handle)
153 {
154  if (handle) {
155  CSeqdesc_CI desc_iter(handle, CSeqdesc::e_Molinfo);
156  for ( ; desc_iter; ++desc_iter) {
157  return &desc_iter->GetMolinfo();
158  }
159  }
160 
161  return NULL;
162 }
163 
164 /////////////////////////////////////
165 
167  : m_scope(&scope)
168  , m_flags(fDefaults)
169  , m_intron_stitch_threshold_flags(fBoth)
170  , m_min_intron(kDefaultMinIntron)
171  , m_allowed_unaligned(kDefaultAllowedUnaligned)
172  , m_is_gnomon(false)
173  , m_is_best_refseq(false)
174 {
175 }
176 
178 {
179 }
180 
182  : m_impl(new SImplementation(*scope))
183 {
184 }
185 
187  : m_impl(new SImplementation(scope))
188 {
189 }
190 
192 {
193 }
194 
196 {
197  m_impl->m_flags = flags;
198 }
199 
201 {
202  return m_impl->m_flags;
203 }
204 
206 {
207  m_impl->m_intron_stitch_threshold_flags = flags;
208 }
209 
211 {
212  m_impl->m_min_intron = value;
213 }
214 
216 {
217  m_impl->m_allowed_unaligned = value;
218 }
219 
222 {
223  return m_impl->CleanAlignment(align_in);
224 }
225 
227  CSeq_annot& annot,
228  CBioseq_set& seqs,
229  Int8 gene_id,
230  const CSeq_feat* cdregion)
231 {
232  return m_impl->ConvertAlignToAnnot(align, annot, seqs, gene_id, cdregion, false);
233 }
234 
236  const list< CRef<CSeq_align> > &aligns,
237  CSeq_annot &annot,
238  CBioseq_set &seqs)
239 {
240  m_impl->ConvertAlignToAnnot(aligns, annot, seqs);
241 }
242 
244  const objects::CSeq_loc &loc,
245  objects::CSeq_annot& annot,
246  objects::CBioseq_set& seqs,
247  CCdregion::EFrame frame,
248  CRef<objects::CSeq_id> prot_id,
249  CRef<objects::CSeq_id> rna_id)
250 {
251  CBioseq_Handle bsh = m_impl->m_scope->GetBioseqHandle(*loc.GetId());
252  if (!bsh) {
254  "Can't find genomic sequence " + loc.GetId()->AsFastaString());
255  }
256 
257  TFeatureGeneratorFlags old_flags = GetFlags();
258  TFeatureGeneratorFlags flags = old_flags;
259 
260  /// Temporarily change flags to make sure the needed bioseqs are generated,
261  /// and that the input ids are used
263  SetFlags(flags);
264 
265  static CAtomicCounter counter;
266  size_t new_id_num = counter.Add(1);
267  CTime time(CTime::eCurrent);
268  if (!rna_id) {
269  string str("lcl|MRNA_");
270  str += time.AsString("YMD");
271  str += "_";
272  str += NStr::NumericToString(new_id_num);
273  rna_id.Reset(new CSeq_id(str));
274  }
275  if (!prot_id) {
276  string str("lcl|PROT_");
277  str += time.AsString("YMD");
278  str += "_";
279  str += NStr::NumericToString(new_id_num);
280  prot_id.Reset(new CSeq_id(str));
281  }
282 
283  CSeq_align fake_align;
285  fake_align.SetDim(2);
286  fake_align.SetSegs().SetSpliced().SetProduct_id().Assign(*rna_id);
287  fake_align.SetSegs().SetSpliced().SetGenomic_id().Assign(*loc.GetId());
288  fake_align.SetSegs().SetSpliced().SetProduct_strand(eNa_strand_plus);
289  fake_align.SetSegs().SetSpliced().SetGenomic_strand(loc.GetStrand());
290  fake_align.SetSegs().SetSpliced().SetProduct_type(
292 
293  TSeqPos product_pos = 0;
294  ITERATE (CSeq_loc, loc_it, loc) {
296  exon->SetProduct_start().SetNucpos(product_pos);
297  product_pos += loc_it.GetRange().GetLength();
298  exon->SetProduct_end().SetNucpos(product_pos-1);
299  exon->SetGenomic_start(loc_it.GetRange().GetFrom());
300  exon->SetGenomic_end(loc_it.GetRange().GetTo());
302  match->SetMatch(loc_it.GetRange().GetLength());
303  exon->SetParts().push_back(match);
304  fake_align.SetSegs().SetSpliced().SetExons().push_back(exon);
305  }
306  fake_align.SetSegs().SetSpliced().SetProduct_length(product_pos);
307 
308  CSeq_feat cdregion;
309  cdregion.SetData().SetCdregion().SetFrame(frame);
310  if (frame != CCdregion::eFrame_one &&
311  !loc.IsPartialStart(eExtreme_Biological))
312  {
314  "Non-standard frame specified with 5'-complete location");
315  }
316 
317  CSeq_loc cdregion_loc(*rna_id, 0, product_pos-1, eNa_strand_plus);
318  if (loc.IsPartialStart(eExtreme_Biological)) {
319  cdregion_loc.SetPartialStart(true, eExtreme_Biological);
320  }
321  if (loc.IsPartialStop(eExtreme_Biological)) {
322  cdregion_loc.SetPartialStop(true, eExtreme_Biological);
323  } else if (flags & fCreateCdregion) {
324  /// location is 3'-complete; verify we have a whole number of codons,
325  /// taking frame into account
326  switch (frame) {
328  product_pos -= 1;
329  break;
330 
332  product_pos -= 2;
333  break;
334  default:
335  break;
336  }
337 
338  if (product_pos % 3) {
340  "Non-whole number of codons with 3'-complete location");
341  }
342  }
343 
344  const COrg_ref* org = sequence::GetOrg_refOrNull(bsh);
345  if (org) {
347  code->SetId(fg::GetGeneticCode(bsh));
348  cdregion.SetData().SetCdregion().SetCode().Set().push_back(code);
349  }
350 
351  cdregion.SetLocation().Assign(cdregion_loc);
352  cdregion.SetProduct().SetWhole(*prot_id);
353 
354  m_impl->ConvertAlignToAnnot(fake_align, annot, seqs, 0, &cdregion, false);
355 
356  /// Restore old flags
357  SetFlags(old_flags);
358 }
359 
361  const CSeq_align* align)
362 {
363  m_impl->SetFeatureExceptions(feat, align);
364 }
365 
366 
368  CRef<CSeq_feat> mrna_feat,
369  CRef<CSeq_feat> cds_feat)
370 {
371  m_impl->SetPartialFlags(gene_feat, mrna_feat, cds_feat);
372 }
373 
375 {
376  m_impl->RecomputePartialFlags(annot);
377 }
378 
379 
381  TSeqPos allowed_unaligned,
383  : m_aln(aln), m_scope(scope), m_genomic_row(-1)
384  , m_allowed_unaligned(allowed_unaligned), m_opts(opts)
385 {
386  if(aln.GetSegs().IsSpliced()) {
387  //row 1 is always genomic in spliced-segs
388  m_genomic_row = 1;
389  } else {
390  //otherwise, find exactly one genomic row
391  CSeq_align::TDim num_rows = aln.CheckNumRows();
392  if (num_rows != 2) {
393  /// make sure we only have two rows. anything else
394  /// represents a mixed-strand case or more than two
395  /// sequences
397  "CreateGeneModelFromAlign(): "
398  "failed to create consistent alignment");
399  }
400  for (CSeq_align::TDim i = 0; i < num_rows; ++i) {
401  const CSeq_id& id = aln.GetSeq_id(i);
402  CBioseq_Handle handle = scope.GetBioseqHandle(id);
403  if(!handle) {
404  continue;
405  }
406  const CMolInfo* info = sequence::GetMolInfo(handle);
407  if (info && info->IsSetBiomol()
408  && info->GetBiomol() == CMolInfo::eBiomol_genomic)
409  {
410  if(m_genomic_row < 0) {
411  m_genomic_row = i;
412  } else {
414  "CreateGeneModelFromAlign(): "
415  "More than one genomic row in alignment");
416  }
417  }
418  }
419  if (m_genomic_row < 0) {
421  "CreateGeneModelFromAlign(): "
422  "No genomic sequence found in alignment");
423  }
424  }
425 }
426 
428 {
429  if(rna_loc.IsNull()) {
430  if(m_aln.GetSegs().IsSpliced()) {
431  rna_loc = x_GetLocFromSplicedExons(m_aln);
432  } else {
433  const CSeq_id& id = m_aln.GetSeq_id(GetRnaRow());
434  CBioseq_Handle handle = m_scope.GetBioseqHandle(id);
435  CRef<CSeq_loc> range_loc =
436  handle.GetRangeSeq_loc(0, 0, eNa_strand_plus); //0-0 meanns whole range
437  //todo: truncate the range loc not to include polyA, or
438  //else the remapped loc will be erroneously partial
439  //not a huge issue as it only applies to seg alignments only.
440  rna_loc = x_Mapper()->Map(*range_loc);
441  }
442  }
443  return *rna_loc;
444 }
445 
447 {
448  return m_genomic_row;
449 }
450 
452 {
453  //we checked that alignment contains exactly 2 rows
454  return GetGenomicRow() == 0 ? 1 : 0;
455 }
456 
458 {
459  CRef<CSeq_loc> mapped_loc = x_Mapper()->Map(loc);
460  return mapped_loc;
461 }
462 
464 {
465  x_Mapper()->IncludeSourceLocs(b);
466 }
467 
469 {
470  x_Mapper()->SetMergeNone();
471 }
472 
474 {
475  CRef<CSeq_loc> loc(new CSeq_loc);
476  CConstRef<CSpliced_exon> prev_exon;
477  CRef<CSeq_interval> prev_int;
478 
479  const CSpliced_seg& spliced_seg = aln.GetSegs().GetSpliced();
480  TSeqPos genomic_size = m_scope.GetSequenceLength(spliced_seg.GetGenomic_id());
481  ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
482  const CSpliced_exon& exon = **it;
483  CRef<CSeq_interval> genomic_int(new CSeq_interval);
484 
485  genomic_int->SetId().Assign(aln.GetSeq_id(1));
486  genomic_int->SetFrom(exon.GetGenomic_start());
487  genomic_int->SetTo(exon.GetGenomic_end());
488  genomic_int->SetStrand(
489  exon.IsSetGenomic_strand() ? exon.GetGenomic_strand()
490  : spliced_seg.IsSetGenomic_strand() ? spliced_seg.GetGenomic_strand()
491  : eNa_strand_plus);
492 
493  // check for gaps between exons
494  if(!prev_exon.IsNull() &&
495  !(prev_exon->GetProduct_end().GetNucpos() + 1 == exon.GetProduct_start().GetNucpos() &&
496  ((genomic_int->GetStrand()!=eNa_strand_minus && prev_exon->GetGenomic_end()==genomic_size-1 && exon.GetGenomic_start()==0) ||
497  (genomic_int->GetStrand()==eNa_strand_minus && exon.GetGenomic_end()==genomic_size-1 && prev_exon->GetGenomic_start()==0))
498  )) {
499 
500  bool donor_set = prev_exon->IsSetDonor_after_exon();
501  bool acceptor_set = exon.IsSetAcceptor_before_exon();
502 
503  if(!(donor_set && acceptor_set) || prev_exon->GetProduct_end().GetNucpos() + 1 != exon.GetProduct_start().GetNucpos()) {
504  // gap between exons on rna. But which exon is partial?
505  // if have non-strict consensus splice site - blame it
506  // for partialness. If can't disambiguate on this - set
507  // both.
508  bool donor_ok =
509  (donor_set &&
510  prev_exon->GetDonor_after_exon().GetBases() == "GT");
511  bool acceptor_ok =
512  (acceptor_set &&
513  exon.GetAcceptor_before_exon().GetBases() == "AG");
514  if(donor_ok || !acceptor_ok) {
515  genomic_int->SetPartialStart(true, eExtreme_Biological);
516  }
517  if(acceptor_ok || !donor_ok) {
518  prev_int->SetPartialStop(true, eExtreme_Biological);
519  }
520  }
521  }
522 
523  loc->SetPacked_int().Set().push_back(genomic_int);
524 
525  prev_exon = *it;
526  prev_int = genomic_int;
527  }
528 
529  // set terminal partialness
530  if(m_aln.GetSeqStart(0) > m_allowed_unaligned) {
532  }
533 
534  TSeqPos product_len = aln.GetSegs().GetSpliced().GetProduct_length();
535  TSeqPos polya_pos = aln.GetSegs().GetSpliced().CanGetPoly_a() ? aln.GetSegs().GetSpliced().GetPoly_a() : product_len;
536 
537  if(m_aln.GetSeqStop(0) + 1 + m_allowed_unaligned < polya_pos) {
539  }
540  return loc;
541 
542 }
543 
545 {
546  if (!m_mapper) {
547  m_mapper.Reset
548  (new CSeq_loc_Mapper(m_aln, m_aln.GetSeq_id(m_genomic_row),
549  &m_scope, m_opts));
550  }
551  return m_mapper;
552 }
553 
557 {
558  if (!align_in.CanGetSegs() || !align_in.GetSegs().IsSpliced())
559  return CConstRef<CSeq_align>(&align_in);
560 
561  CRef<CSeq_align> align(new CSeq_align);
562  align->Assign(align_in);
563 
564  vector<SExon> orig_exons = GetExons(*align);
565 
566  StitchSmallHoles(*align);
567  TrimHolesToCodons(*align);
568 
570  MaximizeTranslation(*align);
571  }
572 
573  if (GetExons(*align) != orig_exons) {
575  ClearScores(*align);
576  } else {
577  RecalculateScores(*align);
578  }
579  }
580 
581  return align;
582 }
583 
585 {
586  TSeqPos nucpos = pos.AsSeqPos();
587  pos.SetNucpos(nucpos);
588 }
589 
590 string ExtractGnomonModelNum(const CSeq_id& seq_id)
591 {
592  string model_num;
593  if (seq_id.IsGeneral() && seq_id.GetGeneral().CanGetDb() &&
594  NStr::EqualNocase(seq_id.GetGeneral().GetDb(), "GNOMON")) {
595  model_num = seq_id.GetGeneral().GetTag().GetStr();
596  model_num.erase(model_num.size()-2, 2);
597  }
598  return model_num;
599 }
600 
/// Test an alignment's product type.
///
/// Returns false unless the alignment has segments and they are of the
/// spliced kind; in that case the result depends on the spliced segment's
/// product type.
/// NOTE(review): presumably the product type is compared against the
/// protein product type, as the function name suggests - confirm.
601 bool IsProteinAlign(const CSeq_align& align)
602 {
603  return align.CanGetSegs() && align.GetSegs().IsSpliced()
604  && align.GetSegs().GetSpliced().GetProduct_type()
606 }
607 
610 {
611  /// This is a protein alignment; transform it into a fake transcript alignment
612  /// so the rest of the processing can go on
613  bool found_start_codon = false;
614  bool found_stop_codon = false;
616  align->GetSegs().GetSpliced().GetModifiers()) {
617  if ((*mod_it)->IsStart_codon_found()) {
618  found_start_codon = (*mod_it)->GetStart_codon_found();
619  }
620  if ((*mod_it)->IsStop_codon_found()) {
621  found_stop_codon = (*mod_it)->GetStop_codon_found();
622  }
623  }
624 
625 
626  CBioseq_Handle bsh = m_scope->GetBioseqHandle(align->GetSeq_id(1));
627  if (!bsh) {
629  "Can't find genomic sequence " +
630  align->GetSeq_id(1).AsFastaString());
631  }
632 
633  CSeq_align *fake_transcript_align = new CSeq_align;
634  fake_transcript_align->Assign(*align);
635  align.Reset(fake_transcript_align);
636 
637  CRef<CSeq_id> prot_id(new CSeq_id);
638  prot_id->Assign(fake_transcript_align->GetSeq_id(0));
639 
640  {
641  /// for the mRna we have to
642  /// create a local id, since the id we have in the alignment is a
643  /// protein id
644  static CAtomicCounter counter;
645  size_t new_id_num = counter.Add(1);
646  CTime time(CTime::eCurrent);
647  string str("lcl|MRNA_");
648  if ((m_flags & fGenerateStableLocalIds) == 0) {
649  str += time.AsString("YMD");
650  str += "_";
651  }
652  str += NStr::NumericToString(new_id_num);
653  CRef<CSeq_id> fake_rna_id(new CSeq_id(str));
654  fake_transcript_align->SetSegs().SetSpliced().SetProduct_id(
655  *fake_rna_id);
656  }
657  fake_transcript_align->SetSegs().SetSpliced().SetProduct_type(
660  fake_transcript_align->SetSegs().SetSpliced().SetExons())
661  {
662  s_TransformToNucpos((*exon_it)->SetProduct_start());
663  s_TransformToNucpos((*exon_it)->SetProduct_end());
664  }
665 
666  CRef<CSpliced_exon> last_exon =
667  fake_transcript_align->SetSegs().SetSpliced().SetExons().back();
668  bool aligned_to_the_end =
669  last_exon->GetProduct_end().GetNucpos()+1==
670  fake_transcript_align->GetSegs().GetSpliced().GetProduct_length()*3;
671 
672  fake_transcript_align->SetSegs().SetSpliced().SetProduct_length() =
673  fake_transcript_align->GetSegs().GetSpliced().GetProduct_length()*3 +
674  (((found_stop_codon && aligned_to_the_end) || !aligned_to_the_end)?3:0);
675 
676  if (found_stop_codon && aligned_to_the_end) {
677  bool is_minus = last_exon->IsSetGenomic_strand() ?
678  last_exon->GetGenomic_strand() == eNa_strand_minus :
679  (fake_transcript_align->GetSegs().GetSpliced()
680  . IsSetGenomic_strand() &&
681  fake_transcript_align->GetSegs().GetSpliced()
682  . GetGenomic_strand() == eNa_strand_minus);
683 
684  TSeqPos genomic_length = bsh.GetBioseqLength();
685  TSeqPos space_for_codon = min(3u, is_minus
686  ? last_exon->GetGenomic_start()
687  : genomic_length - last_exon->GetGenomic_end() - 1);
688  if (space_for_codon < 3) {
691  "Stop codon goes outside genomic sequence");
692  }
693  CRef<CSpliced_exon> new_exon(new CSpliced_exon);
694  new_exon->SetProduct_start().SetNucpos(
695  last_exon->GetProduct_end().GetNucpos() + space_for_codon + 1);
696  new_exon->SetProduct_end().SetNucpos(
697  last_exon->GetProduct_end().GetNucpos() + 3);
698  new_exon->SetGenomic_start(
699  is_minus ? genomic_length - 3 + space_for_codon : 0);
700  new_exon->SetGenomic_end(
701  is_minus ? genomic_length - 1 : 2 - space_for_codon);
702  if (last_exon->IsSetProduct_strand()) {
703  new_exon->SetProduct_strand(last_exon->GetProduct_strand());
704  }
705  if (last_exon->IsSetGenomic_strand()) {
706  new_exon->SetGenomic_strand(last_exon->GetGenomic_strand());
707  }
708  fake_transcript_align->SetSegs().SetSpliced().SetExons()
709  . push_back(new_exon);
710  }
711 
712  /// Extend last exon to include whatever part of stop codon fits
713  last_exon->SetProduct_end().SetNucpos() += space_for_codon;
714  if (is_minus) {
715  last_exon->SetGenomic_start() -= space_for_codon;
716  } else {
717  last_exon->SetGenomic_end() += space_for_codon;
718  }
719  if (last_exon->IsSetParts() && space_for_codon) {
720  CRef<CSpliced_exon_chunk> match_stop_codon
721  (new CSpliced_exon_chunk);
722  match_stop_codon->SetMatch(space_for_codon);
723  last_exon->SetParts().push_back(match_stop_codon);
724  }
725  }
726 
727  cd_feat.Reset(new CSeq_feat);
728  cd_feat->SetData().SetCdregion();
729 
730  CRef<CSeq_loc> cds_on_fake_mrna_loc(new CSeq_loc(
731  fake_transcript_align->SetSegs().SetSpliced().SetProduct_id(),
732  0, fake_transcript_align->GetSegs().GetSpliced().GetProduct_length()-1));
733  if (!found_start_codon &&
734  fake_transcript_align->SetSegs().SetSpliced().SetExons().front()->GetProduct_start().GetNucpos()==0) {
735  cds_on_fake_mrna_loc->SetPartialStart(true, eExtreme_Biological);
736  }
737  if (!found_stop_codon && aligned_to_the_end) {
738  cds_on_fake_mrna_loc->SetPartialStop(true, eExtreme_Biological);
739  }
740  cd_feat->SetLocation(*cds_on_fake_mrna_loc);
741 
742  const COrg_ref *org = sequence::GetOrg_refOrNull(bsh);
743  if (org) {
745  code->SetId(fg::GetGeneticCode(bsh));
746  cd_feat->SetData().SetCdregion().SetCode().Set().push_back(code);
747  }
748 
749  cd_feat->SetProduct().SetWhole(*prot_id);
750 
751 }
752 
753 void RenameGeneratedBioseqs(const CSeq_id& query_rna_id, CSeq_id& transcribed_rna_id,
754  CRef<CSeq_feat> cds_feat_on_query_mrna,
755  CRef<CSeq_feat> cds_feat_on_genome_with_translated_product)
756 {
757  transcribed_rna_id.Assign(query_rna_id);
758  if (cds_feat_on_genome_with_translated_product &&
759  cds_feat_on_genome_with_translated_product->CanGetProduct() &&
760  cds_feat_on_query_mrna &&
761  cds_feat_on_query_mrna->CanGetProduct()) {
762  CSeq_id* translated_protein_id = const_cast<CSeq_id*>(cds_feat_on_genome_with_translated_product->SetProduct().GetId());
763  translated_protein_id->Assign(*cds_feat_on_query_mrna->GetProduct().GetId());
764  }
765 }
766 
770  CSeq_annot& annot,
771  CBioseq_set& seqs,
772  Int8 gene_id,
773  const CSeq_feat* cds_feat_on_query_mrna_ptr,
774  bool call_on_align_list)
775 {
776  if (HasMixedGenomicIds(input_align)) {
777  return ConvertMixedAlignToAnnot(input_align, annot, seqs, gene_id, cds_feat_on_query_mrna_ptr,
778  call_on_align_list);
779  }
780 
781  CConstRef<CSeq_align> align(&input_align);
782  CRef<CSeq_feat> cds_feat_on_query_mrna;
783  bool is_protein_align = IsProteinAlign(*align);
784  if (is_protein_align) {
785  TransformProteinAlignToTranscript(align, cds_feat_on_query_mrna);
786  }
787 
789  if (m_flags & fDensegAsExon) {
791  }
792 
793  SMapper mapper(*align, *m_scope, m_allowed_unaligned, opts);
794 
795  const CSeq_id& query_rna_id = align->GetSeq_id(mapper.GetRnaRow());
796 
797  if (!ExtractGnomonModelNum(query_rna_id).empty()) {
798  m_is_gnomon = true;
799  } else {
800  CSeq_id_Handle best_id;
801  if (!(m_flags & fDeNovoProducts)) {
802  best_id = sequence::GetId(query_rna_id, *m_scope,
804  }
805  CSeq_id::EAccessionInfo rna_acc_info =
806  best_id ? best_id.IdentifyAccession() : query_rna_id.IdentifyAccession();
807  m_is_best_refseq = rna_acc_info == CSeq_id::eAcc_refseq_mrna ||
808  rna_acc_info == CSeq_id::eAcc_refseq_ncrna;
809  }
810 
811 
812  if (cds_feat_on_query_mrna_ptr) {
813  cds_feat_on_query_mrna.Reset(new CSeq_feat);
814  cds_feat_on_query_mrna->Assign(*cds_feat_on_query_mrna_ptr);
815  } else if (!is_protein_align && !(m_flags & fDeNovoProducts)) {
816  CMappedFeat cdregion_handle = GetCdsOnMrna(query_rna_id, *m_scope);
817  if (cdregion_handle) {
818  cds_feat_on_query_mrna.Reset(new CSeq_feat);
819  cds_feat_on_query_mrna->Assign(cdregion_handle.GetMappedFeature());
820  }
821  }
822 
823  CMappedFeat full_length_rna;
824  vector<CMappedFeat> ncRNAs;
825 
826  CBioseq_Handle query_rna_handle = m_scope->GetBioseqHandle(query_rna_id);
827  if (query_rna_handle) {
828  for (CFeat_CI feat_iter(query_rna_handle, CSeqFeatData::e_Rna);
829  feat_iter; ++feat_iter) {
830  const CSeq_loc &rna_loc = feat_iter->GetLocation();
831  if (feat_iter->GetData().GetSubtype() !=
833  ++rna_loc.begin() == rna_loc.end() &&
834  rna_loc.GetTotalRange().GetLength() ==
835  query_rna_handle.GetBioseqLength())
836  {
837  full_length_rna = *feat_iter;
838  } else if (feat_iter->GetData().GetSubtype() ==
840  {
841  ncRNAs.push_back(*feat_iter);
842  }
843  }
844  }
845 
846  CTime time(CTime::eCurrent);
847  static CAtomicCounter counter;
848  size_t model_num = counter.Add(1);
849 
850  /// we always need the mRNA location as a reference
851  CRef<CSeq_loc> rna_feat_loc_on_genome(new CSeq_loc);
852  rna_feat_loc_on_genome->Assign(mapper.GetRnaLoc());
853 
854  CRef<CSeq_feat> cds_feat_on_transcribed_mrna;
855  list<CRef<CSeq_loc> > transcribed_mrna_seqloc_refs;
856 
857  /// create a new bioseq for this mRNA; if the mRNA sequence is not found,
858  /// this is needed in order to translate the protein
859  /// alignment, even if flag fForceTranscribeMrna wasn't set
860  CRef<CSeq_id> transcribed_rna_id =
861  x_CreateMrnaBioseq(*align, rna_feat_loc_on_genome, time,
862  model_num, seqs,
863  cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna);
864 
865  CRef<CSeq_feat> mrna_feat_on_genome_with_translated_product =
866  full_length_rna && (m_flags&fPropagateNcrnaFeats)
867  /// If there is a full-length RNA feature, propagate it instead of
868  /// creating a new one. Create the bioseq separately
869  ? x_CreateNcRnaFeature(&full_length_rna.GetOriginalFeature(),
870  *align, rna_feat_loc_on_genome, opts)
871  : x_CreateMrnaFeature(rna_feat_loc_on_genome, query_rna_id,
872  *transcribed_rna_id, cds_feat_on_query_mrna);
873  if (mrna_feat_on_genome_with_translated_product &&
874  !mrna_feat_on_genome_with_translated_product->IsSetProduct()) {
875  /// Propagated full-length feature; add product
876  mrna_feat_on_genome_with_translated_product->
877  SetProduct().SetWhole().Assign(*transcribed_rna_id);
878  }
879 
880  CRef<CSeq_feat> cds_feat_on_genome_with_translated_product =
881  x_CreateCdsFeature(cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna,
882  transcribed_mrna_seqloc_refs,
883  *align, rna_feat_loc_on_genome, time, model_num, seqs, opts);
884 
885  const CSeq_id& genomic_id = align->GetSeq_id(mapper.GetGenomicRow());
886  if (m_is_best_refseq && mrna_feat_on_genome_with_translated_product) {
887  CSeq_id_Handle genomic_acc = sequence::GetId(genomic_id, *m_scope,
889  if (genomic_acc) {
890  x_AddSelectMarkup(*align, query_rna_handle, *genomic_acc.GetSeqId(),
891  *mrna_feat_on_genome_with_translated_product,
892  cds_feat_on_genome_with_translated_product.GetPointer());
893  }
894  }
895 
896  CRef<CSeq_feat> gene_feat;
897 
898  if(!call_on_align_list){
899  if (gene_id) {
900  TGeneMap::iterator gene = genes.find(gene_id);
901  if (gene == genes.end()) {
902  x_CreateGeneFeature(gene_feat, query_rna_handle, mapper,
903  rna_feat_loc_on_genome, genomic_id, gene_id);
904  if (gene_feat) {
905  _ASSERT(gene_feat->GetData().Which() !=
907  annot.SetData().SetFtable().push_back(gene_feat);
908  }
909  gene = genes.insert(make_pair(gene_id,gene_feat)).first;
910  } else {
911  gene_feat = gene->second;
912  gene_feat->SetLocation(*MergeSeq_locs(&gene_feat->GetLocation(),
913  &mrna_feat_on_genome_with_translated_product->GetLocation()));
914  }
915 
916  CRef< CSeqFeatXref > genexref( new CSeqFeatXref() );
917  genexref->SetId(*gene_feat->SetIds().front());
918 
919  CRef< CSeqFeatXref > mrnaxref( new CSeqFeatXref() );
920  mrnaxref->SetId(*mrna_feat_on_genome_with_translated_product->SetIds().front());
921 
922  gene_feat->SetXref().push_back(mrnaxref);
923  mrna_feat_on_genome_with_translated_product->SetXref().push_back(genexref);
924 
925  } else {
926  x_CreateGeneFeature(gene_feat, query_rna_handle, mapper,
927  rna_feat_loc_on_genome, genomic_id);
928  if (gene_feat) {
929  _ASSERT(gene_feat->GetData().Which() != CSeqFeatData::e_not_set);
930  annot.SetData().SetFtable().push_back(gene_feat);
931  }
932  }
933  }
934 
935  if (mrna_feat_on_genome_with_translated_product) {
936  _ASSERT(mrna_feat_on_genome_with_translated_product->GetData().Which() != CSeqFeatData::e_not_set);
937 
938  annot.SetData().SetFtable().push_back(mrna_feat_on_genome_with_translated_product); // NOTE: added after gene!
939  }
940 
941  CSeq_annot::C_Data::TFtable propagated_features;
942 
943  if(cds_feat_on_genome_with_translated_product.NotNull()) {
944  propagated_features.push_back(cds_feat_on_genome_with_translated_product);
945 
946  if (cds_feat_on_query_mrna && cds_feat_on_query_mrna->CanGetProduct()) {
947  CBioseq_Handle prot_handle =
948  m_scope->GetBioseqHandle(*cds_feat_on_query_mrna->GetProduct().GetId());
949  if (prot_handle) {
950  for (CFeat_CI feat_iter(prot_handle,
952  feat_iter; ++feat_iter) {
953  const CProt_ref &prot_ref =
954  feat_iter->GetData().GetProt();
955  if (prot_ref.IsSetName() &&
956  !prot_ref.GetName().empty()) {
957  CRef< CSeqFeatXref > prot_xref(
958  new CSeqFeatXref());
959  prot_xref->SetData().SetProt().SetName()
960  . push_back(prot_ref.GetName().front());
961  cds_feat_on_genome_with_translated_product->SetXref().push_back(prot_xref);
962  break;
963  }
964  }
965  }
966  }
967  }
968 
969  ITERATE(vector<CMappedFeat>, it, ncRNAs){
970  CRef<CSeq_feat> ncrna_feat =
971  x_CreateNcRnaFeature(&it->GetOriginalFeature(), *align, rna_feat_loc_on_genome, opts);
972  if(ncrna_feat)
973  propagated_features.push_back(ncrna_feat);
974  }
975 
976  NON_CONST_ITERATE(CSeq_annot::C_Data::TFtable, it, propagated_features){
977  _ASSERT((*it)->GetData().Which() != CSeqFeatData::e_not_set);
978  annot.SetData().SetFtable().push_back(*it);
979 
980  if (m_is_gnomon) { // create xrefs for gnomon models
981  CRef< CSeqFeatXref > propagatedxref( new CSeqFeatXref() );
982  if ((*it)->IsSetIds()) {
983  propagatedxref->SetId(*(*it)->SetIds().front());
984  }
985 
986  CRef< CSeqFeatXref > mrnaxref( new CSeqFeatXref() );
987  mrnaxref->SetId(*mrna_feat_on_genome_with_translated_product->SetIds().front());
988 
989  (*it)->SetXref().push_back(mrnaxref);
990  mrna_feat_on_genome_with_translated_product->SetXref().push_back(propagatedxref);
991  }
992  }
993 
994  if(!call_on_align_list){
995  if(propagated_features.empty()){
996  SetPartialFlags(gene_feat, mrna_feat_on_genome_with_translated_product, CRef<CSeq_feat>());
997  }
998  ITERATE(CSeq_annot::C_Data::TFtable, it, propagated_features){
999  x_CheckInconsistentDbxrefs(gene_feat, *it);
1000  SetPartialFlags(gene_feat, mrna_feat_on_genome_with_translated_product, *it);
1001  }
1002  x_CopyAdditionalFeatures(query_rna_handle, mapper, annot);
1003  }
1004 
1005  if (!(m_flags & fGenerateLocalIds)) {
1006  if (mrna_feat_on_genome_with_translated_product) {
1007  mrna_feat_on_genome_with_translated_product->SetProduct().SetWhole().Assign(query_rna_id);
1008  }
1009  if (cds_feat_on_genome_with_translated_product) {
1010  if (cds_feat_on_query_mrna->CanGetProduct()) {
1011  cds_feat_on_genome_with_translated_product->
1012  SetProduct().Assign(cds_feat_on_query_mrna->GetProduct());
1013  cds_feat_on_transcribed_mrna->
1014  SetProduct().Assign(cds_feat_on_query_mrna->GetProduct());
1015  }
1016  CRef<CSeq_id> seq_id(new CSeq_id);
1017  seq_id->Assign(query_rna_id);
1018  cds_feat_on_transcribed_mrna->SetLocation().SetId(*seq_id);
1019  NON_CONST_ITERATE (list<CRef<CSeq_loc> >, loc, transcribed_mrna_seqloc_refs) {
1020  (*loc)->SetId(*seq_id);
1021  }
1022  }
1023 
1024  // rename generated bioseqs if query bioseqs do not exist
1025  if (!query_rna_handle) {
1026  RenameGeneratedBioseqs(query_rna_id, *transcribed_rna_id,
1027  cds_feat_on_query_mrna, cds_feat_on_genome_with_translated_product);
1028  }
1029  }
1030 
1031  if (mrna_feat_on_genome_with_translated_product) {
1032  CBioseq_Handle rna_handle =
1033  m_scope->GetBioseqHandle(query_rna_id);
1034  CSeq_entry_Handle rna_seh;
1035  if (!rna_handle) {
1036  rna_seh = m_scope->AddTopLevelSeqEntry(*seqs.SetSeq_set().front());
1037  }
1038 
1039  SetFeatureExceptions(*mrna_feat_on_genome_with_translated_product, align,
1040  cds_feat_on_genome_with_translated_product.GetPointer(),
1041  cds_feat_on_query_mrna.GetPointer(),
1042  cds_feat_on_transcribed_mrna.GetPointer());
1043 
1044  if (rna_seh) {
1045  m_scope->RemoveTopLevelSeqEntry(rna_seh);
1046  }
1047  }
1048  if (cds_feat_on_genome_with_translated_product) {
1049  CBioseq_Handle prot_handle =
1050  m_scope->GetBioseqHandle(*cds_feat_on_genome_with_translated_product->GetProduct().GetId());
1051  CSeq_entry_Handle prot_seh;
1052  if (!prot_handle) {
1053  prot_seh = m_scope->AddTopLevelSeqEntry(*seqs.SetSeq_set().back());
1054  }
1055 
1056  TSeqPos clean_match_count = 0;
1057  SetFeatureExceptions(*cds_feat_on_genome_with_translated_product, align, NULL,
1058  cds_feat_on_query_mrna.GetPointer(),
1059  cds_feat_on_transcribed_mrna.GetPointer(),
1060  &transcribed_mrna_seqloc_refs,
1061  &clean_match_count);
1062  if (!clean_match_count) {
1063  /// Not even one base matched cleanly; remove feature
1064  annot.SetData().SetFtable().remove(cds_feat_on_genome_with_translated_product);
1065  cds_feat_on_genome_with_translated_product = NULL;
1066  }
1067  if (prot_seh) {
1068  m_scope->RemoveTopLevelSeqEntry(prot_seh);
1069  }
1070  }
1071 
1072  if (!(m_flags & fGenerateLocalIds)) {
1073  RenameGeneratedBioseqs(query_rna_id, *transcribed_rna_id, cds_feat_on_query_mrna, cds_feat_on_genome_with_translated_product);
1074  }
1075  if (m_is_gnomon) {
1076  // add generated bioseqs to the scope
1078  m_scope->AddTopLevelSeqEntry(**it);
1079  }
1080  }
1081 
1082  if (!(m_flags & fForceTranscribeMrna) ||
1084  {
1085  /// We created Bioseqs the user didn't ask for,
1086  /// so we need to now remove them
1087  for (CBioseq_set::TSeq_set::iterator bioseq_it =
1088  seqs.SetSeq_set().begin();
1089  bioseq_it != seqs.SetSeq_set().end(); )
1090  {
1091  if (((*bioseq_it)->GetSeq().IsNa() &&
1092  !(m_flags & fForceTranscribeMrna)) ||
1093  ((*bioseq_it)->GetSeq().IsAa() &&
1095  {
1096  bioseq_it = seqs.SetSeq_set().erase(bioseq_it);
1097  } else {
1098  ++bioseq_it;
1099  }
1100  }
1101  }
1102 
1103  //collapse one interval packed-ints
1104  for ( CTypeIterator<CSeq_loc> loc(annot); loc; ++loc) {
1105  if (loc->IsPacked_int() && loc->GetPacked_int().Get().size()==1) {
1106  CRef<CSeq_interval> interval = loc->SetPacked_int().Set().front();
1107  loc->SetInt(*interval);
1108  }
1109  }
1110  return is_protein_align ? cds_feat_on_genome_with_translated_product : mrna_feat_on_genome_with_translated_product;
1111 }
1112 
/// Convert a list of alignments into features in one call.
///
/// Each alignment is cleaned and passed to the single-alignment
/// ConvertAlignToAnnot(), which writes into a scratch annot (gene_annot).
/// A single gene feature is built up across all alignments and, at the end,
/// the scratch feature table is spliced onto the end of annot's table.
///
/// @param aligns  alignments to convert; all must share one genomic id
/// @param annot   receives the generated features (appended)
/// @param seqs    forwarded to the single-alignment ConvertAlignToAnnot()
///
/// NOTE(review): several original lines are absent from this listing (the
/// qualified function name, the declaration of `opts`, the body of the
/// fDensegAsExon branch, and the throw statement that owns the error string
/// below) -- confirm against the full file before editing.
1113 void
1116  const list< CRef<CSeq_align> > &aligns,
1117  CSeq_annot &annot,
1118  CBioseq_set &seqs)
1119 {
1121  if (m_flags & fDensegAsExon) {
1123  }
1124 
1125  CRef<CSeq_feat> gene_feat;
1126  CSeq_annot gene_annot;
1127  CSeq_id_Handle gene_handle;
1128  ITERATE(list< CRef<CSeq_align> >, align_it, aligns){
1129  CConstRef<CSeq_align> clean_align = CleanAlignment(**align_it);
1130  CRef<CSeq_feat> mrna_feat = ConvertAlignToAnnot(*clean_align, gene_annot, seqs, 0, NULL, true);
1131 
1132  SMapper mapper(*clean_align, *m_scope, m_allowed_unaligned, opts);
1133  const CSeq_id& genomic_id = clean_align->GetSeq_id(mapper.GetGenomicRow());
1134  const CSeq_id& rna_id = clean_align->GetSeq_id(mapper.GetRnaRow());
// The first alignment fixes the genomic id; any later alignment on a
// different genomic id is reported with the message below.
1135  if(!gene_handle)
1136  gene_handle = CSeq_id_Handle::GetHandle(genomic_id);
1137  else if(!(gene_handle == genomic_id))
1139  "Bad list of alignments to ConvertAlignToAnnot(); alignments on different genes");
1140 
1141  CRef<CSeq_loc> loc(new CSeq_loc);
1142  loc->Assign(mapper.GetRnaLoc());
1143 
// gene_feat is passed by reference so the same gene feature is created on
// the first iteration and updated on later ones.
1144  CBioseq_Handle handle = m_scope->GetBioseqHandle(rna_id);
1145  x_CreateGeneFeature(gene_feat, handle, mapper, loc, genomic_id);
1146 
1147  x_CopyAdditionalFeatures(handle, mapper, gene_annot);
1148  }
1149  NON_CONST_ITERATE(CSeq_annot::C_Data::TFtable, feat_it, gene_annot.SetData().SetFtable())
1150  {
1151  x_CheckInconsistentDbxrefs(gene_feat, *feat_it);
1152  }
// NOTE(review): gene_feat is pushed unconditionally; x_CreateGeneFeature()
// leaves it unset when fCreateGene is off, so a null reference may be stored
// here -- confirm callers always request gene creation.
1153  gene_annot.SetData().SetFtable().push_front(gene_feat);
1154  RecomputePartialFlags(gene_annot);
// Move (not copy) the scratch features to the end of the caller's table.
1155  annot.SetData().SetFtable().splice(annot.SetData().SetFtable().end(),
1156  gene_annot.SetData().SetFtable());
1157 }
1158 
1159 bool IsContinuous(const CSeq_loc& loc)
1160 {
1161  ITERATE (CSeq_loc, loc_it, loc) {
1162  if ((loc_it.GetRange().GetFrom() != loc.GetStart(eExtreme_Positional) && loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Positional)) ||
1163  (loc_it.GetRange().GetTo() != loc.GetStop(eExtreme_Positional) && loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Positional))) {
1164  return false;
1165  }
1166  }
1167  return true;
1168 }
1169 
/// Append the residues in seq to the sequence held by inst.
///
/// When inst has an Ext (delta) representation, the function tries to extend
/// the last delta element if it is a literal carrying sequence data;
/// otherwise it adds a new literal. When inst has no Ext, seq is appended to
/// the raw IUPACna data of inst.
///
/// @param inst       sequence instance to extend
/// @param seq        residues to append
/// @param mol_class  molecule class passed to CDelta_ext::AddLiteral when a
///                   new literal has to be created
///
/// NOTE(review): the three conversion calls for the Ncbi2na/Ncbi4na/Ncbi8na
/// cases are absent from this listing (only their trailing argument lines
/// remain) -- confirm against the full file before editing.
1170 void AddLiteral(CSeq_inst& inst, const string& seq, CSeq_inst::EMol mol_class)
1171 {
1172  if (inst.IsSetExt()) {
1173  if (!inst.SetExt().SetDelta().Set().empty()) {
1174  CDelta_seq& delta_seq = *inst.SetExt().SetDelta().Set().back();
1175  if (delta_seq.IsLiteral() && delta_seq.GetLiteral().IsSetSeq_data()) {
// Nucleotide encodings are brought to IUPACna text in `iupacna` so that seq
// can be concatenated; the protein and default cases return directly.
1176  string iupacna;
1177  switch(delta_seq.GetLiteral().GetSeq_data().Which()) {
1178  case CSeq_data::e_Iupacna:
1179  iupacna = delta_seq.GetLiteral().GetSeq_data().GetIupacna();
1180  break;
1181  case CSeq_data::e_Ncbi2na:
1183  0, delta_seq.GetLiteral().GetLength(), iupacna, CSeqUtil::e_Iupacna);
1184  break;
1185  case CSeq_data::e_Ncbi4na:
1187  0, delta_seq.GetLiteral().GetLength(), iupacna, CSeqUtil::e_Iupacna);
1188  break;
1189  case CSeq_data::e_Ncbi8na:
1191  0, delta_seq.GetLiteral().GetLength(), iupacna, CSeqUtil::e_Iupacna);
1192  break;
// Protein literal: append in place and grow the recorded length.
1193  case CSeq_data::e_Iupacaa:
1194  delta_seq.SetLiteral().SetSeq_data().SetIupacaa().Set() += seq;
1195  delta_seq.SetLiteral().SetLength() += seq.size();
1196  return;
// Any other encoding: do not merge, start a new literal instead.
1197  default:
1198  inst.SetExt().SetDelta().AddLiteral(seq, mol_class);
1199  return;
1200  }
// Store the concatenated nucleotide text back into the literal, update its
// length, then let CSeqportUtil::Pack re-encode the data.
1201  iupacna += seq;
1202  delta_seq.SetLiteral().SetSeq_data().SetIupacna().Set(iupacna);
1203  delta_seq.SetLiteral().SetLength(iupacna.size());
1204  CSeqportUtil::Pack(&delta_seq.SetLiteral().SetSeq_data());
1205  return;
1206  }
1207  }
// Delta is empty, or its last element is not a literal with data.
1208  inst.SetExt().SetDelta().AddLiteral(seq, mol_class);
1209  } else {
// No delta representation: extend the raw IUPACna string (mol_class unused).
1210  inst.SetSeq_data().SetIupacna().Set() += seq;
1211  }
1212 }
1213 
/// Fill a CSeq_inst with sequence collected from the genomic location loc.
///
/// Each range of loc is mapped to the mRNA (to_mrna), each resulting part is
/// mapped back to the genome (to_genomic), and the genomic residues are read
/// through a CSeqVector and appended with AddLiteral(). Gaps are inserted
/// between ranges flagged partial, and runs of 'N' are inserted for
/// deletions. The final length is stored with inst.SetLength().
///
/// @param align                     alignment; its spliced-seg product length
///                                  is used for trailing padding
/// @param loc                       location whose ranges are walked
/// @param add_unaligned_parts       when true, leading, internal and trailing
///                                  unaligned product stretches are added as
///                                  gap literals of matching length
/// @param mark_transcript_deletions when true, deletions seen on the first
///                                  part of a mapped range are also marked
/// @param has_gap                   optional out flag, set when a gap is added
/// @param has_indel                 optional out flag, set when an indel is seen
///
/// NOTE(review): the opening of this definition (return type, qualified name
/// and the first parameter) is absent from this listing; the call site passes
/// (inst, align, loc), so this appears to be x_CollectMrnaSequence. The
/// declarations of the to_mrna/to_genomic mappers, the start of the outer loop
/// header and a few other statements are missing too -- confirm against the
/// full file before editing.
1216  const CSeq_align& align,
1217  const CSeq_loc& loc,
1218  bool add_unaligned_parts,
1219  bool mark_transcript_deletions,
1220  bool* has_gap,
1221  bool* has_indel)
1222 {
1223  /// set up the inst
1225 
1226  // this is created as a transcription of the genomic location
1227 
1230 
1231  to_mrna.SetMergeAll();
1232  to_genomic.SetMergeAll();
1233 
// seq_size:        number of residues (including gaps) emitted so far
// prev_product_to: last product (mRNA) position already covered, -1 = none
// prev_fuzz:       previous range ended with a partial stop
1234  int seq_size = 0;
1235  int prev_product_to = -1;
1236  bool prev_fuzz = false;
1237 
1238  for (CSeq_loc_CI loc_it(loc,
1241  loc_it; ++loc_it) {
1242 
1243  CConstRef<CSeq_loc> exon = loc_it.GetRangeAsSeq_loc();
1244  CRef<CSeq_loc> mrna_loc = to_mrna.Map(*exon);
1245 
// A gap goes in front of this range when it is not the first one and starts
// partial, or when the previous range ended partial.
1246  if ((prev_product_to > -1 &&
1247  loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Biological)) ||
1248  prev_fuzz) {
1249  if (has_gap != NULL) {
1250  *has_gap = true;
1251  }
1252  if (!inst.IsSetExt()) {
1253  inst.SetExt().SetDelta().AddLiteral
1255  inst.ResetSeq_data();
1256  }
// Gap length is the unaligned product stretch when add_unaligned_parts is
// set, otherwise 0; a zero-length gap is tagged with unknown-length fuzz.
// A negative gap_len adds nothing.
1257  int gap_len = add_unaligned_parts ? mrna_loc->GetTotalRange().GetFrom()-(prev_product_to+1) : 0;
1258  if (gap_len >= 0) {
1259  seq_size += gap_len;
1260  prev_product_to += gap_len;
1261  inst.SetExt().SetDelta().AddLiteral(gap_len);
1262  if (gap_len == 0)
1263  inst.SetExt().SetDelta().Set().back()
1264  ->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1265  }
1266  }
1267 
// part_count:      number of pieces the range maps to on the mRNA
// mapped_exon_len: total genomic length recovered by mapping the pieces back
1268  unsigned part_count = 0;
1269  unsigned mapped_exon_len = 0;
1270  for (CSeq_loc_CI part_it(*mrna_loc); part_it; ++part_it) {
1271  ++part_count;
// Very first part: initialise prev_product_to and, if requested, emit a
// leading gap covering product positions before the first aligned base.
1272  if (prev_product_to<0) {
1273  prev_product_to = part_it.GetRange().GetFrom()-1;
1274  if (add_unaligned_parts && part_it.GetRange().GetFrom() > 0) {
1275  seq_size = part_it.GetRange().GetFrom();
1276  inst.SetExt().SetDelta().AddLiteral(seq_size);
1277  }
1278  }
1279  int deletion_len = part_it.GetRange().GetFrom()-(prev_product_to+1);
1280  /// If this is the first part of the mapped segment, the deletion is
1281  /// in the CDS location on the transcript; mark with Ns only if
1282  /// mark_transcript_deletions is set. If this is a later part, the
1283  /// deletion is in the transcript mapping to the genomic sequence;
1284  /// mark always
1285  if (deletion_len > 0) {
1286  if (mark_transcript_deletions && part_count == 1) {
1287  // check if the deletion is in the alignment or the original multupart cds
1288 
// Subtract whatever part of the skipped product interval still maps to the
// genome; only the remainder is treated as a deletion.
1289  CSeq_loc deletion_loc;
1290  deletion_loc.SetInt().SetId().Assign(part_it.GetSeq_id());
1291  deletion_loc.SetInt().SetFrom(prev_product_to+1);
1292  deletion_loc.SetInt().SetTo(part_it.GetRange().GetFrom()-1);
1293 
1294  deletion_len -= (int)GetLength(*to_genomic.Map(deletion_loc), NULL);
1295  }
1296 
1297  if (deletion_len > 0 && (mark_transcript_deletions || part_count > 1)) {
1298  if (has_indel != NULL) {
1299  *has_indel = true;
1300  }
1301  string deletion(deletion_len, 'N');
1302  AddLiteral(inst, deletion, CSeq_inst::eMol_rna);
1303  seq_size += deletion.size();
1304  }
1305  }
1306 
1307  CConstRef<CSeq_loc> part = part_it.GetRangeAsSeq_loc();
1308  CRef<CSeq_loc> genomic_loc = to_genomic.Map(*part);
1309 
1310  for (CSeq_loc_CI it(*genomic_loc); it; ++it) {
1311  mapped_exon_len += it.GetRange().GetLength();
1312  }
1313 
// Read the genomic residues for this part in IUPAC coding and append them.
1314  CSeqVector vec(*genomic_loc, *m_scope, CBioseq_Handle::eCoding_Iupac);
1315  string seq;
1316  vec.GetSeqData(0, vec.size(), seq);
1317 
1318  AddLiteral(inst, seq, CSeq_inst::eMol_rna);
1319 
1320  seq_size += vec.size();
1321 
1322  prev_product_to = part_it.GetRange().GetTo();
1323  }
// More than one part, or a round-trip length that differs from the original
// range length, is reported as an indel.
1324  if (has_indel != NULL &&
1325  (part_count > 1 ||
1326  mapped_exon_len != loc_it.GetRange().GetLength())) {
1327  *has_indel = true;
1328  }
1329 
1330  prev_fuzz = loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Biological);
1331  }
1332 
// For spliced alignments with a known product length, pad the tail with a
// gap literal so the collected length reaches the product length.
1333  if (add_unaligned_parts && align.GetSegs().IsSpliced()) {
1334  const CSpliced_seg& spl = align.GetSegs().GetSpliced();
1335  if (spl.IsSetProduct_length()) {
1336  TSeqPos length = spl.GetProduct_length();
1337  if (seq_size < (int)length) {
1338  if (!inst.IsSetExt()) {
1339  inst.SetExt().SetDelta().AddLiteral
1341  inst.ResetSeq_data();
1342  }
1343  inst.SetExt().SetDelta().AddLiteral(length-seq_size);
1344  seq_size = length;
1345  }
1346  }
1347  }
1348 
1349  inst.SetLength(seq_size);
1350  if (inst.IsSetExt()) {
1352  } else {
1355  }
1356 }
1357 
/// Build a new nucleotide Seq-entry for the transcribed RNA and append it to
/// seqs.
///
/// The entry gets a MolInfo descriptor, sequence collected by
/// x_CollectMrnaSequence(), a copy of the alignment as its history assembly,
/// and a local id of the form "lcl|CDNA_[<date>_]<model_num>". When a CDS on
/// the query mRNA is supplied, a copy relocated onto the new id is stored in
/// an annot on the entry and returned through cds_feat_on_transcribed_mrna.
///
/// @param rna_feat_loc_on_genome        genomic location to transcribe
/// @param time                          timestamp used in the generated id
/// @param model_num                     number used in the generated id
/// @param seqs                          set that receives the new entry
/// @param cds_feat_on_query_mrna        optional CDS on the query mRNA
/// @param cds_feat_on_transcribed_mrna  out: CDS copy on the new sequence
/// @return the id assigned to the new bioseq
///
/// NOTE(review): the opening of this definition (return type, qualified name
/// and the alignment parameter) is absent from this listing -- presumably
/// x_CreateMrnaBioseq; a few body lines are missing as well. Confirm against
/// the full file before editing.
1360  CConstRef<CSeq_loc> rna_feat_loc_on_genome,
1361  const CTime& time,
1362  size_t model_num,
1363  CBioseq_set& seqs,
1364  CConstRef<CSeq_feat> cds_feat_on_query_mrna,
1365  CRef<CSeq_feat>& cds_feat_on_transcribed_mrna)
1366 {
1367  CRef<CSeq_entry> entry(new CSeq_entry);
1368  CBioseq& bioseq = entry->SetSeq();
1369 
// Biomol depends on whether a CDS was supplied (the value expressions are on
// a line missing from this listing).
1370  CRef<CSeqdesc> mdes(new CSeqdesc);
1371  entry->SetSeq().SetDescr().Set().push_back(mdes);
1372  mdes->SetMolinfo().SetBiomol(cds_feat_on_query_mrna.IsNull()
1374 
// Completeness: a non-continuous location is "partial"; otherwise it follows
// the partial flags of the CDS location, and is "unknown" when there is no
// CDS or the CDS has neither end partial.
1375  CMolInfo::ECompleteness completeness;
1376  if (!IsContinuous(*rna_feat_loc_on_genome)) {
1377  completeness = CMolInfo::eCompleteness_partial;
1378  } else if (cds_feat_on_query_mrna.IsNull()) {
1379  completeness = CMolInfo::eCompleteness_unknown;
1380  } else if (cds_feat_on_query_mrna->GetLocation().IsPartialStart(eExtreme_Biological) &&
1381  cds_feat_on_query_mrna->GetLocation().IsPartialStop(eExtreme_Biological)
1382  ) {
1383  completeness = CMolInfo::eCompleteness_no_ends;
1384  } else if (cds_feat_on_query_mrna->GetLocation().IsPartialStart(eExtreme_Biological)) {
1385  completeness = CMolInfo::eCompleteness_no_left;
1386  } else if (cds_feat_on_query_mrna->GetLocation().IsPartialStop(eExtreme_Biological)) {
1387  completeness = CMolInfo::eCompleteness_no_right;
1388  } else {
1389  completeness = CMolInfo::eCompleteness_unknown;
1390  }
1391  mdes->SetMolinfo().SetCompleteness(completeness);
1392 
1393  x_CollectMrnaSequence(bioseq.SetInst(), align, *rna_feat_loc_on_genome);
1394 
// Keep a private copy of the alignment as the sequence's history assembly.
1395  CRef<CSeq_align> assembly(new CSeq_align);
1396  assembly->Assign(align);
1397  bioseq.SetInst().SetHist().SetAssembly().push_back(assembly);
1398 
1399  CRef<CSeq_id> transcribed_rna_id(new CSeq_id);
1400  {{
1401  /// create a new seq-id for this
// The date component is omitted when fGenerateStableLocalIds is set.
1402  string str("lcl|CDNA_");
1403  if ((m_flags & fGenerateStableLocalIds) == 0) {
1404  str += time.AsString("YMD");
1405  str += "_";
1406  }
1407  str += NStr::SizetToString(model_num);
1408  transcribed_rna_id->Set(str);
1409  }}
1410  bioseq.SetId().push_back(transcribed_rna_id);
1411 
1412  if (cds_feat_on_query_mrna.NotNull()) {
1413  CRef<CSeq_annot> annot(new CSeq_annot);
1414  entry->SetSeq().SetAnnot().push_back(annot);
1415  _ASSERT(cds_feat_on_query_mrna->GetData().Which() != CSeqFeatData::e_not_set);
1416 
// Deep-copy the CDS and re-point its location at the new sequence id.
1417  cds_feat_on_transcribed_mrna.Reset(new CSeq_feat);
1418  cds_feat_on_transcribed_mrna->Assign(*cds_feat_on_query_mrna);
1419  cds_feat_on_transcribed_mrna->SetLocation().SetId(*transcribed_rna_id);
1420 
1421  annot->SetData().SetFtable().push_back(cds_feat_on_transcribed_mrna);
1422 
1423  // remap code-breaks
1425  cds_feat_on_transcribed_mrna->SetData().SetCdregion();
1426  if (cds.IsSetCode_break()) {
1427  for (CCdregion::TCode_break::iterator it = cds.SetCode_break().begin(); it != cds.SetCode_break().end(); ++it) {
1428  (*it)->SetLoc().SetId(*transcribed_rna_id);
1429  }
1430  }
1431  }
1432 
1435  }
1436 
1437  seqs.SetSeq_set().push_back(entry);
1438 
1439  return transcribed_rna_id;
1440 }
1441 
1442 void AddCodeBreak(CSeq_feat& feat, CSeq_loc& loc, char ncbieaa)
1443 {
1444  CRef<CCode_break> code_break(new CCode_break);
1445  code_break->SetLoc(loc);
1446  code_break->SetAa().SetNcbieaa(ncbieaa);
1447  if (feat.IsSetData() && feat.SetData().IsCdregion()) {
1448  feat.SetData().SetCdregion().SetCode_break().push_back(code_break);
1449  } else {
1450  NCBI_THROW(CException, eUnknown, "Adding code break to non-cdregion feature");
1451  }
1452 }
1453 
/// Build a protein Seq-entry by translating the CDS on the transcribed mRNA,
/// and append it to seqs.
///
/// The protein gets a local id "lcl|PROT_[<date>_]<model_num>", which is also
/// set as the product of cds_feat_on_transcribed_mrna. The translation is cut
/// into delta literals and gaps following the segments of the mRNA sequence
/// map; code-breaks are added to the CDS for a replaced start codon, for
/// internal stops (rewritten to 'X') and for a final codon that did not
/// translate to '*'. Every code-break location created here is also pushed
/// onto transcribed_mrna_seqloc_refs.
///
/// @param cds_feat_on_transcribed_mrna  CDS to translate; modified in place
/// @param transcribed_mrna_seqloc_refs  collects code-break locations created
/// @param time                          timestamp used in the generated id
/// @param model_num                     number used in the generated id
/// @param seqs                          its last entry is added to the scope
///                                      for translation; receives the protein
/// @return reference to the new protein bioseq
///
/// NOTE(review): the opening of this definition (return type, qualified name
/// and the first parameter, which the call site shows to be cds_loc) is absent
/// from this listing, as are a few body lines -- confirm against the full file
/// before editing.
1456  CRef<CSeq_feat> cds_feat_on_transcribed_mrna,
1457  list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs,
1458  const CTime& time,
1459  size_t model_num,
1460  CBioseq_set& seqs)
1461 {
1462  CRef<CSeq_entry> entry(new CSeq_entry);
1463  CBioseq& bioseq = entry->SetSeq();
1464 
1465  // create a new seq-id for this
// The date component is omitted when fGenerateStableLocalIds is set.
1466  string str("lcl|PROT_");
1467  if ((m_flags & fGenerateStableLocalIds) == 0) {
1468  str += time.AsString("YMD");
1469  str += "_";
1470  }
1471  str += NStr::SizetToString(model_num);
1472  CRef<CSeq_id> translated_protein_id(new CSeq_id(str));
1473  cds_feat_on_transcribed_mrna->SetProduct().SetWhole(*translated_protein_id);
1474 
1475  bioseq.SetId().push_back(translated_protein_id);
1476 
1477  CRef<CSeqdesc> desc(new CSeqdesc);
1479 
// Completeness follows cds_loc: non-continuous -> partial, otherwise by the
// partial flags at its biological ends, "complete" when neither is set.
1480  CMolInfo::ECompleteness completeness;
1481  if (!IsContinuous(*cds_loc)) {
1482  completeness = CMolInfo::eCompleteness_partial;
1483  } else if (cds_loc->IsPartialStart(eExtreme_Biological) &&
1485  ) {
1486  completeness = CMolInfo::eCompleteness_no_ends;
1487  } else if (cds_loc->IsPartialStart(eExtreme_Biological)) {
1488  completeness = CMolInfo::eCompleteness_no_left;
1489  } else if (cds_loc->IsPartialStop(eExtreme_Biological)) {
1490  completeness = CMolInfo::eCompleteness_no_right;
1491  } else {
1492  completeness = CMolInfo::eCompleteness_complete;
1493  }
1494  desc->SetMolinfo().SetCompleteness(completeness);
1495 
1496  bioseq.SetDescr().Set().push_back(desc);
1497 
1498  // set up the inst
1499 
// The last entry of seqs is added to the scope temporarily so the CDS can be
// translated against it; it is removed again just before returning.
// presumably that entry is the transcribed mRNA -- verify against the caller
1500  CSeq_entry_Handle mrna_seh = m_scope->AddTopLevelSeqEntry(*seqs.SetSeq_set().back());
1501 
1502  string strprot;
1503  CSeqTranslator::Translate(*cds_feat_on_transcribed_mrna, *m_scope, strprot, true, false);
1504 
1505  CRef<CSeq_loc> protloc_on_mrna(new CSeq_loc);
1506  protloc_on_mrna->Assign(cds_feat_on_transcribed_mrna->GetLocation());
1507  protloc_on_mrna->SetId(*const_cast<CSeq_id*>(cds_feat_on_transcribed_mrna->GetLocation().GetId()));
1508 
1509  // Remove final stop codon from sequence
// final_code_break records that the dropped last residue was not '*', i.e.
// the final codon needs an explicit '*' code-break.
// NOTE(review): strprot is indexed at size()-1 without an emptiness check;
// confirm the translation cannot be empty on this path.
1510  bool final_code_break = false;
1511  if (!protloc_on_mrna->IsPartialStop(eExtreme_Biological)) {
1512  final_code_break = (strprot[strprot.size()-1] != '*');
1513 
1514  strprot.resize(strprot.size()-1);
1515  }
1516 
// Start as a delta sequence; collapsed to raw further down if it ends up as a
// single literal with data.
1517  CSeq_inst& seq_inst = bioseq.SetInst();
1518  seq_inst.SetMol(CSeq_inst::eMol_aa);
1519 
1520  seq_inst.SetRepr(CSeq_inst::eRepr_delta);
1521  seq_inst.SetExt().SetDelta();
1522  CSeqVector seqv(*protloc_on_mrna, *m_scope, CBioseq_Handle::eCoding_Ncbi);
1524  map.Reset(&seqv.GetSeqMap());
1525 
// frame: number of leading bases to skip, taken from the Cdregion frame
// (0 when unset or for values not listed in the switch).
1526  const CCdregion& cdr = cds_feat_on_transcribed_mrna->GetData().GetCdregion();
1527  int frame = 0;
1528  if (cdr.IsSetFrame ()) {
1529  switch (cdr.GetFrame ()) {
1530  case CCdregion::eFrame_two :
1531  frame = 1;
1532  break;
1534  frame = 2;
1535  break;
1536  default :
1537  break;
1538  }
1539  }
1540 
// If an existing code-break starts where the CDS starts, the first residue is
// left alone below instead of being forced to 'M'.
1541  bool starts_with_code_break = false;
1542  if (cdr.IsSetCode_break()) {
1544  if ((*it)->GetLoc().GetStart(eExtreme_Positional) == protloc_on_mrna->GetStart(eExtreme_Positional)) {
1545  starts_with_code_break = true;
1546  break;
1547  }
1548  }
1549  }
1550 
// b, e:          residue index range [b, e) in strprot for the current segment
// skip_5_prime:  residues dropped at the start of the protein
// skip_3_prime:  residues dropped at the end of the protein
1551  size_t b = 0;
1552  size_t e = 0;
1553  size_t skip_5_prime = 0;
1554  size_t skip_3_prime = 0;
1555  unsigned count_internal_stops = 0;
1556 
// Walk the resolved segments of the mRNA sequence map. After each segment,
// `frame` holds minus the number of leftover bases (len % 3) to be joined with
// the next segment, or the remaining number of bases still to skip when the
// segment was shorter than the pending frame.
1557  for( CSeqMap_CI ci = map->BeginResolved(m_scope.GetPointer()); ci; ci.Next()) {
1558  int codon_start_pos = (int)ci.GetPosition() + frame;
1559  int len = int(ci.GetLength()) - frame;
1560  frame = len >=0 ? -(len%3) : -len;
1561  _ASSERT( -3 < frame && frame < 3 );
1562  len += frame;
// Segment contributes no whole codon. At the very start of a 5'-partial CDS a
// gap/unknown-length segment still accounts for one skipped residue.
1563  if (len==0) {
1564  if (b==0 &&
1565  (ci.IsUnknownLength() || !ci.IsSetData()) &&
1566  cds_loc->IsPartialStart(eExtreme_Biological)) {
1567 
1568  skip_5_prime += 1;
1569  b += 1;
1570  frame += 3;
1571  }
1572  continue;
1573  }
// e running past strprot means this segment contains the stop codon that was
// trimmed from strprot; exclude it from the residue range and base length.
1574  e = b + len/3;
1575  bool stop_codon_included = e > strprot.size();
1576  if (stop_codon_included) {
1577  _ASSERT( len%3 != 0 || !protloc_on_mrna->IsPartialStop(eExtreme_Biological) );
1578  --e;
1579  len = len >= 3 ? len-3 : 0;
1580  }
1581 
1582  // template for codon seq-locs
1583  CRef<CSeq_loc> codon_on_mrna = protloc_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1584  codon_on_mrna->SetPartialStart(false, eExtreme_Biological);
1585  codon_on_mrna->SetPartialStop(false, eExtreme_Biological);
1586 
1587 
// Three segment kinds: unknown length (gap literal with unknown-length fuzz),
// no data (unaligned mRNA, becomes a gap or is skipped), or real data
// (translated residues).
1588  if (ci.IsUnknownLength()) {
1589  seq_inst.SetExt().SetDelta().AddLiteral(len);
1590  seq_inst.SetExt().SetDelta().Set().back()->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1591  } else if (!ci.IsSetData()) { // unaligned mRNA portion
1592  if (b==skip_5_prime &&
1593  cds_loc->IsPartialStart(eExtreme_Biological)) {
1594  skip_5_prime += e-b;
1595  } else if (stop_codon_included && b==e) {
1596  // just stop codon
1597  // do not add zero length gap
1598  } else {
1599  if (strprot[b] != 'X') { // preceding partial codon translated unambiguously - add it
1600  AddLiteral(seq_inst, strprot.substr(b,1), CSeq_inst::eMol_aa);
1601  b += 1;
1602  }
1603  if (b < e) {
1604  seq_inst.SetExt().SetDelta().AddLiteral(e-b);
1605  }
1606  }
1607  } else {
// Final codon did not translate to '*': record an explicit '*' code-break on
// the three bases following the last residue of this segment.
1608  if (stop_codon_included && final_code_break) {
1609  TSeqPos pos_on_mrna = codon_start_pos + protloc_on_mrna->GetStart(eExtreme_Positional) + (e-b)*3;
1610  CRef<CSeq_loc> stop_codon_on_mrna = codon_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1611  stop_codon_on_mrna->SetInt().SetFrom(pos_on_mrna);
1612  stop_codon_on_mrna->SetInt().SetTo(pos_on_mrna + 2);
1613  AddCodeBreak(*cds_feat_on_transcribed_mrna, *stop_codon_on_mrna, '*');
1614  transcribed_mrna_seqloc_refs.push_back(stop_codon_on_mrna);
1615  }
1616  if (b < e) {
1617 
// A complete 5' end must begin with 'M': overwrite the first residue and
// record a matching code-break, unless one already covers the start.
1618  if (b==0 && strprot[b] != 'M' &&
1619  !starts_with_code_break &&
1620  !protloc_on_mrna->IsPartialStart(eExtreme_Biological)) {
1621  strprot[b] = 'M';
1622  TSeqPos pos_on_mrna = codon_start_pos + protloc_on_mrna->GetStart(eExtreme_Positional);
1623  CRef<CSeq_loc> start_codon_on_mrna = codon_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1624  start_codon_on_mrna->SetInt().SetFrom(pos_on_mrna);
1625  start_codon_on_mrna->SetInt().SetTo(pos_on_mrna + 2);
1626  AddCodeBreak(*cds_feat_on_transcribed_mrna, *start_codon_on_mrna, 'M');
1627  transcribed_mrna_seqloc_refs.push_back(start_codon_on_mrna);
1628  }
1629 
1630  // Repair any internal stops with Xs
// stop_aa_pos starts at b-1 so the first find() begins at b (for b == 0 this
// relies on unsigned wrap-around of size_t).
1631  size_t stop_aa_pos = b-1;
1632  while ((stop_aa_pos = strprot.find('*', stop_aa_pos+1)) < e) {
1633  strprot[stop_aa_pos] = 'X';
1634 
1635  TSeqPos pos_on_mrna = codon_start_pos + protloc_on_mrna->GetStart(eExtreme_Positional) + (stop_aa_pos-b)*3;
1636  CRef<CSeq_loc> internal_stop_on_mrna = codon_on_mrna->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1637  internal_stop_on_mrna->SetInt().SetFrom(pos_on_mrna);
1638  internal_stop_on_mrna->SetInt().SetTo(pos_on_mrna + 2);
1639  AddCodeBreak(*cds_feat_on_transcribed_mrna, *internal_stop_on_mrna, 'X');
1640  transcribed_mrna_seqloc_refs.push_back(internal_stop_on_mrna);
1641  ++count_internal_stops;
1642  }
1643 
1644 
1645  AddLiteral(seq_inst, strprot.substr(b,e-b), CSeq_inst::eMol_aa);
1646  }
1647  }
1648  b = e;
1649  }
1650  _ASSERT( -2 <= frame && frame <= 0 );
1651 
// For best-refseq runs, record the number of repaired internal stops in an
// "AlignInfo" user object on the CDS.
1652  if (m_is_best_refseq && count_internal_stops) {
1653  CRef<CUser_object> align_info(new CUser_object);
1654  align_info->SetType().SetStr("AlignInfo");
1655  align_info->AddField("num_internal_stop_codon", (int)count_internal_stops);
1656  cds_feat_on_transcribed_mrna->AddExt(align_info);
1657  }
1658 
1659  if (frame < 0) { //last codon partial
1660  if (b < strprot.size() && strprot[b] != 'X') { // last partial codon translated unambiguously - add it
1661  _ASSERT( b == strprot.size()-1 );
1662  AddLiteral(seq_inst, strprot.substr(b,1), CSeq_inst::eMol_aa);
1663  b += 1;
1664  frame = 0;
1665  }
1666  }
1667 
1668  _ASSERT( b <= strprot.size() &&
1669  strprot.size() <= b + (frame==0?0:1) );
1670 
// 3'-partial CDS: strip trailing delta elements that carry no sequence data
// and count their length as skipped residues.
1671  if (cds_loc->IsPartialStop(eExtreme_Biological)) {
1672  while (seq_inst.GetExt().GetDelta().Get().size() > 0 &&
1673  !seq_inst.GetExt().GetDelta().Get().back()->GetLiteral().IsSetSeq_data()) {
1674  skip_3_prime += seq_inst.GetExt().GetDelta().Get().back()->GetLiteral().GetLength();
1675  seq_inst.SetExt().SetDelta().Set().pop_back();
1676  }
1677  }
1678 
// If residues were dropped at either end, shrink the CDS location to match:
// map to protein coordinates, trim, mark the trimmed ends partial, map back.
1679  if (skip_5_prime || skip_3_prime) {
1680  CSeq_loc_Mapper to_prot(*cds_feat_on_transcribed_mrna, CSeq_loc_Mapper::eLocationToProduct);
1681  CSeq_loc_Mapper to_mrna(*cds_feat_on_transcribed_mrna, CSeq_loc_Mapper::eProductToLocation);
1682 
1683  CRef<CSeq_loc> prot_loc = to_prot.Map(*protloc_on_mrna)->Merge(CSeq_loc::fMerge_SingleRange, NULL);
1684 
1685  prot_loc->SetInt().SetFrom(skip_5_prime);
1686  prot_loc->SetInt().SetTo(b-skip_3_prime-1+(skip_3_prime?0:1));
1687  prot_loc->SetPartialStart(skip_5_prime, eExtreme_Biological);
1688  prot_loc->SetPartialStop(skip_3_prime, eExtreme_Biological);
1689 
1690  cds_feat_on_transcribed_mrna->SetLocation(*to_mrna.Map(*prot_loc));
1691  }
1692 
1693  seq_inst.SetLength(b-skip_5_prime-skip_3_prime);
1694 
// A delta consisting of exactly one literal with data is stored as raw.
1695  if (seq_inst.SetExt().SetDelta().Set().size() == 1 && seq_inst.SetExt().SetDelta().Set().back()->GetLiteral().IsSetSeq_data()) {
1696  seq_inst.SetRepr(CSeq_inst::eRepr_raw);
1697  CRef<CSeq_data> dprot(new CSeq_data);
1698  dprot->Assign(seq_inst.SetExt().SetDelta().Set().back()->GetLiteral().GetSeq_data());
1699  seq_inst.SetSeq_data(*dprot);
1700  seq_inst.ResetExt();
1701  }
1702 
// NOTE(review): the line opening this brace-delimited section is missing from
// the listing. The section derives a protein-level history assembly by mapping
// the mRNA's assembly alignment through a CDS copy placed on the alignment's
// first row (with the last three bases removed when the CDS is 3'-complete).
1704  CRef<CSeq_align> mrna_assembly = seqs.SetSeq_set().back()->GetSeq().GetInst().GetHist().GetAssembly().back();
1705 
1706  CRef<CSeq_feat> cds_feat_on_assembly_mrna(new CSeq_feat);
1707  cds_feat_on_assembly_mrna->Assign(*cds_feat_on_transcribed_mrna);
1708  cds_feat_on_assembly_mrna->SetLocation().SetId(mrna_assembly->GetSeq_id(0));
1709 
1710  if ( !cds_feat_on_assembly_mrna->GetLocation().IsPartialStop(eExtreme_Biological)) {
1711  cds_feat_on_assembly_mrna->SetLocation().SetInt().SetTo() -= 3;
1712  }
1713 
1714  CSeq_loc_Mapper to_prot(*cds_feat_on_assembly_mrna, CSeq_loc_Mapper::eLocationToProduct);
1715  CRef<CSeq_align> prot_assembly = to_prot.Map(*mrna_assembly);
1716 
1717  prot_assembly->SetSegs().SetSpliced().SetProduct_length(seq_inst.GetLength());
1718 
1719  seq_inst.SetHist().SetAssembly().push_back(prot_assembly);
1720  }
1721 
// Debug-only check: load the new protein into the scope and verify that the
// stored sequence has the computed length.
1722 #ifdef _DEBUG
1723  CSeq_entry_Handle prot_seh = m_scope->AddTopLevelSeqEntry(*entry);
1724 
1725  CBioseq_Handle prot_h = m_scope->GetBioseqHandle(*translated_protein_id);
1727  string result;
1728  vec.GetSeqData(0, vec.size(), result);
1729  _ASSERT( b-skip_5_prime-skip_3_prime==result.size() );
1730 
1731  m_scope->RemoveTopLevelSeqEntry(prot_seh);
1732 #endif
1733 
// Undo the temporary scope addition made before translating.
1734  m_scope->RemoveTopLevelSeqEntry(mrna_seh);
1735 
1736  seqs.SetSeq_set().push_back(entry);
1737  return bioseq;
1738 }
1739 
/// Create the RNA feature for a transcript, if fCreateMrna is set.
///
/// The feature's product is transcribed_rna_id and its location is loc. The
/// RNA type is chosen from the MolInfo biomol of the query RNA bioseq when
/// that bioseq is in the scope, otherwise from whether a CDS was supplied.
/// A feature id "rna.<N>" is added when a Gnomon model number can be
/// extracted from query_rna_id.
///
/// @param query_rna_id            id of the query RNA
/// @param transcribed_rna_id      id stored as the feature product
/// @param cds_feat_on_query_mrna  optional CDS, used to choose the RNA type
///                                when the query bioseq is unavailable
/// @return the new feature, or an empty reference when fCreateMrna is not set
///
/// NOTE(review): the opening of this definition (return type, qualified name
/// and the `loc` parameter), the declaration of `type`, and every case label
/// and assignment inside the biomol switch are absent from this listing --
/// presumably x_CreateMrnaFeature. Confirm against the full file before
/// editing.
1743  const CSeq_id& query_rna_id,
1744  CSeq_id& transcribed_rna_id,
1745  CConstRef<CSeq_feat> cds_feat_on_query_mrna)
1746 {
1747  CRef<CSeq_feat> mrna_feat;
1748  if (m_flags & fCreateMrna) {
1749  mrna_feat.Reset(new CSeq_feat());
1751  string name;
1752  string RNA_class;
1753 
// Gnomon models get a local feature id derived from the model number.
1754  string gnomon_model_num = ExtractGnomonModelNum(query_rna_id);
1755  if (!gnomon_model_num.empty()) {
1756  CRef<CObject_id> obj_id( new CObject_id() );
1757  obj_id->SetStr("rna." + gnomon_model_num);
1758  CRef<CFeat_id> feat_id( new CFeat_id() );
1759  feat_id->SetLocal(*obj_id);
1760  mrna_feat->SetIds().push_back(feat_id);
1761  }
1762 
1763  mrna_feat->SetProduct().SetWhole().Assign(transcribed_rna_id);
1764  CBioseq_Handle handle = m_scope->GetBioseqHandle(query_rna_id);
1765  if (handle) {
1766  const CMolInfo* info = s_GetMolInfo(handle);
1767  if (info && info->IsSetBiomol()) {
// One of the (missing) cases copies the Gbmoltype string into RNA_class,
// which later selects the general RNA extension instead of a plain name.
1768  switch (info->GetBiomol()) {
1771  break;
1774  break;
1777  break;
1780  break;
1783  break;
1786  break;
1789  break;
1792  if (info->IsSetGbmoltype()) {
1793  RNA_class = info->GetGbmoltype();
1794  }
1795  break;
1798  break;
1799  default:
1801  break;
1802  }
1803  }
1804  } else {
1805  type = cds_feat_on_query_mrna.IsNull()
1807  }
1808 
1809  mrna_feat->SetData().SetRna().SetType(type);
1810  if (!RNA_class.empty()) {
1811  mrna_feat->SetData().SetRna().SetExt().SetGen().SetClass(RNA_class);
1812  }
// The name goes into the general extension's product when an RNA class was
// set, otherwise into the simple name extension.
1813  name = x_ConstructRnaName(handle);
1814  if (!name.empty()) {
1815  if (!RNA_class.empty()) {
1816  mrna_feat->SetData().SetRna().SetExt().SetGen().SetProduct(name);
1817  } else {
1818  mrna_feat->SetData().SetRna().SetExt().SetName(name);
1819  }
1820  }
1821 
1822  mrna_feat->SetLocation(*loc);
1823  }
1824  return mrna_feat;
1825 }
1826 
/// Create or extend the gene feature for a model; does nothing unless
/// fCreateGene is set.
///
/// On the first call (gene_feat empty) the feature is copied from a gene
/// feature found on the RNA bioseq, or created from scratch unless
/// fPropagateOnly is set. On later calls (gene_feat already set) the new
/// location is merged into the existing one and missing dbxrefs from the
/// bioseq's gene feature are added.
///
/// @param handle      RNA bioseq handle; may be invalid
/// @param mapper      maps the existing gene location in propagate-only mode
/// @param loc         location to use/merge when not in propagate-only mode
/// @param genomic_id  not referenced in the visible body
/// @param gene_id     when non-zero, used for the feature id and gene desc
///
/// NOTE(review): the qualified function name and the first parameter (the
/// gene_feat reference updated by this function) are absent from this listing;
/// the call site shows the name x_CreateGeneFeature.
1827 void
1830  const CBioseq_Handle& handle,
1831  SMapper& mapper,
1832  CRef<CSeq_loc> loc,
1833  const CSeq_id& genomic_id, Int8 gene_id)
1834 {
1835  if (m_flags & fCreateGene) {
// Look for gene features on the RNA bioseq, if there is one.
1836  CFeat_CI feat_iter;
1837  if (handle) {
1838  feat_iter = CFeat_CI(handle, CSeqFeatData::eSubtype_gene);
1839  }
// A gene feature passed in by the caller means "update", not "create".
1840  bool update_existing_gene = gene_feat;
1841  string gene_id_str = "gene.";
1842  if (gene_id) {
1843  gene_id_str += NStr::NumericToString(gene_id);
1844  }
1845 
1846  if (!update_existing_gene) {
1847  if (feat_iter && feat_iter.GetSize()) {
1848  gene_feat.Reset(new CSeq_feat());
1849  gene_feat->Assign(feat_iter->GetOriginalFeature());
1850  }
1851  if (!(m_flags & fPropagateOnly)) {
1852  /// if we didn't find am existing gene feature, create one
1853  if (!gene_feat) {
1854  gene_feat.Reset(new CSeq_feat());
1855  gene_feat->SetData().SetGene();
1856  }
1857  if (gene_id) {
1858  CRef<CObject_id> obj_id( new CObject_id() );
1859  obj_id->SetStr(gene_id_str);
1860  CRef<CFeat_id> feat_id( new CFeat_id() );
1861  feat_id->SetLocal(*obj_id);
1862  gene_feat->SetIds().push_back(feat_id);
1863  }
1864  }
1865  }
1866 
1867  if (!gene_feat) {
1868  /// Couldn't create gene feature
1869  return;
1870  }
1871 
// Location source: the supplied loc normally; in propagate-only mode the
// existing gene feature's location mapped through `mapper`.
1872  CRef<CSeq_loc> gene_loc;
1873  if (!(m_flags & fPropagateOnly)) {
1874  gene_loc = loc;
1875  } else if (feat_iter && feat_iter.GetSize()) {
1876  gene_loc = mapper.Map(feat_iter->GetLocation());
1877  }
1878 
// When updating, merge with the location already on the gene feature.
1879  if (gene_loc) {
1880  gene_feat->SetLocation
1881  (*MergeSeq_locs(gene_loc,
1882  update_existing_gene ? &gene_feat->GetLocation() : NULL));
1883  }
1884 
1885  if (feat_iter && feat_iter.GetSize() == 1 && update_existing_gene) {
1886  /// check if gene feature has any dbxrefs that we don't have yet
1887  if (feat_iter->IsSetDbxref()) {
1888  ITERATE (CSeq_feat::TDbxref, xref_it,
1889  feat_iter->GetDbxref()) {
1890  CRef<CDbtag> tag(new CDbtag);
1891  tag->Assign(**xref_it);
1892  bool duplicate = false;
1893  if(gene_feat->IsSetDbxref()){
1894  /// Check for duplications
1895  ITERATE(CSeq_feat::TDbxref, previous_xref_it,
1896  gene_feat->GetDbxref())
1897  if((*previous_xref_it)->Match(**xref_it)){
1898  duplicate = true;
1899  break;
1900  }
1901  }
1902  if(!duplicate)
1903  gene_feat->SetDbxref().push_back(tag);
1904  }
1905  }
1906  }
1907 
1908  if (gene_id) {
1909  /// Special case for gnomon, set gene desc from gnomon id
1910  gene_feat->SetData().SetGene().SetDesc(gene_id_str);
1911  }
1912  }
1913 }
1914 
/// Create the CDS feature on the genome for a model.
///
/// Returns an empty reference when fCreateCdregion is not set or no CDS on
/// the query mRNA was supplied. Otherwise the query CDS is mapped to the
/// genome with x_MapFeature(), the protein bioseq is generated with
/// x_CreateProteinBioseq(), and a copy of cds_feat_on_transcribed_mrna is
/// placed on the mapped genomic location. The frame is adjusted for
/// 5'-truncated mappings, an "upstream in-frame stop codon" misc_feature may
/// be added to the transcript for Gnomon models, and code-breaks are remapped
/// to the genome (those that do not map to exactly three bases inside the CDS
/// are dropped).
///
/// @param cds_feat_on_transcribed_mrna  CDS on the generated transcript
/// @param transcribed_mrna_seqloc_refs  forwarded to x_CreateProteinBioseq()
/// @param align                         alignment used for mapping
/// @param loc                           forwarded to x_MapFeature()
/// @param time, model_num, seqs         forwarded to x_CreateProteinBioseq()
/// @return the CDS feature; empty when CDS creation is disabled, no query CDS
///         was supplied, or the mapped location is missing/unset
///
/// NOTE(review): the opening of this definition (return type, qualified name
/// and the cds_feat_on_query_mrna parameter), the trailing `opts` parameter
/// and several body lines are absent from this listing -- presumably
/// x_CreateCdsFeature. Confirm against the full file before editing.
1918  CRef<CSeq_feat> cds_feat_on_transcribed_mrna,
1919  list<CRef<CSeq_loc> >& transcribed_mrna_seqloc_refs,
1920  const CSeq_align& align,
1921  CRef<CSeq_loc> loc,
1922  const CTime& time,
1923  size_t model_num,
1924  CBioseq_set& seqs,
1926 {
1927  CRef<CSeq_feat> cds_feat;
1928  if (!(m_flags & fCreateCdregion) || cds_feat_on_query_mrna.IsNull()) {
1929  return cds_feat;
1930  }
1931 
// NOTE(review): `offset` is not initialised here; it is read below, so this
// relies on x_MapFeature() always assigning it -- confirm.
1932  TSeqPos offset;
1933  CRef<CSeq_feat> cds_feat_on_genome = x_MapFeature(cds_feat_on_query_mrna.GetNonNullPointer(),
1934  align, loc, opts, offset);
1935  CRef<CSeq_loc> cds_loc;
1936  if (cds_feat_on_genome) {
1937  cds_loc = &cds_feat_on_genome->SetLocation();
1938  }
1939  if (cds_loc && cds_loc->Which() != CSeq_loc::e_not_set) {
// Genomic ranges of the CDS, used later to validate remapped code-breaks.
1940  CRangeCollection<TSeqPos> loc_ranges;
1941  ITERATE (CSeq_loc, loc_it, *cds_loc) {
1942  loc_ranges += loc_it.GetRange();
1943  }
1944 
// A non-zero offset makes the CDS 5'-partial even if the mapped location was
// not flagged as such.
1945  bool is_partial_5prime = offset > 0 || cds_loc->IsPartialStart(eExtreme_Biological);
1946  cds_loc->SetPartialStart(is_partial_5prime, eExtreme_Biological);
1947 
1948  string gnomon_model_num;
1949 
1950  if (cds_feat_on_query_mrna->CanGetProduct()) {
1951  gnomon_model_num = ExtractGnomonModelNum(
1952  *cds_feat_on_query_mrna->GetProduct().GetId());
1953  }
1954  /// create a new bioseq for the CDS
// NOTE(review): cds_feat_on_transcribed_mrna is dereferenced here and below
// without a null check -- confirm callers always supply it on this path.
1955  if (!gnomon_model_num.empty()) {
1956  CRef<CObject_id> obj_id( new CObject_id() );
1957  obj_id->SetStr("cds." + gnomon_model_num);
1958  CRef<CFeat_id> feat_id( new CFeat_id() );
1959  feat_id->SetLocal(*obj_id);
1960  cds_feat_on_transcribed_mrna->SetIds().push_back(feat_id);
1961  }
1962  x_CreateProteinBioseq(cds_loc, cds_feat_on_transcribed_mrna,
1963  transcribed_mrna_seqloc_refs,
1964  time, model_num, seqs);
1965 
// The genomic CDS starts as a copy of the transcript CDS (which by now
// carries the protein product and any added code-breaks), then gets the
// genomic location.
1966  cds_feat.Reset(new CSeq_feat());
1967  cds_feat->Assign(*cds_feat_on_transcribed_mrna);
1968  cds_feat->ResetId();
1969 
1970  cds_feat->SetLocation(*cds_loc);
1971 
1972  /// make sure we set the CDS frame correctly
1973  /// if we're 5' partial, we may need to adjust the frame
1974  /// to ensure that conceptual translations are in-frame
// NOTE(review): offset is unsigned (TSeqPos), so (offset - orig_frame) is
// evaluated in unsigned arithmetic before the % 3; confirm the intended result
// when offset < orig_frame.
1975  if (is_partial_5prime && offset) {
1976  int orig_frame = 0;
1977  if (cds_feat->GetData().GetCdregion().IsSetFrame()) {
1978  orig_frame = cds_feat->GetData()
1979  .GetCdregion().GetFrame();
1980  if (orig_frame) {
1981  orig_frame -= 1;
1982  }
1983  }
1984  int frame = (offset - orig_frame) % 3;
1985  if (frame < 0) {
1986  frame = -frame;
1987  }
1988  frame = (3 - frame) % 3;
1989  if (frame != orig_frame) {
1990  switch (frame) {
1991  case 0:
1992  cds_feat->SetData().SetCdregion()
1993  .SetFrame(CCdregion::eFrame_one);
1994  break;
1995  case 1:
1996  cds_feat->SetData().SetCdregion()
1997  .SetFrame(CCdregion::eFrame_two);
1998  break;
1999  case 2:
2000  cds_feat->SetData().SetCdregion()
2001  .SetFrame(CCdregion::eFrame_three);
2002  break;
2003 
2004  default:
2006  "mod 3 out of bounds");
2007  }
2008  }
2009  }
2010 
// Gnomon models with a complete 5' end: translate the in-frame 5' UTR and, if
// it contains a stop, annotate the last one on the transcript.
2011  if (!gnomon_model_num.empty() && !is_partial_5prime) {
2012  int cds_start = cds_feat_on_transcribed_mrna->GetLocation().GetTotalRange().GetFrom();
2013  if (cds_start >= 3) {
2014  CBioseq_Handle rna_handle =
2015  m_scope->GetBioseqHandle(*cds_feat_on_transcribed_mrna->GetLocation().GetId());
2016 
2017  string strprot;
2018  if (rna_handle) {
2019  CSeqVector vec(rna_handle, CBioseq_Handle::eCoding_Iupac);
2020  string mrna;
// Start at cds_start % 3 so the upstream sequence is read in the CDS frame.
2021  vec.GetSeqData(cds_start % 3, cds_start, mrna);
2022  const CGenetic_code *code = NULL;
2023  if (cds_feat_on_transcribed_mrna->GetData().GetCdregion().IsSetCode()) {
2024  code = &cds_feat_on_transcribed_mrna->GetData().GetCdregion().GetCode();
2025  }
2027  (mrna, strprot,
2029  }
2030  SIZE_TYPE stop_5prime = strprot.rfind('*');
2031  if (stop_5prime != NPOS) {
// Convert the residue index of the last upstream stop back to an mRNA
// coordinate.
2032  stop_5prime = stop_5prime*3+cds_start%3;
2033  CRef<CSeq_feat> stop_5prime_feature(new CSeq_feat);
2034  stop_5prime_feature->SetData().SetImp().SetKey("misc_feature");
2035  stop_5prime_feature->SetComment("upstream in-frame stop codon");
2036  CRef<CSeq_loc> stop_5prime_location(new CSeq_loc());
2037  stop_5prime_location->SetInt().SetFrom(stop_5prime);
2038  stop_5prime_location->SetInt().SetTo(stop_5prime+2);
2039  stop_5prime_location->SetInt().SetStrand(eNa_strand_plus);
2040  stop_5prime_location->SetId(*rna_handle.GetSeqId());
2041  stop_5prime_feature->SetLocation(*stop_5prime_location);
2042 
// NOTE(review): the annot iterator is dereferenced without checking that it
// found an annotation -- confirm the transcript always carries one here.
2044  sel.SetResolveNone();
2045  CAnnot_CI it(rna_handle, sel);
2046  it->GetEditHandle().AddFeat(*stop_5prime_feature);
2047  }
2048  }
2049  }
2050 
2051  /// also copy the code break if it exists
2052  if (cds_feat->GetData().GetCdregion().IsSetCode_break()) {
2053  SMapper mapper(align, *m_scope, 0, opts);
2054  mapper.IncludeSourceLocs();
2055  mapper.SetMergeNone();
2056 
2058  cds_feat->SetData().SetCdregion();
2059  CCdregion::TCode_break::iterator it =
2060  cds.SetCode_break().begin();
// Erase-while-iterating loop: `it` advances only when the entry is kept.
2061  for ( ; it != cds.SetCode_break().end(); ) {
2062  CSeq_loc code_break_loc;
2063  code_break_loc.Assign((*it)->GetLoc());
2064  code_break_loc.SetId(align.GetSeq_id(0)); // set query mrna id - the mapper maps from query mrna to genome
2065  CRef<CSeq_loc> new_cb_loc = mapper.Map(code_break_loc);
2066 
2067  // we may get an equiv. If we do, get just mapped loc
2068  if (new_cb_loc->IsEquiv()) {
2069  new_cb_loc = new_cb_loc->GetEquiv().Get().front();
2070  }
2071 
// Keep only the portion of the mapped code-break that lies inside the CDS.
2072  CRangeCollection<TSeqPos> new_cb_ranges;
2073  if (new_cb_loc && !new_cb_loc->IsNull()) {
2074  ITERATE (CSeq_loc, loc_it, *new_cb_loc) {
2075  new_cb_ranges += loc_it.GetRange();
2076  }
2077  new_cb_ranges &= loc_ranges;
2078  }
// A code-break is kept only when exactly one full codon (3 bases) survives.
2079  if (new_cb_ranges.GetCoveredLength() == 3) {
2080  (*it)->SetLoc(*new_cb_loc);
2081  ++it;
2082  } else {
2083  it = cds.SetCode_break().erase(it);
2084  }
2085  }
2086  if (cds.GetCode_break().empty()) {
2087  cds.ResetCode_break();
2088  }
2089  }
2090 
2091  }
2092 
2093  return cds_feat;
2094 }
2095 
2098 {
  /// Build a display name for the sequence behind `handle`, starting from
  /// its generated defline and trimming it down in several steps.
  /// Returns an empty string when `handle` is not valid.
2099  string name;
2100  if (handle) {
2101  name = sequence::CDeflineGenerator().GenerateDefline(handle);
  // Drop a leading organism name when the defline starts with it.
2102  try {
2103  const COrg_ref &org = sequence::GetOrg_ref(handle);
2104  if (org.IsSetTaxname() && NStr::StartsWith(name, org.GetTaxname())) {
2105  name.erase(0, org.GetTaxname().size());
2106  }
2107  }
2108  catch (CException&) {
  // Deliberately ignored: any CException raised inside the try block
  // (presumably when no org-ref can be found) leaves the name unchanged.
2109  }
  // Remove the mitochondrial-protein qualifier from the name.
2110  NStr::ReplaceInPlace(name, ", nuclear gene encoding mitochondrial protein",
2111  "");
  // If a gene feature with a locus exists on the sequence, the
  // parenthesised locus " (LOCUS)" is the text targeted below.
2112  CFeat_CI feat_iter(handle, CSeqFeatData::eSubtype_gene);
2113  if (feat_iter && feat_iter.GetSize() &&
2114  feat_iter->GetData().GetGene().IsSetLocus())
2115  {
  // NOTE(review): the call that receives these arguments is not visible
  // here; presumably another NStr::ReplaceInPlace - confirm.
2117  name, " (" + feat_iter->GetData().GetGene().GetLocus() + ')', "");
2118  }
  // Cut the name at its last comma, discarding the comma and what follows.
2119  size_t last_comma = name.rfind(',');
2120  if (last_comma != string::npos) {
2121  name.erase(last_comma);
2122  }
2124  }
2125  return name;
2126 }
2127 
2130 SImplementation::x_CreateNcRnaFeature(const CSeq_feat* ncrnafeature_on_mrna,
2131  const CSeq_align& align,
2132  CConstRef<CSeq_loc> loc,
2134 {
  /// Map an ncRNA feature annotated on the mRNA through `align`.
  ///
  /// The mapping is done only when fPropagateNcrnaFeats is set in m_flags
  /// and `ncrnafeature_on_mrna` is non-null; otherwise a null CRef is
  /// returned. The result is whatever x_MapFeature returns.
2135  CRef<CSeq_feat> ncrna_feat;
2136  if ((m_flags & fPropagateNcrnaFeats) && ncrnafeature_on_mrna != NULL) {
2137 
  // Out-parameter required by x_MapFeature; its value is not used here.
2138  TSeqPos offset;
  // Work on a private copy so the caller's location object is not the
  // one handed to x_MapFeature.
2139  CRef<CSeq_loc> non_const_loc(new CSeq_loc); // x_MapFeature requires non-const loc
2140  non_const_loc->Assign(*loc);
2141  ncrna_feat = x_MapFeature(ncrnafeature_on_mrna,
2142  align, non_const_loc, opts, offset);
2143  }
2144  return ncrna_feat;
2145 }
2146 
2147 namespace {
2148 CRef<CSeq_loc> ChangeToMix(const CSeq_loc& a)
2149 {
2150  CRef<CSeq_loc> a_mix(new CSeq_loc);
2151  a_mix->Assign(a);
2152  a_mix->ChangeToMix();
2153  return a_mix;
2154 }
2155 
2156 CRef<CSeq_loc> SubtractPreserveBiologicalOrder(const CSeq_loc& a, const CSeq_loc& b)
2157 {
2158  CRef<CSeq_loc> a_mix = ChangeToMix(a);
2159  CRef<CSeq_loc> b_mix = ChangeToMix(b);
2160 
2161  list< CRef< CSeq_loc > >& a_list = a_mix->SetMix().Set();
2162  const list< CRef< CSeq_loc > >& b_list = b_mix->GetMix().Get();
2163 
2164  ITERATE (list< CRef< CSeq_loc > >, b_i, b_list) {
2165  for (list< CRef< CSeq_loc > >::iterator a_i = a_list.begin(); a_i != a_list.end();) {
2166 
2167  CRef<CSeq_loc> diff = ChangeToMix(*(*a_i)->Subtract(**b_i, CSeq_loc::fSort, nullptr, nullptr));
2168  a_list.splice(a_i, diff->SetMix().Set());
2169  a_i = a_list.erase(a_i);
2170  }
2171  }
2172  if (a_list.size() == 1) {
2173  return a_list.front();
2174  }
2175  a_mix->ChangeToPackedInt();
2176  return a_mix;
2177 }
2178 }
2179 
2182 SImplementation::x_MapFeature(const objects::CSeq_feat* feature_on_mrna,
2183  const CSeq_align& align,
2184  CRef<CSeq_loc> loc,
2186  TSeqPos &offset)
2187 {
  /// Map a feature annotated on the mRNA through `align` and return a copy
  /// of the feature carrying the mapped location.
  ///
  /// `loc` is the already-mapped transcript location; each mapped feature
  /// segment is intersected with it, and for CDS features it may itself be
  /// trimmed and marked partial when the mapped CDS starts or ends in a gap.
  /// `offset` is written when the first segment maps, and is increased by
  /// the number of leading gap positions trimmed from a CDS.
  /// Returns a null CRef when nothing could be mapped.
2188  // from this point on, we will get complex locations back
2189  SMapper mapper(align, *m_scope, 0, opts);
2190  mapper.IncludeSourceLocs();
2191  mapper.SetMergeNone();
2192 
2193  CRef<CSeq_loc> mapped_loc;
2194 
2195  ///
2196  /// in general, feature has only one segment on mRNA; there are some corner cases
2197  /// (OAZ1, OAZ2, PAZ3, PEG10) in which there are more than one
2198  /// segment.
2199  /// we map each segment separately because we want to stitch genomic insertions,
2200  /// but not segment boundaries.
2201  ///
2202  for (CSeq_loc_CI loc_it(feature_on_mrna->GetLocation());
2203  loc_it; ++loc_it) {
2204  /// location for this interval
2205  CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
2206 
2207  /// map it
2208  CRef<CSeq_loc> equiv = mapper.Map(*this_loc);
2209  if ( !equiv ||
2210  equiv->IsNull() ||
2211  equiv->IsEmpty() ) {
2212  continue;
2213  }
2214 
2215  /// we are using a special variety that will tell us what
2216  /// portion really mapped
2217  ///
2218  /// the first part is the mapped location
2219 
2220  if (equiv->GetEquiv().Get().size() != 2) {
2222  "failed to find requisite parts of "
2223  "mapped seq-loc");
2224  }
2225  CRef<CSeq_loc> this_loc_mapped =
2226  equiv->GetEquiv().Get().front();
2227  if ( !this_loc_mapped ||
2228  this_loc_mapped->IsNull() ||
2229  this_loc_mapped->IsEmpty() ) {
2230  continue;
2231  }
2232 
2233  if ( !mapped_loc ) {
2234  mapped_loc.Reset(new CSeq_loc);
2235  /// This is start of mapped location; record offset
  /// (start of the second equiv part minus start of the feature on mRNA)
2236  offset = equiv->GetEquiv().Get().back()->GetTotalRange().GetFrom() -
2237  feature_on_mrna->GetLocation().GetTotalRange().GetFrom();
2238  }
2239 
2240  bool is_partial_5prime =
2241  this_loc_mapped->IsPartialStart(eExtreme_Biological);
2242  bool is_partial_3prime =
2243  this_loc_mapped->IsPartialStop(eExtreme_Biological);
2244 
  // Look one step ahead to find out whether this is the final segment.
2245  CSeq_loc_CI it1 = loc_it;
2246  bool last_range = !++it1;
2247  if (is_partial_3prime && last_range &&
2248  align.GetSegs().IsSpliced() &&
2249  align.GetSegs().GetSpliced().IsSetPoly_a() &&
2250  feature_on_mrna->GetData().IsCdregion() &&
2251  !this_loc->IsPartialStop(eExtreme_Biological))
2252  {
2253  TSeqPos missing_end =
2254  this_loc->GetTotalRange().GetTo() -
2255  equiv->GetEquiv().Get().back()->GetTotalRange().GetTo();
2256  if (missing_end < 3) {
2257  /// alignment truncates last one or two bases of CDS; stop codon
2258  /// completed by poly-a tail. This should not be annotated as partial
2259  is_partial_3prime = false;
2260  }
2261  }
2262 
2263  /// stitch genomic insertions
2264  /// we take the extreme bounds of the interval only;
2265  /// internal details will be recomputed based on intersection
2266  /// with the mRNA location
2267 
2268  ENa_strand strand = this_loc_mapped->GetStrand();
2269  CSeq_loc sub;
2270  sub.SetInt().SetFrom(this_loc_mapped->GetStart(eExtreme_Positional));
2271  sub.SetInt().SetTo(this_loc_mapped->GetStop(eExtreme_Positional));
2272  sub.SetInt().SetStrand(loc->GetStrand());
2273  sub.SetInt().SetId().Assign(*this_loc_mapped->GetId());
2274 
  // A start position greater than the stop position is taken to mean the
  // mapped segment wraps around the origin; it is then split into two
  // intervals, [from, genomic_size-1] and [0, right].
2275  int left = sub.GetInt().GetFrom();
2276  int right = sub.GetInt().GetTo();
2277  bool cross_origin = (left > right);
2278  if (cross_origin) {
2279 
2280  TSeqPos genomic_size = m_scope->GetSequenceLength(*this_loc_mapped->GetId());
2281 
2283  half->Assign(sub.GetInt());
2284  half->SetTo(genomic_size-1);
2285  sub.SetPacked_int().AddInterval(*half);
2286  half->SetFrom(0);
2287  half->SetTo(right);
2288  sub.SetPacked_int().AddInterval(*half);
2289  }
2290 
  // Recover the internal (exon) structure from the transcript location,
  // then restore the strand of the mapped segment.
2291  this_loc_mapped = loc->Intersect(sub,
2293  NULL);
2294  this_loc_mapped->SetStrand(strand);
2295 
2296  if (this_loc_mapped->IsMix()) {
2297  /// Propagate any internal fuzzy boundaries on the mRNA to the CDS
2298  set<TSeqPos> mrna_fuzzy_boundaries;
2299  ITERATE (CSeq_loc, subloc_it, *loc) {
2300  if (subloc_it.GetRangeAsSeq_loc()->
2301  IsPartialStart(eExtreme_Positional))
2302  {
2303  mrna_fuzzy_boundaries.insert(
2304  subloc_it.GetRange().GetFrom());
2305  }
2306  if (subloc_it.GetRangeAsSeq_loc()->
2307  IsPartialStop(eExtreme_Positional))
2308  {
2309  mrna_fuzzy_boundaries.insert(
2310  subloc_it.GetRange().GetTo());
2311  }
2312  }
2313 
  // Each sub-location is marked partial at an end exactly when that
  // end coincides with a fuzzy boundary collected above.
2315  this_loc_mapped->SetMix().Set())
2316  {
2317  (*subloc_it)->SetPartialStart(
2318  mrna_fuzzy_boundaries.count(
2319  (*subloc_it)->GetStart(eExtreme_Positional)),
2321  (*subloc_it)->SetPartialStop(
2322  mrna_fuzzy_boundaries.count(
2323  (*subloc_it)->GetStop(eExtreme_Positional)),
2325  }
2326  }
2327 
2328  if (cross_origin) {
2329  this_loc_mapped = FixOrderOfCrossTheOriginSeqloc(*this_loc_mapped,
2330  (left+right)/2);
2331  }
2332  this_loc_mapped->SetPartialStart(is_partial_5prime, eExtreme_Biological);
2333  this_loc_mapped->SetPartialStop(is_partial_3prime, eExtreme_Biological);
2334 
2335  mapped_loc->SetMix().Set().push_back(this_loc_mapped);
2336  }
  // Collapse the collected segments into a packed-int on the id of `loc`.
2337  if (mapped_loc) {
2338  mapped_loc->ChangeToPackedInt();
2339  mapped_loc->SetId(*loc->GetId());
2340  }
2341 
2343  mapped_loc = FixOrderOfCrossTheOriginSeqloc(*mapped_loc,
2345  }
2346 
2347  if (mapped_loc && feature_on_mrna->GetData().IsRna())
2348  {
  // NOTE(review): this test uses CanGetPoly_a() whereas the CDS check
  // earlier in this function uses IsSetPoly_a() - confirm the difference
  // is intended.
2349  if (mapped_loc->IsPartialStop(eExtreme_Biological) &&
2350  !feature_on_mrna->GetLocation().IsPartialStop(eExtreme_Biological) &&
2351  align.GetSegs().IsSpliced() &&
2352  align.GetSegs().GetSpliced().CanGetPoly_a())
2353  {
2354  /// When propagating RNA feature, don't create fuzz at 3' end if
2355  /// alignment has poly-a flag
2356  mapped_loc->SetPartialStop(false, eExtreme_Biological);
2357  }
  // Fuzz that exists only on the mapped location (not on the original
  // feature) is re-examined: the mapped end is mapped back with a second
  // mapper and compared with the original end.
2358  if ((mapped_loc->IsPartialStart(eExtreme_Biological) &&
2359  !feature_on_mrna->GetLocation().IsPartialStart(eExtreme_Biological)) ||
2360  (mapped_loc->IsPartialStop(eExtreme_Biological) &&
2361  !feature_on_mrna->GetLocation().IsPartialStop(eExtreme_Biological)))
2362  {
2363  CSeq_loc_Mapper reverse_mapper(align, 0, m_scope.GetPointer(), opts);
2364  CSeq_id &mapped_loc_id = const_cast<CSeq_id &>(*mapped_loc->GetId());
2365  TSignedSeqPos feat_start = feature_on_mrna->GetLocation().GetStart(eExtreme_Biological);
2366  CSeq_loc start_loc(mapped_loc_id, mapped_loc->GetStart(eExtreme_Biological));
2367  TSignedSeqPos mapped_start = reverse_mapper.Map(start_loc)->GetStart(eExtreme_Biological);
2368  if (!feature_on_mrna->GetLocation().IsPartialStart(eExtreme_Biological) &&
2369  TSeqPos(abs(feat_start - mapped_start)) <= m_allowed_unaligned)
2370  {
2371  /// No fuzz in original, and overhang is within limits; shouldn't have fuzz
2372  mapped_loc->SetPartialStart(false, eExtreme_Biological);
2373  }
2374  TSignedSeqPos feat_stop = feature_on_mrna->GetLocation().GetStop(eExtreme_Biological);
2375  CSeq_loc stop_loc(mapped_loc_id, mapped_loc->GetStop(eExtreme_Biological));
2376  TSignedSeqPos mapped_stop = reverse_mapper.Map(stop_loc)->GetStop(eExtreme_Biological);
2377  if (!feature_on_mrna->GetLocation().IsPartialStop(eExtreme_Biological) &&
2378  TSeqPos(abs(feat_stop - mapped_stop)) <= m_allowed_unaligned)
2379  {
2380  /// No fuzz in original, and overhang is within limits; shouldn't have fuzz
2381  mapped_loc->SetPartialStop(false, eExtreme_Biological);
2382  }
2383  }
2384  }
2385 
2386  if (mapped_loc && feature_on_mrna->GetData().IsCdregion()) {
2387  /// For CDS features, trim not to begin/end in gaps
2388  /// Trim beginning
2389  CSeqVector vec(*mapped_loc, *m_scope);
  // Count leading gap positions.
  // NOTE(review): the scan has no explicit upper bound; behaviour when the
  // whole vector is gap depends on IsInGap() past the end - confirm.
2390  TSeqPos start_gap = 0;
2391  for (; vec.IsInGap(start_gap); ++start_gap);
2392  if (start_gap > 0 && start_gap < vec.size()) {
2393  offset += start_gap;
2394 
2395  CSeq_loc orig_mapped_loc;
2396 
  // When the CDS starts where the transcript starts, the transcript
  // location is trimmed by the same amount further below.
2397  bool no_utr = mapped_loc->GetStart(eExtreme_Biological) == loc->GetStart(eExtreme_Biological);
2398  if (no_utr) {
2399  orig_mapped_loc.Assign(*mapped_loc);
2400  }
2401 
  // Drop whole leading intervals that lie entirely inside the gap ...
2402  while (mapped_loc->SetPacked_int().Set().front()->GetLength()
2403  <= start_gap)
2404  {
2405  start_gap -= mapped_loc->SetPacked_int().Set().front()->GetLength();
2406  mapped_loc->SetPacked_int().Set().pop_front();
2407  }
  // ... then shorten the first remaining interval on its biological
  // start side (To on the minus strand, From otherwise).
2408  if (start_gap) {
2409  CSeq_interval &first_exon =
2410  *mapped_loc->SetPacked_int().Set().front();
2411  if (first_exon.GetStrand() == eNa_strand_minus) {
2412  first_exon.SetTo() -= start_gap;
2413  } else {
2414  first_exon.SetFrom() += start_gap;
2415  }
2416  }
2417  mapped_loc->SetPartialStart(true, eExtreme_Biological);
2418 
2419  if (no_utr) {
  // (orig - trimmed) is the removed part; subtract it from `loc`.
2420  loc->Assign(*SubtractPreserveBiologicalOrder(*loc, *SubtractPreserveBiologicalOrder(orig_mapped_loc, *mapped_loc)));
2421  loc->SetPartialStart(true, eExtreme_Biological);
2422  }
2423  }
  /// Trim end, mirroring the steps used for the beginning.
  // Note that `vec` was built before the start was trimmed, so its size
  // still reflects the untrimmed location.
2424  TSeqPos end_gap = 0;
2425  for (; vec.IsInGap(vec.size() - 1 - end_gap); ++end_gap);
2426  if (end_gap > 0 && end_gap < vec.size()) {
2427  CSeq_loc orig_mapped_loc;
2428 
2429  bool no_utr = mapped_loc->GetStop(eExtreme_Biological) == loc->GetStop(eExtreme_Biological);
2430  if (no_utr) {
2431  orig_mapped_loc.Assign(*mapped_loc);
2432  }
2433 
2434  while (mapped_loc->SetPacked_int().Set().back()->GetLength() <= end_gap)
2435  {
2436  end_gap -= mapped_loc->SetPacked_int().Set().back()->GetLength();
2437  mapped_loc->SetPacked_int().Set().pop_back();
2438  }
2439  if (end_gap) {
2440  CSeq_interval &last_exon =
2441  *mapped_loc->SetPacked_int().Set().back();
2442  if (last_exon.GetStrand() == eNa_strand_minus) {
2443  last_exon.SetFrom() += end_gap;
2444  } else {
2445  last_exon.SetTo() -= end_gap;
2446  }
2447  }
2448  mapped_loc->SetPartialStop(true, eExtreme_Biological);
2449  if (no_utr) {
2450  loc->Assign(*SubtractPreserveBiologicalOrder(*loc, *SubtractPreserveBiologicalOrder(orig_mapped_loc, *mapped_loc)));
2451  loc->SetPartialStop(true, eExtreme_Biological);
2452  }
2453  }
2454  }
2455 
  // Build the result: a copy of the source feature, with its id reset,
  // placed on the mapped location.
2456  CRef<CSeq_feat> mapped_feat;
2457  if (mapped_loc && mapped_loc->Which() != CSeq_loc::e_not_set) {
2458  mapped_feat.Reset(new CSeq_feat());
2459  mapped_feat->Assign(*feature_on_mrna);
2460  mapped_feat->ResetId();
2461 
2462  mapped_feat->SetLocation(*mapped_loc);
2463  }
2464  return mapped_feat;
2465 }
2466 
2469  CRef<CSeq_feat> mrna_feat,
2470  CRef<CSeq_feat> propagated_feat)
2471 {
  /// Set the Partial flag and the fuzzy ends of a gene / mRNA / propagated
  /// feature group so that they agree with one another. Any of the three
  /// features may be null; every step below checks the features it uses.
  /// Flags and fuzz are only ever switched on here, never cleared.

  /// the propagated feature (and the gene) is partial as soon as any of
  /// its intervals has a fuzzy end
2472  if(propagated_feat){
2473  for (CSeq_loc_CI loc_it(propagated_feat->GetLocation()); loc_it; ++loc_it) {
2474  if (loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Biological) || loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Biological)) {
2475  propagated_feat->SetPartial(true);
2476  if(gene_feat)
2477  gene_feat->SetPartial(true);
2478  break;
2479  }
2480  }
2481  }
2482 
2483  ///
2484  /// partial flags may require a global analysis - we may need to mark some
2485  /// locations partial even if they are not yet partial
2486  ///
2487  if (mrna_feat && propagated_feat)
2488  {
2489  /// in addition to marking the mrna feature partial, we must mark the
2490  /// location partial to match the partialness in the CDS
  /// (only where the two features share the same biological start / stop)
2491  CSeq_loc& propagated_feat_loc = propagated_feat->SetLocation();
2492  CSeq_loc& mrna_loc = mrna_feat->SetLocation();
2493  if (propagated_feat_loc.IsPartialStart(eExtreme_Biological) &&
2494  propagated_feat_loc.GetStart(eExtreme_Biological) ==
2495  mrna_loc.GetStart(eExtreme_Biological)) {
2496  mrna_loc.SetPartialStart(true, eExtreme_Biological);
2497  }
2498  if (propagated_feat_loc.IsPartialStop(eExtreme_Biological) &&
2499  propagated_feat_loc.GetStop(eExtreme_Biological) ==
2500  mrna_loc.GetStop(eExtreme_Biological)) {
2501  mrna_loc.SetPartialStop(true, eExtreme_Biological);
2502  }
2503  }
2504 
2505  /// set the partial flag for mrna_feat if it has any fuzzy intervals
  /// (this runs after the step above, so fuzz copied from the propagated
  /// feature is taken into account)
2506  if(mrna_feat){
2507  for (CSeq_loc_CI loc_it(mrna_feat->GetLocation()); loc_it; ++loc_it) {
2508  if (loc_it.GetRangeAsSeq_loc()->IsPartialStart(eExtreme_Biological) || loc_it.GetRangeAsSeq_loc()->IsPartialStop(eExtreme_Biological)) {
2509  mrna_feat->SetPartial(true);
2510  if(gene_feat)
2511  gene_feat->SetPartial(true);
2512  break;
2513  }
2514  }
2515  }
2516 
2517  ///
2518  /// set gene partialness if mRNA is partial
2519  if (gene_feat && mrna_feat){
2520  CSeq_loc& mrna_loc = mrna_feat->SetLocation();
2521  CSeq_loc& gene_loc = gene_feat->SetLocation();
2522  if (mrna_loc.IsPartialStart(eExtreme_Biological)) {
2523  gene_loc.SetPartialStart(true, eExtreme_Biological);
2524  }
2525  if (mrna_loc.IsPartialStop(eExtreme_Biological)) {
2526  gene_loc.SetPartialStop(true, eExtreme_Biological);
2527  }
2528  }
2529 
2530  ///
2531  /// set gene partialness if CDS is partial
  /// (used only when there is no mRNA feature to take the fuzz from)
2532  if (gene_feat && propagated_feat && !mrna_feat){
2533  CSeq_loc& propagated_loc = propagated_feat->SetLocation();
2534  CSeq_loc& gene_loc = gene_feat->SetLocation();
2535  if (propagated_loc.IsPartialStart(eExtreme_Biological)) {
2536  gene_loc.SetPartialStart(true, eExtreme_Biological);
2537  }
2538  if (propagated_loc.IsPartialStop(eExtreme_Biological)) {
2539  gene_loc.SetPartialStop(true, eExtreme_Biological);
2540  }
2541  }
2542 }
2543 
2544 /// Check whether range1 contains range2
2545 static inline bool s_Contains(const TSeqRange &range1, const TSeqRange &range2)
2546 {
2547  return range1.GetFrom() <= range2.GetFrom() &&
2548  range1.GetTo() >= range2.GetTo();
2549 }
2550 
2553 {
  /// Recompute the Partial flags of all features in `annot`, and the fuzzy
  /// ends of its gene features, by clearing them and then calling
  /// SetPartialFlags for every gene / RNA / child-feature combination found
  /// in the feature tree.
2555  CSeq_annot_Handle sah = scope.AddSeq_annot(annot);
2556 
2557  /// We're going to recalculate Partial flags for all features,
2558  /// and fuzzy ends for gene features; reset them if they're currently set
2559  for(CFeat_CI ci(sah); ci; ++ci){
2560  CSeq_feat_EditHandle handle(*ci);
  // The const is cast away so the feature object can be modified directly.
2561  CRef<CSeq_feat> feat(const_cast<CSeq_feat*>(handle.GetSeq_feat().GetPointer()));
2562  feat->ResetPartial();
2563  if(feat->GetData().IsGene()){
2564  feat->SetLocation().SetPartialStart(false, eExtreme_Biological);
2565  feat->SetLocation().SetPartialStop(false, eExtreme_Biological);
2566  }
2567  }
2568 
  // Children of the null feature are the top-level features of the tree.
2569  feature::CFeatTree tree(sah);
2570  vector<CMappedFeat> top_level_features = tree.GetChildren(CMappedFeat());
2571 
2572  /// Sort top features (i.e. Seq_feat objects with no parent) by type
2573  vector< vector<CMappedFeat> > top_level_features_by_type;
2574  top_level_features_by_type.resize(CSeqFeatData::e_MaxChoice);
2575 
2576  ITERATE(vector<CMappedFeat>, it, top_level_features)
2577  top_level_features_by_type[it->GetData().Which()].push_back(*it);
2578 
2579  /// Add null gene and rna features; this makes the programming easier for
2580  /// dealing with top-level rnas and top-level cd regions
2581  top_level_features_by_type[CSeqFeatData::e_Gene].push_back(CMappedFeat());
2582  top_level_features_by_type[CSeqFeatData::e_Rna].push_back(CMappedFeat());
2583 
2584  ITERATE(vector<CMappedFeat>, gene_it,
2585  top_level_features_by_type[CSeqFeatData::e_Gene])
2586  {
  // gene_feat stays null for the sentinel entry added above.
2587  CRef<CSeq_feat> gene_feat;
2588  if(*gene_it){
2589  CSeq_feat_EditHandle gene_handle(*gene_it);
2590  gene_feat.Reset(const_cast<CSeq_feat*>(gene_handle.GetSeq_feat().GetPointer()));
2591  }
2592  // Get gene's children; or, if we've reached the sentinel null gene
2593  // feature, get top-level rnas.
2594  vector<CMappedFeat> gene_children =
2595  gene_feat ? tree.GetChildren(*gene_it)
2596  : top_level_features_by_type[CSeqFeatData::e_Rna];
  // Sorted so that the look-ahead at (child_it+1) below sees the
  // children in a defined order.
2597  sort(gene_children.begin(), gene_children.end());
2598 
2599  ITERATE(vector<CMappedFeat>, child_it, gene_children){
  // child_feat stays null for the sentinel RNA entry.
2600  CRef<CSeq_feat> child_feat;
2601  if(*child_it){
2602  CSeq_feat_EditHandle child_handle(*child_it);
2603  child_feat.Reset(const_cast<CSeq_feat*>(child_handle.GetSeq_feat().GetPointer()));
2604  }
2605  if(child_feat && child_feat->GetData().IsCdregion()){
2606  // We have gene and cds with no RNA feature
2607  SetPartialFlags(gene_feat, CRef<CSeq_feat>(), child_feat);
2608  } else if(!child_feat || child_feat->GetData().IsRna()){
  // For a real RNA take its children from the tree; for the sentinel
  // take the top-level cd regions.
2609  vector<CMappedFeat> rna_children =
2610  child_feat ? tree.GetChildren(*child_it)
2611  : top_level_features_by_type[CSeqFeatData::e_Cdregion];
2612  /// When propagating a ncRNA feature, the propagated feature will have a range
2613  /// contained within the range of the newly-created RNA feature, and should be
2614  /// treated as its child. Unfortunately the logic of CFeatTree does not recognize
2615  /// one RNA feature as a child of the other, so we need to do that manually.
  // NOTE(review): child_feat can be null in this branch (sentinel entry);
  // if the next sibling is an ncRNA, the child_feat->GetLocation() call
  // below would be reached with a null CRef - confirm whether the sort
  // order makes that impossible.
2616  while((child_it+1) != gene_children.end() &&
2617  (child_it+1)->GetData().GetSubtype() == CSeqFeatData::eSubtype_ncRNA &&
2618  s_Contains(child_feat->GetLocation().GetTotalRange(),
2619  (child_it+1)->GetTotalRange())){
2620  rna_children.push_back(*(++child_it));
2621  }
2622  if(rna_children.empty()){
2623  // We have gene and RNA with no cds feature
2624  SetPartialFlags(gene_feat, child_feat, CRef<CSeq_feat>());
2625  } else {
2626  ITERATE(vector<CMappedFeat>, rna_child_it, rna_children){
2627  CRef<CSeq_feat> rna_child_feat;
2628  CSeq_feat_EditHandle rna_child_handle(*rna_child_it);
2629  rna_child_feat.Reset(const_cast<CSeq_feat*>(rna_child_handle.GetSeq_feat().GetPointer()));
2630  SetPartialFlags(gene_feat, child_feat, rna_child_feat);
2631  }
2632  }
2633  }
2634  }
2635  }
2636 }
2637 
2638 
2640  CConstRef<CSeq_feat> gene_feat,
2641  CConstRef<CSeq_feat> propagated_feature)
2642 {
  /// Post a warning for every dbxref database that appears on both the gene
  /// feature and the propagated feature with tags that do not match.
  /// Nothing is modified; the only effect is the ERR_POST warning.
  /// Does nothing when either feature is null or has no dbxrefs.
2643  if(!gene_feat || !gene_feat->IsSetDbxref() ||
2644  !propagated_feature || !propagated_feature->IsSetDbxref())
2645  return;
2646 
2647  ITERATE (CSeq_feat::TDbxref, gene_xref_it, gene_feat->GetDbxref())
2648  /// Special case for miRBase; the gene feature and propagated ncRNA features can
2649  /// legitimately have different tags for it
2650  if((*gene_xref_it)->GetDb() != "miRBase")
2651  ITERATE (CSeq_feat::TDbxref, propagated_xref_it, propagated_feature->GetDbxref())
  /// same database, but the two dbxrefs do not match
2652  if((*gene_xref_it)->GetDb() == (*propagated_xref_it)->GetDb() &&
2653  !(*gene_xref_it)->Match(**propagated_xref_it))
2654  {
  /// describe the propagated feature for the warning text
2655  string propagated_feature_desc;
2656  if(propagated_feature->GetData().IsCdregion())
2657  propagated_feature_desc = "corresponding cdregion";
2658  else {
2659  NCBI_ASSERT(propagated_feature->GetData().GetSubtype() == CSeqFeatData::eSubtype_ncRNA,
2660  "Unexpected propagated feature type");
2661  propagated_feature_desc = "propagated ncRNA feature";
2662  }
  /// append the product id when a product is available
2663  if(propagated_feature->CanGetProduct())
2664  propagated_feature_desc += " " + propagated_feature->GetProduct().GetId()->AsFastaString();
2665  ERR_POST(Warning << "Features for gene "
2666  << gene_feat->GetLocation().GetId()->AsFastaString()
2667  << " and " << propagated_feature_desc
2668  << " have " << (*gene_xref_it)->GetDb()
2669  << " dbxrefs with inconsistent tags");
2670  }
2671 }
2672 
2673 
2674 
2676 {
  /// When fPromoteAllFeatures is set in m_flags, copy every feature the
  /// selector finds on `handle` into `annot`, with its location replaced by
  /// the result of `mapper.Map()`. Does nothing when the flag is not set.
2677  if (m_flags & fPromoteAllFeatures) {
2678  SAnnotSelector sel;
  // NOTE(review): the rest of this selector set-up chain is not visible
  // here; any further restrictions on `sel` cannot be documented.
2679  sel.SetResolveAll()
2680  .SetAdaptiveDepth(true)
2684  for (CFeat_CI feat_iter(handle, sel); feat_iter; ++feat_iter) {
  // Copy the original feature, then move the copy to the mapped location.
2685  CRef<CSeq_feat> feat(new CSeq_feat());
2686  feat->Assign(feat_iter->GetOriginalFeature());
2687  CRef<CSeq_loc> new_loc =
2688  mapper.Map(feat_iter->GetLocation());
2689  feat->SetLocation(*new_loc);
2690 
2692 
2693  annot.SetData().SetFtable().push_back(feat);
2694  }
2695  }
2696 }
2697 
2698 
2699 
2700 //////////////////////////////////////////////////////////////////////////////
2701 
2702 
2703 ///
2704 /// Handle feature exceptions
2705 ///
/// Compare `feat` with its product and record the outcome on the feature
/// through x_SetExceptText and x_SetComment. Discrepancies are gathered from
/// a spliced alignment when one is available (the `align` argument, or one
/// found on the feature location), and otherwise from a base-by-base
/// comparison of the genomic and product sequences.
2707  const CSeq_align* align,
2708  CSeq_feat* cds_feat,
2709  const CSeq_feat* cds_feat_on_mrna)
2710 {
2711  if ( !feat.IsSetProduct() ) {
2712  ///
2713  /// corner case:
2714  /// we may be a CDS feature for an Ig locus
2715  /// check to see if we have an overlapping V/C/D/J/S region
2716  /// we trust only feature-id xrefs here
2717  ///
2718  if (feat.IsSetXref()) {
2719  CBioseq_Handle bsh = m_scope->GetBioseqHandle(*feat.GetLocation().GetId());
2720  const CTSE_Handle& tse = bsh.GetTSE_Handle();
2721 
2722  ITERATE (CSeq_feat::TXref, it, feat.GetXref()) {
2723  if ( !(*it)->IsSetId() ) {
2724  continue;
2725  }
2726 
  /// resolve the cross-referenced feature by its local id
  /// (integer or string form)
2727  CSeq_feat_Handle h;
2728  const CFeat_id& feat_id = (*it)->GetId();
2729  if (feat_id.IsLocal()) {
2730  if (feat_id.GetLocal().IsId()) {
2732  feat_id.GetLocal().GetId());
2733  } else {
2735  feat_id.GetLocal().GetStr());
2736  }
2737  }
2738 
2739  if (h) {
  /// NOTE(review): the case labels of this switch are not visible
  /// here; per the comment above they are presumably the
  /// V/C/D/J/S region subtypes - confirm.
2740  switch (h.GetData().GetSubtype()) {
2748  /// found it
2749  feat.SetExcept(true);
2750  feat.SetExcept_text
2751  ("rearrangement required for product");
2752  break;
2753 
2754  default:
2755  break;
2756  }
2757  }
2758  }
2759  }
2760  return;
2761  }
2762 
2763  ///
2764  /// check to see if there is a Spliced-seg alignment
2765  /// if there is, and it corresponds to this feature, we should use it to
2766  /// record our exceptions
2767  ///
2768 
2770  if (align && align->GetSegs().IsSpliced()) {
2771  al.Reset(align);
2772  }
  /// no usable alignment was passed in: search the alignments on the
  /// feature location and take the first spliced one that qualifies
2773  if ( !al ) {
2774  SAnnotSelector sel;
2775  sel.SetResolveAll();
2776  CAlign_CI align_iter(*m_scope, feat.GetLocation(), sel);
2777  for ( ; align_iter; ++align_iter) {
2778  const CSeq_align& this_align = *align_iter;
2779  if (this_align.GetSegs().IsSpliced() &&
2782  this_align.GetSeq_id(0),
2783  m_scope.GetPointer())) {
2784  al.Reset(&this_align);
2785  break;
2786  }
2787  }
2788  }
2789 
  /// discrepancy bookkeeping; the alignment-derived entries are positions
  /// on the product
2790  bool has_length_mismatch = false;
2791  //bool has_polya_tail = false;
2792  bool has_incomplete_polya_tail = false;
2793  bool partial_unaligned_section = false;
2794  CRangeCollection<TSeqPos> mismatch_locs;
2795  CRangeCollection<TSeqPos> insert_locs;
2796  CRangeCollection<TSeqPos> delete_locs;
2797  map<TSeqPos,TSeqPos> delete_sizes;
2798 
2799  CBioseq_Handle prod_bsh = m_scope->GetBioseqHandle(*feat.GetProduct().GetId());
2800  if ( !prod_bsh ) {
2801  /// Product doesn't exist (will happen for fake transcript when handling
2802  /// protein alignments); no basis for creating exceptions
2803  return;
2804  }
2805 
  /// a feature location longer than the product is a length mismatch
2807  if (loc_len > prod_bsh.GetBioseqLength()) {
2808  has_length_mismatch = true;
2809  }
2810 
2812  ///
2813  /// can do full comparison
2814  ///
2815 
2816  /// we know we have a Spliced-seg
2817  /// evaluate for gaps or mismatches
2818  TSeqPos prev_to = 0;
2819  ITERATE (CSpliced_seg::TExons, exon_it,
2820  al->GetSegs().GetSpliced().GetExons()) {
2821  const CSpliced_exon& exon = **exon_it;
2822  TSeqPos pos = exon.GetProduct_start().GetNucpos();
  /// product bases between two consecutive exons are unaligned: for a
  /// partial feature they only raise the flag, otherwise they are
  /// recorded as an insertion
2823  if (exon_it != al->GetSegs().GetSpliced().GetExons().begin()) {
2824  TSeqRange gap(prev_to+1, pos-1);
2825  if (gap.NotEmpty()) {
2826  if (feat.IsSetPartial()) {
2827  partial_unaligned_section = true;
2828  } else {
2829  insert_locs += gap;
2830  }
2831  }
2832  }
2833  prev_to = exon.GetProduct_end().GetNucpos();
2834  if (exon.IsSetParts()) {
  /// walk the exon parts, advancing `pos` along the product; the
  /// case labels are not visible here, the getter used in each
  /// branch identifies the kind of part being handled
2835  ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
2836  switch ((*part_it)->Which()) {
2838  pos += (*part_it)->GetMatch();
2839  break;
2841  mismatch_locs +=
2842  TSeqRange(pos, pos+(*part_it)->GetMismatch()-1);
2843  pos += (*part_it)->GetMismatch();
2844  break;
2846  pos += (*part_it)->GetDiag();
2847  break;
  /// genomic insertion: takes no product bases, so `pos` is not
  /// advanced; the size is remembered per position
2849  delete_locs += TSeqRange(pos, pos);
2850  delete_sizes[pos] = (*part_it)->GetGenomic_ins();
2851  break;
2853  insert_locs +=
2854  TSeqRange(pos, pos+(*part_it)->GetProduct_ins()-1);
2855  pos += (*part_it)->GetProduct_ins();
2856  break;
2857  default:
2858  break;
2859  }
2860  }
2861  }
2862  }
2863 
2864  /// Check against aligned range - see if there is a 5' or 3'
2865  /// discrepancy
2866  TSeqRange r = al->GetSeqRange(0);
2867  if (r.GetFrom() != 0) {
2868  if (feat.IsSetPartial()) {
2869  partial_unaligned_section = true;
2870  } else {
2871  insert_locs += TSeqRange(0, r.GetFrom()-1);
2872  }
2873  }
2874 
  /// expected end of the aligned region: the poly-A position if set,
  /// else the recorded product length, else the product sequence length
2875  TSeqPos max_align_len = 0;
2876  if (al->GetSegs().GetSpliced().IsSetPoly_a()) {
2877  //has_polya_tail = true;
2878  max_align_len = al->GetSegs().GetSpliced().GetPoly_a();
2879  } else if (al->GetSegs().GetSpliced().IsSetProduct_length()) {
2880  max_align_len = al->GetSegs().GetSpliced().GetProduct_length();
2881  } else {
2882  max_align_len = prod_bsh.GetBioseqLength();
2883  }
2884 
2885  if (r.GetTo() + 1 < max_align_len) {
2886  if (feat.IsSetPartial()) {
2887  partial_unaligned_section = true;
2888  } else {
2889  insert_locs += TSeqRange(r.GetTo()+1, max_align_len-1);
2890  }
2891  }
2892 
2893  /// also note the poly-A
2894  /**
2895  if (al->GetSegs().GetSpliced().IsSetPoly_a()) {
2896  has_polya_tail = true;
2897  }
2898  **/
2899  }
2900 
2901  if ( insert_locs.empty() && delete_locs.empty() && !partial_unaligned_section)
2902  {
2903  /// only compare for mismatches and 3' unaligned
2904  /// we assume that the feature is otherwise aligned
2905 
2906  CSeqVector nuc_vec(feat.GetLocation(), *m_scope,
2908 
2909  CSeqVector rna_vec(prod_bsh,
2911 
2912  CSeqVector::const_iterator prod_it = rna_vec.begin();
2913  CSeqVector::const_iterator prod_end = rna_vec.end();
2914 
2915  CSeqVector::const_iterator genomic_it = nuc_vec.begin();
2916  CSeqVector::const_iterator genomic_end = nuc_vec.end();
  /// mismatches taken from the alignment are discarded and recomputed
  /// by walking both sequences in step
2917  mismatch_locs.clear();
2918 
2919  for ( ; prod_it != prod_end && genomic_it != genomic_end;
2920  ++prod_it, ++genomic_it) {
2921  if (*prod_it != *genomic_it) {
2922  mismatch_locs += TSeqRange(prod_it.GetPos(), prod_it.GetPos());
2923  }
2924  }
2925 
  /// whatever is left of the product is a 3' tail; count its 'A's
2926  unsigned tail_len = Convert(prod_end - prod_it);
2927  size_t count_a = 0;
2928  for ( ; prod_it != prod_end; ++prod_it) {
2929  if (*prod_it == 'A') {
2930  ++count_a;
2931  }
2932  }
2933 
  /// at least 80% A: treated as a poly-A tail, and flagged as incomplete
  /// when below 95% A; anything else is an unaligned 3' section
2934  if (tail_len && count_a >= tail_len * 0.8) {
2935  //has_polya_tail = true;
2936  if (count_a < tail_len * 0.95) {
2937  has_incomplete_polya_tail = true;
2938  }
2939  }
2940  else if (tail_len) {
2941  if (feat.IsSetPartial()) {
2942  partial_unaligned_section = true;
2943  } else {
  /// NOTE(review): end_pos comes from the feature (genomic) location,
  /// while the other insert_locs entries are product positions -
  /// confirm this is intended.
2944  TSeqPos end_pos = feat.GetLocation().GetTotalRange().GetTo();
2945  insert_locs += TSeqRange(end_pos-tail_len+1, end_pos);
2946  }
2947  }
2948  }
2949 
  /// mismatches alone give the milder text; any other discrepancy gives the
  /// "unclassified" text; no discrepancy leaves the text empty
2950  string except_text;
2951  if (!insert_locs.empty() ||
2952  !delete_locs.empty() ||
2953  has_length_mismatch ||
2954  has_incomplete_polya_tail ||
2955  partial_unaligned_section) {
2956  except_text = "unclassified transcription discrepancy";
2957  }
2958  else if (!mismatch_locs.empty()) {
2959  except_text = "mismatches in transcription";
2960  }
2961 
2962  x_SetExceptText(feat, except_text);
2963  x_SetComment(feat, cds_feat, cds_feat_on_mrna, align, mismatch_locs,
2964  insert_locs, delete_locs, delete_sizes,
2965  partial_unaligned_section);
2966 }
2967 
2969  TSeqPos pos, CRef<CSeq_id> mapped_protein_id,
2970  const CRangeCollection<TSeqPos> &product_ranges,
2971  CRef<CSeq_loc_Mapper> to_mrna, CRef<CSeq_loc_Mapper> to_genomic)
2972 {
  /// Map the single position `pos` on `mapped_protein_id` first through
  /// `to_mrna` and then through `to_genomic`, and return the result with
  /// both partial-end markers cleared.
  /// Returns a null CRef when `to_mrna` is null.
  /// NOTE(review): `to_genomic` is dereferenced without a null check;
  /// callers presumably always pass it together with `to_mrna` - confirm.
2973  CRef<CSeq_loc> mapped;
2974  if (to_mrna) {
  // `pos` is an offset counted across the concatenated product_ranges;
  // walk the ranges to turn it into an absolute position. If the offset
  // exceeds the total length, `pos` ends up as the leftover offset.
2975  ITERATE (CRangeCollection<TSeqPos>, range_it, product_ranges) {
2976  if (range_it->GetLength() > pos) {
2977  pos += range_it->GetFrom();
2978  break;
2979  } else {
2980  pos -= range_it->GetLength();
2981  }
2982  }
2983  CSeq_loc base_loc(*mapped_protein_id, pos, pos);
2984  CRef<CSeq_loc> mrna_loc = to_mrna->Map(base_loc);
2985  mapped = to_genomic->Map(*mrna_loc);
2986  mapped->SetPartialStart(false, eExtreme_Biological);
2987  mapped->SetPartialStop(false, eExtreme_Biological);
2988  }
2989  return mapped;
2990 }
2991 
2992 
/// Decide which translation exception (if any) applies to a CDS feature and
/// record it through x_SetExceptText().
///
/// The annotated product protein is compared with the conceptual
/// translation; the outcome is either no exception,
/// "mismatches in translation" or "unclassified translation discrepancy".
///
/// align                         may be NULL; the feature is then translated
///                               directly from its location
/// cds_feat_on_query_mrna        may be NULL; when given, coordinates are
///                               taken from it and mappers are built from it
/// cds_feat_on_transcribed_mrna  dereferenced whenever
///                               cds_feat_on_query_mrna is non-NULL
/// transcribed_mrna_seqloc_refs  not referenced in the lines visible here
/// clean_match_count             optional output; checked for NULL before
///                               every use
2994  const CSeq_align* align,
2995  const CSeq_feat* cds_feat_on_query_mrna,
2996  const CSeq_feat* cds_feat_on_transcribed_mrna,
2997  list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs,
2998  TSeqPos *clean_match_count)
2999 {
3000  if ( !feat.IsSetProduct()
3001  || ( cds_feat_on_query_mrna && !cds_feat_on_query_mrna->IsSetProduct() )
3002  ) {
3003  ///
3004  /// corner case:
3005  /// we may be a CDS feature for an Ig locus
3006  /// check to see if we have an overlapping V/C/D/J/S region
3007  /// we trust only feature-id xrefs here
3008  ///
3009  if (feat.IsSetXref()) {
3010  CBioseq_Handle bsh = m_scope->GetBioseqHandle(*feat.GetLocation().GetId());
3011  const CTSE_Handle& tse = bsh.GetTSE_Handle();
3012 
3013  ITERATE (CSeq_feat::TXref, it, feat.GetXref()) {
3014  if ( !(*it)->IsSetId() ) {
3015  continue;
3016  }
3017 
3018  CSeq_feat_Handle h;
3019  const CFeat_id& feat_id = (*it)->GetId();
3020  if (feat_id.IsLocal()) {
3021  if (feat_id.GetLocal().IsId()) {
3023  feat_id.GetLocal().GetId());
3024  } else {
3026  feat_id.GetLocal().GetStr());
3027  }
3028  }
3029 
3030  if (h) {
3031  switch (h.GetData().GetSubtype()) {
3039  /// found it
3040  feat.SetExcept(true);
3041  feat.SetExcept_text
3042  ("rearrangement required for product");
3043  break;
3044 
3045  default:
3046  break;
3047  }
3048  }
3049  }
3050  }
      /// Without a product there is nothing to compare against
3051  return;
3052  }
3053 
3054  ///
3055  /// exceptions here are easy:
3056  /// we compare the annotated product to the conceptual translation and
3057  /// report problems
3058  ///
3059  bool has_start = false;
3060  bool has_stop = false;
3061  TSeqPos mismatch_count = 0;
3062  bool has_gap = false;
3063  bool has_indel = false;
3064 
3065  string xlate;
3066 
3067  CRef<CSeq_feat> corrected_cds_feat_on_query_mrna;
3068  CRef<CSeq_feat> corrected_cds_feat_on_transcribed_mrna;
3069  if (cds_feat_on_query_mrna) {
3070  /// In some cases, the id in the CDS feature is not the same as in the
3071  /// alignment; make sure the mapping is done with matching ids
3072 
3073  corrected_cds_feat_on_query_mrna.Reset(new CSeq_feat);
3074  corrected_cds_feat_on_query_mrna->Assign(*cds_feat_on_query_mrna);
3075  corrected_cds_feat_on_query_mrna->SetLocation().SetId(align->GetSeq_id(0));
3076 
3077  corrected_cds_feat_on_transcribed_mrna.Reset(new CSeq_feat);
3078  corrected_cds_feat_on_transcribed_mrna->Assign(*cds_feat_on_transcribed_mrna);
3079  corrected_cds_feat_on_transcribed_mrna->SetLocation().SetId(align->GetSeq_id(0));
3080  }
3081 
3082  int cds_start_on_mrna = 0;
3083  int frame_on_mrna = 0;
3084  bool filled_by_polya = false;
3085 
3086  if (align != NULL) {
      /// Build the transcript sequence from the alignment and translate it
3087  CBioseq bioseq;
3088  x_CollectMrnaSequence(bioseq.SetInst(), *align, feat.GetLocation(), true, true, &has_gap, &has_indel);
3089 
3090  CSeqVector seq(bioseq, m_scope.GetPointer(),
3092 
3093  if (feat.GetProduct().GetId()->IsLocal()) {
3094  /// For locally generated product, product is generated from
3095  /// translation so we know it will fit the translation perfectly;
3096  /// only need exceptions if there are frameshifts
3097  if (has_indel) {
3098  string except_text = "unclassified translation discrepancy";
3099  x_SetExceptText(feat, except_text);
3100  }
3101  if (clean_match_count) {
3102  *clean_match_count = seq.size();
3103  }
3104  return;
3105  }
3106 
3107  CRef<CSeq_loc_Mapper> genomic_to_mrna(
3109 
3110  int cds_len_on_query_mrna = GetLength(feat.GetLocation(), NULL);
3111  int missing_end = 0;
3112  if (cds_feat_on_query_mrna) {
3113  cds_start_on_mrna =
3114  cds_feat_on_query_mrna->GetLocation().GetStart(eExtreme_Positional);
3115  cds_len_on_query_mrna = GetLength(cds_feat_on_query_mrna->GetLocation(), NULL);
3116  CRef<CSeq_loc> aligned_cds = genomic_to_mrna->Map(feat.GetLocation());
      /// Number of CDS bases on the mRNA past the end of the aligned CDS
3117  missing_end = cds_feat_on_query_mrna->GetLocation().GetStop(eExtreme_Positional)
3118  - aligned_cds->GetStop(eExtreme_Positional);
3119 
      /// Translate the Cdregion frame into a 0-based base offset
3120  if (cds_feat_on_query_mrna->GetData().GetCdregion().IsSetFrame()) {
3121  switch (cds_feat_on_query_mrna->GetData().GetCdregion().GetFrame()) {
3122  case CCdregion::eFrame_two :
3123  frame_on_mrna = 1;
3124  break;
3126  frame_on_mrna = 2;
3127  break;
3128  default :
3129  break;
3130  }
3131  }
3132  }
3133  string mrna;
3134  seq.GetSeqData(cds_start_on_mrna + frame_on_mrna, cds_start_on_mrna + cds_len_on_query_mrna, mrna);
3135  if ((missing_end == 1 || missing_end == 2) &&
3137  align->GetSegs().IsSpliced() &&
3138  align->GetSegs().GetSpliced().CanGetPoly_a())
3139  {
3140  /// One or two bases at end replaced by poly-a
3141  filled_by_polya = true;
      /// NOTE(review): the loop starts at size - 1 - missing_end and runs to
      /// the end, so it overwrites missing_end + 1 bases -- confirm intended
3142  for (size_t pos = mrna.size() - 1 - missing_end;
3143  pos < mrna.size(); ++pos)
3144  {
3145  mrna[pos] = 'A';
3146  }
3147  }
3148 
3149  const CGenetic_code *code = NULL;
3150  if (feat.GetData().GetCdregion().IsSetCode()) {
3151  code = &feat.GetData().GetCdregion().GetCode();
3152  }
3153  bool partial_start = feat.GetLocation().IsPartialStart(eExtreme_Biological);
3154  CSeqTranslator::Translate(mrna, xlate,
3155  partial_start
3158  code);
3159  if (xlate.size() && xlate[0] == '-') {
3160  /// First codon couldn't be translated as initial codon; translate
3161  /// as mid-sequence codon instead
3162  string first_codon = mrna.substr(0,3);
3163  string first_aa;
3164  CSeqTranslator::Translate(first_codon, first_aa,
3166  xlate[0] = first_aa[0];
3167  }
3168 
3169  // deal with code breaks
3170  // NB: these should be folded into the translation machinery instead...
3171  if (feat.GetData().GetCdregion().IsSetCode_break() && corrected_cds_feat_on_transcribed_mrna)
3172  {
3174  feat.GetData().GetCdregion().GetCode_break()) {
3175 
3176  const CSeq_loc& cb_on_genome = (*it)->GetLoc();
3177  CRef<CSeq_loc> cb_on_mrna = genomic_to_mrna->Map(cb_on_genome);
3178  if (!cb_on_mrna) continue;
3179 
      /// Only code breaks that map to exactly one codon are applied
3180  TSeqRange r = cb_on_mrna->GetTotalRange();
3181  if (r.GetLength() != 3) {
3182  continue;
3183  }
3184 
      /// Residue index of the code break within the translation
3185  int pos = (cb_on_mrna->GetStart(eExtreme_Biological)-(cds_start_on_mrna+frame_on_mrna))/3;
3186 
3187  string src;
3189 
3190  switch ((*it)->GetAa().Which()) {
3192  src += (char)(*it)->GetAa().GetNcbieaa();
3193  src_coding = CSeqUtil::e_Ncbieaa;
3194  break;
3195 
3197  src += (char)(*it)->GetAa().GetNcbistdaa();
3198  src_coding = CSeqUtil::e_Ncbistdaa;
3199  break;
3200 
3202  src += (char)(*it)->GetAa().GetNcbi8aa();
3203  src_coding = CSeqUtil::e_Ncbi8aa;
3204  break;
3205 
3206  default:
3207  break;
3208  }
3209 
3210  if (src.size()) {
3211  string dst;
3212  CSeqConvert::Convert(src, src_coding, 0, 1,
3213  dst, CSeqUtil::e_Ncbieaa);
      /// NOTE(review): pos is not range-checked against xlate.size() here
3214  xlate[pos] = dst[0];
3215  }
3216  }
3217  }
3218  } else {
      /// No alignment: translate the feature directly from its location
3219  CSeqTranslator::Translate(feat, *m_scope, xlate);
3220  }
3221 
3222  CRef<CSeq_loc_Mapper> to_prot;
3223  CRef<CSeq_loc_Mapper> to_mrna;
3224  CRef<CSeq_loc_Mapper> to_genomic;
3225  CRef<CSeq_id> mapped_protein_id;
3226  if (corrected_cds_feat_on_transcribed_mrna) {
3227  to_prot.Reset(
3228  new CSeq_loc_Mapper(*corrected_cds_feat_on_query_mrna,
3230  to_mrna.Reset(
3231  new CSeq_loc_Mapper(*corrected_cds_feat_on_query_mrna,
3233  to_genomic.Reset(
3235  mapped_protein_id.Reset(new CSeq_id);
3236  mapped_protein_id->Assign(*corrected_cds_feat_on_query_mrna->GetProduct().GetId());
3237  }
3238 
3239  CRef<CSeq_id> cds_id(new CSeq_id);
3240  cds_id->Assign(*feat.GetProduct().GetId());
3241 
3242  string actual;
3243  CRef<CSeq_loc> whole_product(new CSeq_loc);
3244  whole_product->SetWhole(*cds_id);
3245  CSeqVector vec(*whole_product, *m_scope,
3248  if (cds_feat_on_transcribed_mrna) {
3249  /// make sure we're comparing to aligned part of product
3250 
3251  CSeq_loc cds_feat_on_transcribed_mrna_loc;
3252  cds_feat_on_transcribed_mrna_loc.Assign(corrected_cds_feat_on_transcribed_mrna->GetLocation());
3253  if (cds_feat_on_transcribed_mrna_loc.GetStrand() == eNa_strand_minus) {
3254  cds_feat_on_transcribed_mrna_loc.FlipStrand();
3255  }
3256 
3257  CRef<CSeq_loc> aligned_range =
3258  align->CreateRowSeq_loc(0)->Intersect(cds_feat_on_transcribed_mrna_loc, 0, NULL);
3259  CRef<CSeq_loc> product_loc = to_prot->Map(*aligned_range);
      /// Rebuild product_ranges from the protein intervals covered by the
      /// aligned part of the CDS
3260  product_ranges.clear();
3261  ITERATE (CSeq_loc, loc_it, *product_loc) {
3262  product_ranges += loc_it.GetRange();
3263  }
3264 
3265 
3266  if (!corrected_cds_feat_on_transcribed_mrna->GetLocation().IsPartialStop(eExtreme_Biological) &&
3267  aligned_range->Intersect(corrected_cds_feat_on_transcribed_mrna->GetLocation(), 0, NULL)->GetStop(eExtreme_Biological) ==
3268  corrected_cds_feat_on_transcribed_mrna->GetLocation().GetStop(eExtreme_Biological)) {
3269  // trim off the stop
3270  product_ranges -= TSeqRange(product_ranges.GetTo(),
3271  product_ranges.GetTo());
3272  }
3273  }
3274 
      /// NOTE(review): xlate[xlate.size() - 1] is evaluated whenever the
      /// size/whole-range test passes; with an empty xlate and a whole-range
      /// product_ranges this would index out of bounds -- confirm xlate is
      /// never empty here
3275  if ((xlate.size() == product_ranges.GetTo() + (filled_by_polya ? 1 : 2) ||
3276  product_ranges.GetTo() == TSeqRange::GetWholeTo()) &&
3277  xlate[xlate.size() - 1] == '*')
3278  { /// strip a terminal stop
3279  xlate.resize(xlate.size() - 1);
3280  has_stop = true;
3281  }
3282  else if (feat.GetLocation().IsPartialStop(eExtreme_Biological)) {
      /// A 3'-partial CDS is not expected to end in a stop codon
3283  has_stop = true;
3284  } else {
3285  has_stop = false;
3286  }
3287 
3288  if ( (product_ranges.GetFrom()==0 && xlate.size() && xlate[0] == 'M') ||
3290  has_start = true;
3291  }
3292 
3293  if (product_ranges.Empty()) {
3294  has_gap = true;
3295  } else {
3296  string whole;
3297  vec.GetSeqData(0, vec.size(), whole);
3298  if (product_ranges[0].IsWhole()) {
3299  actual = whole;
3300  } else {
      /// Restrict both the annotated protein and the translation to the
      /// residues covered by product_ranges
3301  string xlate_trimmed;
3302  ITERATE (CRangeCollection<TSeqPos>, range_it, product_ranges) {
3303  actual += whole.substr(range_it->GetFrom(), range_it->GetLength());
3304  xlate_trimmed += xlate.substr(range_it->GetFrom(), range_it->GetLength());
3305  }
3306  xlate = xlate_trimmed;
3307  }
      /// Any part of the product left out of the comparison counts as a gap
3308  if (actual != whole) {
3309  has_gap = true;
3310  }
3311  }
3312 
3313  ///
3314  /// now, compare the two
3315  /// we deliberately look for problems here rather than using a direct
3316  /// string compare
3317  /// NB: we could actually compare lengths first; a length difference implies
3318  /// the unclassified translation discrepancy state, but we may expand these
3319  /// states in the future, so it's better to be explicit about our data
3320  /// recording first
3321  ///
3322 
3323  string::const_iterator it1 = actual.begin();
3324  string::const_iterator it1_end = actual.end();
3325  string::const_iterator it2 = xlate.begin();
3326  string::const_iterator it2_end = xlate.end();
3327 
      /// it1 walks the annotated protein, it2 the conceptual translation
3328  for ( ; it1 != it1_end && it2 != it2_end; ++it1, ++it2) {
3329  TSeqPos pos = Convert(it1 - actual.begin());
3330  CRef<CSeq_loc> mapped = s_MapSingleAA(pos,
3331  mapped_protein_id, product_ranges, to_mrna, to_genomic);
      /// Look for an existing code break located exactly at this residue
3332  CRef<CCode_break> code_break;
3333  if (mapped && feat.GetData().GetCdregion().IsSetCode_break()) {
3334  if (!mapped->IsInt()) {
3335  mapped->ChangeToPackedInt();
3336  }
3337  NON_CONST_ITERATE (CCdregion::TCode_break, it, feat.SetData().SetCdregion().SetCode_break()) {
3338  CCode_break & cb = **it;
3339  if (cb.GetLoc().Compare(*mapped, CSeq_loc::fCompare_Strand)==0) {
3340  code_break = *it;
3341  break;
3342  }
3343  }
3344  }
3345  if ((m_flags & fTrustProteinSeq) && *it2 == 'X' && code_break) {
3346  /// internal stop codon; change the code-break to actual aa
3347 
3348  if ((m_flags & fForceTranslateCds)) {
3349  // it's too late to change generated protein
3351  "fTrustProteinSeq & fForceTranslateCds combination not implemented");
3352  }
3353 
3354  char actual_aa = *it1;
3355  code_break->SetAa().SetNcbieaa(actual_aa);
3356 
3357  } else if (*it2 == '-' || *it2 == '*') {
3358  has_gap = true;
3359  } else if (*it1 != *it2) {
3360  ++mismatch_count;
3361  } else if (clean_match_count && (!mapped ||
3362  (mapped->IsInt() && mapped->GetTotalRange().GetLength() == 3)))
3363  {
      /// Matching residue that is unmapped or maps to one unsplit codon
3364  ++*clean_match_count;
3365  }
3366  }
3367 
3368  if (has_stop && filled_by_polya) {
      /// The stop codon was completed with poly-A bases: record it as a code
      /// break and explain it in the feature comment
3369  TSeqPos pos = Convert(xlate.size());
3370  CRef<CSeq_loc> mapped = s_MapSingleAA(pos, mapped_protein_id,
3371  product_ranges, to_mrna, to_genomic);
3372  if (mapped) {
3373  AddCodeBreak(feat, *mapped, '*');
3374  if (feat.IsSetComment()) {
3375  feat.SetComment() += "; ";
3376  } else {
3377  feat.SetComment("");
3378  }
3379  feat.SetComment() += "stop codon completed by the addition of "
3380  "3' A residues to the mRNA";
3381  } else {
3382  has_stop = false;
3383  }
3384  }
3385 
3386  string except_text;
3387 
3388  /// The process for setting the comment in some cases finds indels that our
3389  /// process here misses, so check the comment to determine if we have indels
3390  if (feat.IsSetComment() &&
3391  (feat.GetComment().find("indel") != string::npos ||
3392  feat.GetComment().find("inserted") != string::npos ||
3393  feat.GetComment().find("deleted") != string::npos))
3394  {
3395  has_indel = true;
3396  }
3397 
      /// Any structural problem gives the "unclassified" text; pure residue
      /// substitutions give the "mismatches" text; otherwise the text stays
      /// empty
3398  if (actual.size() != xlate.size() ||
3399  !has_stop || !has_start ||
3400  has_gap || has_indel) {
3401  except_text = "unclassified translation discrepancy";
3402  }
3403  else if (mismatch_count) {
3404  except_text = "mismatches in translation";
3405  }
3406 
3407  x_SetExceptText(feat, except_text);
3408 }
3409 
/// Merge a newly computed exception text into the feature's exception state.
///
/// Existing comma-separated exception tokens are kept, except for empty
/// tokens and the five texts this code manages itself, which are dropped so
/// they can be recomputed.  A non-empty text is then appended; for products
/// carrying a RefSeq N[MRP] accession it is replaced by
/// "annotated by transcript or proteomic data" and an "inference" qualifier
/// is added.  If nothing remains, the exception flag and text are reset.
///
/// feat  feature to update
/// text  new exception text; empty means "no exception found"
3411  CSeq_feat& feat, const string &text)
3412 {
3413  string except_text = text;
3414 
3415  list<string> except_toks;
3416  if (feat.IsSetExcept_text()) {
3417  NStr::Split(feat.GetExcept_text(), ",", except_toks, NStr::fSplit_Tokenize);
3418 
3419  for (list<string>::iterator it = except_toks.begin();
3420  it != except_toks.end(); ) {
3422  if (it->empty() ||
3423  *it == "annotated by transcript or proteomic data" ||
3424  *it == "unclassified transcription discrepancy" ||
3425  *it == "mismatches in transcription" ||
3426  *it == "unclassified translation discrepancy" ||
3427  *it == "mismatches in translation") {
      /// post-increment keeps the loop iterator valid across the erase
3428  except_toks.erase(it++);
3429  }
3430  else {
3431  ++it;
3432  }
3433  }
3434  }
3435 
3436  if ( !except_text.empty() ) {
3437  /// Check whether this is a Refseq product
      /// NOTE(review): feat.GetProduct() is read unconditionally here --
      /// assumes the product is set whenever a text is passed; confirm
3438  CBioseq_Handle bsh = m_scope->GetBioseqHandle(*feat.GetProduct().GetId());
      /// The if-statement below is the entire (brace-less) body of ITERATE
3439  ITERATE(CBioseq_Handle::TId, it, bsh.GetId())
3440  if(it->GetSeqId()->IsOther() &&
3441  it->GetSeqId()->GetOther().GetAccession()[0] == 'N' &&
3442  string("MRP").find(it->GetSeqId()->GetOther().GetAccession()[1]) != string::npos)
3443  {
3444  except_text = "annotated by transcript or proteomic data";
3445 
3446  /// Refseq exception has to be combined with an inference qualifier
3447  string product_type_string;
3448  if(feat.GetData().IsCdregion())
3449  product_type_string = "AA sequence";
3450  else {
3451  NCBI_ASSERT(feat.GetData().IsRna(), "Bad feature type");
3452  product_type_string = "RNA sequence";
3453  if(feat.GetData().GetRna().CanGetType() &&
3455  product_type_string += ", mRNA";
3456  }
3457  CRef<CGb_qual> qualifier(new CGb_qual);
3458  qualifier->SetQual("inference");
3459  qualifier->SetVal("similar to " + product_type_string + " (same species):RefSeq:" +
3460  it->GetSeqId()->GetOther().GetAccession() + '.' +
3461  NStr::IntToString(it->GetSeqId()->GetOther().GetVersion()));
3462  feat.SetQual().push_back(qualifier);
3463  }
3464 
3465  except_toks.push_back(except_text);
3466  }
3467  except_text = NStr::Join(except_toks, ", ");
3468 
3469  if (except_text.empty()) {
3470  // no exception; set states correctly
3471  feat.ResetExcept_text();
3472  feat.ResetExcept();
3473  } else {
3474  feat.SetExcept(true);
3475  feat.SetExcept_text(except_text);
3476  }
3477 }
3478 
/// Append an "inference" qualifier to feat naming the sequence id as the
/// supporting evidence.
///
/// The value has the form
///   "similar to RNA sequence[, mRNA] (same species):<db>:<id>"
/// where ", mRNA" is added when the sequence's MolInfo reports biomol mRNA,
/// <db> is "RefSeq" for an "other"-type id whose accession starts with 'N'
/// followed by 'M', 'R' or 'P' and "INSD" otherwise, and <id> is the string
/// form of the best id returned by sequence::GetId().
3480  CSeq_feat& feat, CSeq_id_Handle id)
3481 {
3482  CBioseq_Handle bsh = m_scope->GetBioseqHandle(id);
3483  CSeq_id_Handle best_id = sequence::GetId(id, *m_scope);
3484 
3485  string product_type_string = "RNA sequence";
3486  const CMolInfo *mol_info = s_GetMolInfo(bsh);
3487  if (mol_info && mol_info->CanGetBiomol() &&
3488  mol_info->GetBiomol() == CMolInfo::eBiomol_mRNA) {
3489  product_type_string += ", mRNA";
3490  }
3491 
3492  string db = "INSD";
      /// Same RefSeq accession test as the one used when setting exception
      /// texts: 'N' followed by one of M, R, P
3493  if(best_id.GetSeqId()->IsOther() &&
3494  best_id.GetSeqId()->GetOther().GetAccession()[0] == 'N' &&
3495  string("MRP").find(best_id.GetSeqId()->GetOther().GetAccession()[1]) != string::npos)
3496  {
3497  db = "RefSeq";
3498  }
3499 
3500  CRef<CGb_qual> qualifier(new CGb_qual);
3501  qualifier->SetQual("inference");
3502  qualifier->SetVal("similar to " + product_type_string + " (same species):"+db+":" +
3503  best_id.GetSeqId()->GetSeqIdString(true));
3504  feat.SetQual().push_back(qualifier);
3505 
3506 }
3507 
/// Recompute the exception state of a feature from its alignment.
///
/// Steps, in order:
///  1. a protein alignment is first converted with
///     TransformProteinAlignToTranscript() and the converted alignment is
///     used from then on;
///  2. every existing "inference" qualifier is removed, since the handlers
///     below add fresh ones where needed;
///  3. the work is dispatched on the feature data type to
///     x_HandleRnaExceptions() or x_HandleCdsExceptions().
///
/// align may be NULL; it is only dereferenced here after a NULL check.
3509  const CSeq_align* align,
3510  CSeq_feat* cds_feat,
3511  const CSeq_feat* cds_feat_on_query_mrna,
3512  const CSeq_feat* cds_feat_on_transcribed_mrna,
3513  list<CRef<CSeq_loc> >* transcribed_mrna_seqloc_refs,
3514  TSeqPos *clean_match_count)
3515 {
      /// Keeps the (possibly replaced) alignment alive while align points
      /// into it
3516  CConstRef<CSeq_align> align_ref;
3517 
3518  if (align && IsProteinAlign(*align)) {
3519  align_ref.Reset(align);
3520  CRef<CSeq_feat> fake_cds_feat;
3521  TransformProteinAlignToTranscript(align_ref, fake_cds_feat);
3522  align = align_ref.GetPointer();
3523  }
3524 
3525  // We're going to set the exception and add any needed inference qualifiers,
3526  // so if there's already an inference qualifier there, remove it.
3527  if (feat.IsSetQual()) {
3528  for (CSeq_feat::TQual::iterator it = feat.SetQual().begin();
3529  it != feat.SetQual().end(); )
3530  {
3531  if ((*it)->CanGetQual() && (*it)->GetQual() == "inference") {
3532  it = feat.SetQual().erase(it);
3533  }
3534  else {
3535  ++it;
3536  }
3537  }
      /// Leave the qualifier list unset rather than set-but-empty
3538  if (feat.GetQual().empty()) {
3539  feat.ResetQual();
3540  }
3541  }
3542 
3543  // Exceptions identified are:
3544  //
3545  // - unclassified transcription discrepancy
3546  // - mismatches in transcription
3547  // - unclassified translation discrepancy
3548  // - mismatches in translation
3549  switch (feat.GetData().Which()) {
3550  case CSeqFeatData::e_Rna:
3551  x_HandleRnaExceptions(feat, align, cds_feat, cds_feat_on_query_mrna);
3552  break;
3553 
3555  x_HandleCdsExceptions(feat, align,
3556  cds_feat_on_query_mrna, cds_feat_on_transcribed_mrna,
3557  transcribed_mrna_seqloc_refs,
3558  clean_match_count);
3559  break;
3560 
3561  case CSeqFeatData::e_Imp:
      /// Selected imp-feature subtypes are handled like RNA features, but
      /// without any CDS information
3562  switch (feat.GetData().GetSubtype()) {
3570  x_HandleRnaExceptions(feat, align, NULL, NULL);
3571  break;
3572 
3573  default:
3574  break;
3575  }
3576  break;
3577 
3578  default:
3579  break;
3580  }
3581 }
3582 
3583 static string s_Count(unsigned num, const string &item_name)
3584 {
3585  return NStr::NumericToString(num) + ' ' + item_name + (num == 1 ? "" : "s");
3586 }
3587 
/// Compose the human-readable comments (and, for best-RefSeq alignments, the
/// "AlignInfo" user object) that describe how the transcript and its CDS
/// differ from the genomic sequence, and attach them to the features.
///
/// cds_feat                   may be NULL; CDS-specific text is only built
///                            when it is given
/// cds_feat_on_mrna           may be NULL; supplies the CDS ranges on the mRNA
/// mismatch_locs/insert_locs/delete_locs
///                            difference positions collected by the caller
/// delete_sizes               deletion length keyed by deletion position
/// partial_unaligned_section  true when part of the transcript is unaligned
///
/// Two wording schemes exist: one when m_is_best_refseq is set and one when
/// m_is_gnomon is set; in all other cases no comment text is generated.
3589  CSeq_feat *cds_feat,
3590  const CSeq_feat *cds_feat_on_mrna,
3591  const CSeq_align *align,
3592  const CRangeCollection<TSeqPos> &mismatch_locs,
3593  const CRangeCollection<TSeqPos> &insert_locs,
3594  const CRangeCollection<TSeqPos> &delete_locs,
3595  map<TSeqPos,TSeqPos> &delete_sizes,
3596  bool partial_unaligned_section)
3597 {
      /// Nothing to report unless there is a difference, an unaligned section,
      /// or (Gnomon only) a CDS carrying code breaks
3598  if (mismatch_locs.empty() && insert_locs.empty() && delete_locs.empty() &&
3599  !partial_unaligned_section &&
3600  !(m_is_gnomon && cds_feat &&
3601  cds_feat->GetData().GetCdregion().IsSetCode_break()))
3602  {
3603  return;
3604  }
3605 
3606  string rna_comment, cds_comment;
3607  CRangeCollection<TSeqPos> inserts_in_cds, deletes_in_cds, cds_ranges;
3608  if (cds_feat) {
      /// Map each CDS interval onto the mRNA, then intersect with the indel
      /// positions to keep only the indels that fall inside the CDS
3610  for (CSeq_loc_CI loc_it(cds_feat->GetLocation()); loc_it; ++loc_it) {
3611  CRef<CSeq_loc> cds_on_mrna = to_mrna.Map(*loc_it.GetRangeAsSeq_loc());
3612  inserts_in_cds += cds_on_mrna->GetTotalRange();
3613  deletes_in_cds += cds_on_mrna->GetTotalRange();
3614  }
3615  inserts_in_cds &= insert_locs;
3616  deletes_in_cds &= delete_locs;
3617  }
3618  if (cds_feat_on_mrna) {
3619  for (CSeq_loc_CI loc_it(cds_feat_on_mrna->GetLocation());
3620  loc_it; ++loc_it)
3621  {
3622  cds_ranges += loc_it.GetRange();
3623  }
3624  }
3625 
3626  CRef<CUser_object> align_info(new CUser_object);
3627  align_info->SetType().SetStr("AlignInfo");
3628 
3629  if (m_is_best_refseq) {
3630  unsigned indel_count = Convert(insert_locs.size() + delete_locs.size());
3631  unsigned frameshift_count = 0;
3632  unsigned pct_coverage = 100, cds_pct_coverage = 100;
3633  if (partial_unaligned_section) {
3634  pct_coverage =
3636  cds_pct_coverage =
3638  cds_ranges);
3639  }
3640  if (cds_feat && cds_feat_on_mrna) {
      /// Classify CDS indels: a length that is not a multiple of three is a
      /// frameshift, anything else a non-frameshifting indel
3641  unsigned cds_indel_count = 0;
3642  ITERATE (CRangeCollection<TSeqPos>, it, inserts_in_cds) {
3643  ++(it->GetLength() % 3 ? frameshift_count : cds_indel_count);
3644  }
3645  ITERATE (CRangeCollection<TSeqPos>, it, deletes_in_cds) {
      /// NOTE(review): operator[] inserts a zero entry into delete_sizes when
      /// the position is absent
3646  ++(delete_sizes[it->GetFrom()] % 3 ? frameshift_count
3647  : cds_indel_count);
3648  }
      /// Frameshifts are reported separately from the other indels
3649  indel_count -= frameshift_count;
3650  unsigned cds_mismatch_count = 0;
3651  bool start_codon_mismatch = false;
3652  CSeqVector prot(cds_feat->GetProduct(), *m_scope,
3654  const CTrans_table &translate =
3656  CSeq_loc_Mapper to_mrna(*cds_feat_on_mrna,
3658  CSeq_loc_Mapper to_genomic(
3660  CRef<CSeq_id> cds_id(new CSeq_id);
3661  cds_id->Assign(*cds_feat->GetProduct().GetId());
3662  TSeqPos start_pos =
3664  bool single_interval_product = ++cds_feat->GetProduct().begin()
3665  == cds_feat->GetProduct().end();
3666  if (!single_interval_product) {
3668  "product is required to be a single interval");
3669  }
      /// Translate each genomic codon and count the residues that differ
      /// from the annotated protein
3670  for (TSeqPos pos = start_pos; pos < start_pos + prot.size(); ++pos)
3671  {
3672  CSeq_loc aa_loc(*cds_id, pos, pos);
3673  CRef<CSeq_loc> rna_codon = to_mrna.Map(aa_loc);
3674  CRef<CSeq_loc> genomic_codon = to_genomic.Map(*rna_codon);
3675  CSeqVector codon(*genomic_codon, *m_scope,
      /// Residues that do not map to exactly three genomic bases are skipped
3677  if (codon.size() == 3) {
3679  codon[0], codon[1], codon[2]);
3680  char translated_codon = pos == 0
3681  ? translate.GetStartResidue(state)
3682  : translate.GetCodonResidue(state);
3683  if (translated_codon != prot[pos]) {
3684  ++cds_mismatch_count;
3685  if (genomic_codon->GetStart(eExtreme_Biological) ==
3688  {
3689  start_codon_mismatch = true;
3690  }
3691  }
3692  }
3693  }
3694  if (cds_mismatch_count || cds_indel_count || frameshift_count || cds_pct_coverage < 100)
3695  {
      /// Assemble the sentence piece by piece; the first item is introduced
      /// with " has ", later ones with ", "
3696  cds_comment = "The RefSeq protein";
3697  if (cds_mismatch_count) {
3698  cds_comment += " has "
3699  + s_Count(cds_mismatch_count, "substitution");
3700  }
3701  if (frameshift_count) {
3702  cds_comment += (cds_mismatch_count ? ", " : " has ")
3703  + s_Count(frameshift_count, "frameshift");
3704  }
3705  if (cds_indel_count) {
3706  cds_comment += (cds_mismatch_count || frameshift_count ? ", " : " has ")
3707  + s_Count(cds_indel_count, "non-frameshifting indel");
3708  }
3709  if (cds_pct_coverage < 100) {
3710  if (cds_mismatch_count || cds_indel_count || frameshift_count) {
3711  cds_comment += " and";
3712  }
3713  cds_comment += " aligns at "
3714  + NStr::NumericToString(cds_pct_coverage)
3715  + "% coverage";
3716  }
3717  cds_comment += " compared to this genomic sequence";
3718  }
3719  if (start_codon_mismatch) {
3720  align_info->AddField("start_codon_mismatches", 1);
3721  }
3722  }
      /// Same construction for the transcript-level sentence
3723  rna_comment = "The RefSeq transcript";
3724  if (!mismatch_locs.empty()) {
3725  rna_comment += " has " +
3726  s_Count(mismatch_locs.GetCoveredLength(), "substitution");
3727  align_info->AddField("num_substitions", (int)mismatch_locs.GetCoveredLength());
3728  }
3729  if (frameshift_count) {
3730  rna_comment += (mismatch_locs.empty() ? " has " : ", ") +
3731  s_Count(frameshift_count, "frameshift");
3732  align_info->AddField("num_frameshifts", (int)frameshift_count);
3733  }
3734  if (indel_count) {
3735  rna_comment += (mismatch_locs.empty() && !frameshift_count? " has " : ", ") +
3736  s_Count(indel_count, "non-frameshifting indel");
3737  align_info->AddField("num_nonframeshift_indel", (int)indel_count);
3738  }
3739  if (partial_unaligned_section) {
3740  if (!mismatch_locs.empty() || indel_count || frameshift_count) {
3741  rna_comment += " and";
3742  }
3743  rna_comment += " aligns at "
3744  + NStr::NumericToString(pct_coverage)
3745  + "% coverage";
3746  }
      /// Nothing was appended: there is nothing to say about the transcript
3747  if (rna_comment == "The RefSeq transcript") {
3748  rna_comment.clear();
3749  } else {
3750  rna_comment += " compared to this genomic sequence";
3751  }
3752  } else if (m_is_gnomon) {
      /// Gnomon wording: count inserted/deleted bases and the codons they
      /// touch, plus the code breaks
3753  set<TSeqPos> insert_codons, delete_codons;
3754  TSeqPos inserted_bases = insert_locs.GetCoveredLength(),
3755  cds_inserted_bases = inserts_in_cds.GetCoveredLength(),
3756  deleted_bases = 0, cds_deleted_bases = 0,
3757  code_breaks = 0;
3758  ITERATE (CRangeCollection<TSeqPos>, delete_it, delete_locs) {
3759  NCBI_ASSERT(delete_it->GetLength() == 1,
3760  "Delete locations should always be one base");
      /// NOTE(review): find() result is dereferenced without an end() check
3761  deleted_bases += delete_sizes.find(delete_it->GetFrom())->second;
3762  }
3763  ITERATE (CRangeCollection<TSeqPos>, insert_it, inserts_in_cds) {
3764  for (TSeqPos pos = insert_it->GetFrom();
3765  pos <= insert_it->GetTo(); ++pos)
3766  {
      /// Codon index relative to the start of the CDS on the mRNA
3767  insert_codons.insert((pos - cds_ranges.GetFrom()) / 3);
3768  }
3769  }
3770  ITERATE (CRangeCollection<TSeqPos>, delete_it, deletes_in_cds) {
3771  NCBI_ASSERT(delete_it->GetLength() == 1,
3772  "Delete locations should always be one base");
3773  delete_codons.insert((delete_it->GetFrom() -
3774  cds_ranges.GetFrom()) / 3);
3775  cds_deleted_bases +=
3776  delete_sizes.find(delete_it->GetFrom())->second;
3777  }
3778  if(cds_feat && cds_feat->GetData().GetCdregion().IsSetCode_break()) {
3780  cds_feat->GetData().GetCdregion().GetCode_break())
3781  {
      /// Obtain the code break's amino acid as a single character; the
      /// non-ncbieaa encodings are converted first
3782  char aa = 0;
3783  switch ((*it)->GetAa().Which()) {
3785  aa = (*it)->GetAa().GetNcbieaa();
3786  break;
3787 
3789  {{
3790  string src_string(1, (*it)->GetAa().GetNcbistdaa()),
3791  dst_string;
3793  0, 1, dst_string,
3795  aa = dst_string[0];
3796  }}
3797  break;
3798 
3800  {{
3801  string src_string(1, (*it)->GetAa().GetNcbi8aa()),
3802  dst_string;
3804  0, 1, dst_string,
3806  aa = dst_string[0];
3807  }}
3808  break;
3809 
3810  default:
3811  break;
3812  }
      /// Code breaks to 'U' are not counted as substitutions
3813  if (aa != 'U') {
3814  ++code_breaks;
3815  }
3816  }
3817  }
3818  unsigned insert_codons_count = Convert(insert_codons.size()),
3819  delete_codons_count = Convert(delete_codons.size());
3820  if (inserted_bases || deleted_bases) {
3821  rna_comment = k_rna_comment;
3822  }
3823  if (inserted_bases) {
3824  rna_comment += ": inserted " + s_Count(inserted_bases, "base")
3825  + " in " + s_Count(insert_codons_count, "codon");
3826  }
3827  if (deleted_bases) {
      /// ":" introduces the first clause (comment still ends in "CDS"),
      /// ";" separates later ones
3828  rna_comment += string(NStr::EndsWith(rna_comment,"CDS") ? ":" : ";")
3829  + " deleted " + s_Count(deleted_bases, "base")
3830  + " in " + s_Count(delete_codons_count, "codon");
3831  }
3832  if (cds_inserted_bases || cds_deleted_bases || code_breaks) {
3833  cds_comment = k_cds_comment;
3834  }
3835  if (cds_inserted_bases) {
3836  cds_comment += ": inserted " + s_Count(cds_inserted_bases, "base")
3837  + " in " + s_Count(insert_codons_count, "codon");
3838  }
3839  if (cds_deleted_bases) {
3840  cds_comment += string(NStr::EndsWith(cds_comment,"CDS") ? ":" : ";")
3841  + " deleted " + s_Count(cds_deleted_bases, "base")
3842  + " in " + s_Count(delete_codons_count, "codon");
3843  }
3844  if (code_breaks) {
3845  cds_comment += string(NStr::EndsWith(cds_comment,"CDS") ? ":" : ";")
3846  + " substituted " + s_Count(code_breaks, "base")
3847  + " at " + s_Count(code_breaks, "genomic stop codon");
3848  }
3849  }
3850 
3851  if (!rna_comment.empty()) {
3852  if (!rna_feat.IsSetComment()) {
3853  rna_feat.SetComment(rna_comment);
3854  /// If comment is already set, check it doesn't already contain our text
3855  } else if (rna_feat.GetComment().find(rna_comment) == string::npos) {
3856  rna_feat.SetComment() += "; " + rna_comment;
3857  }
3858  }
      /// cds_feat is dereferenced unchecked here; in the code above
      /// cds_comment is only filled in on paths where cds_feat is non-NULL
3859  if (!cds_comment.empty()) {
3860  if (!cds_feat->IsSetComment()) {
3861  cds_feat->SetComment(cds_comment);
3862  /// If comment is already set, check it doesn't already contain our text
3863  } else if (cds_feat->GetComment().find(cds_comment) == string::npos) {
3864  cds_feat->SetComment() += "; " + cds_comment;
3865  }
3866  }
      /// Attach the AlignInfo object only if at least one field was added
3867  if (!align_info->GetData().empty()) {
3868  rna_feat.AddExt(align_info);
3869  }
3870 }
3871 
/// Append " added N base(s) not found in genome assembly" to the feature
/// comment, preceded by the standard RNA or CDS comment prefix.
///
/// The prefix (k_rna_comment or k_cds_comment followed by ":") is written
/// only if the comment does not contain it yet; if it is already present the
/// new clause is chained on with ";".
///
/// insert_length  number of added bases; must be greater than zero
3873 (CSeq_feat& feat,
3874  TSeqPos insert_length)
3875 {
3876  _ASSERT(insert_length > 0);
3877  string comment;
3878  if (feat.GetData().IsRna()) {
3879  comment = k_rna_comment;
3880  } else if (feat.GetData().IsCdregion()) {
3881  comment = k_cds_comment;
3882  }
      /// NOTE(review): for any other feature type the prefix is just ":"
3883  comment += ":";
3884  if (!feat.IsSetComment()) {
3885  feat.SetComment(comment);
3886  } else if (feat.GetComment().find(comment) == string::npos) {
3887  feat.SetComment() += " " + comment;
3888  } else {
3889  feat.SetComment() += ";";
3890  }
3891 
3892  comment = " added " + s_Count(insert_length, "base") + " not found in genome assembly";
3893  feat.SetComment() += comment;
3894 }
3895 
/// Copy "select"-style markup from the transcript's descriptors onto the
/// generated features.
///
/// GenBank keywords found on rna_handle are added as "tag" qualifiers to
/// rna_feat (and to cds_feat when given), and the "MANE Ensembl match" value
/// is added as Ensembl dbxrefs.  Unless fSkipLocationCheck is set, the
/// keywords require at least an overlap, and the dbxrefs an exact match,
/// between the alignment and the location recorded in the RefGeneTracking
/// user object.
3897 x_AddSelectMarkup(const CSeq_align &align,
3898  const CBioseq_Handle& rna_handle,
3899  const CSeq_id &genomic_acc,
3900  CSeq_feat& rna_feat, CSeq_feat* cds_feat)
3901 {
3902  bool need_location_check = !(m_flags & fSkipLocationCheck);
3903  e_MatchType match_found = eNone;
3904  string ensembl_match_rna, ensembl_match_cds;
3905  vector<string> keywords;
3906  bool drop = false;
3907  vector<CSeqdesc::E_Choice> desc_types = {CSeqdesc::e_User,
3909  for (CSeqdesc_CI desc(rna_handle, desc_types); desc; ++desc) {
3910  if (desc->IsGenbank() && desc->GetGenbank().IsSetKeywords()) {
3911  for (const string &keyword : desc->GetGenbank().GetKeywords()) {
      /// With fDropManeMarkup, MANE keywords are rewritten to their RefSeq
      /// equivalents; a bare "MANE Plus" is dropped with no replacement
3912  if (m_flags & fDropManeMarkup &&
3913  (keyword == "MANE Select" || keyword == "MANE Plus"
3914  || keyword == "MANE Plus Clinical"))
3915  {
3916  drop = true;
3917  if (keyword == "MANE Select") {
3918  keywords.push_back("RefSeq Select");
3919  } else if (keyword == "MANE Plus Clinical") {
3920  keywords.push_back("RefSeq Plus Clinical");
3921  }
3922  } else {
3923  keywords.push_back(keyword);
3924  }
3925  }
3926  } else if (desc->IsUser() &&
3927  desc->GetUser().HasField("MANE Ensembl match"))
3928  {
      /// The field value is split at "/" into the RNA part and the CDS part
3929  NStr::SplitInTwo(desc->GetUser().GetField("MANE Ensembl match")
3930  .GetString(),
3931  "/", ensembl_match_rna, ensembl_match_cds);
3932  NStr::TruncateSpacesInPlace(ensembl_match_rna);
3933  NStr::TruncateSpacesInPlace(ensembl_match_cds);
3934  } else if (desc->IsUser() && desc->GetUser().GetType().IsStr() &&
3935  desc->GetUser().GetType().GetStr() == "RefGeneTracking" &&
3936  need_location_check)
3937  {
3938  if (desc->GetUser().HasField("EnsemblLocation")) {
3939  match_found = x_CheckMatch(align, genomic_acc,
3940  desc->GetUser().GetField("EnsemblLocation"));
3941  } else if (desc->GetUser().HasField("SelectGeneLocation")) {
3942  /// SelectGeneLocation is never treated as an exact match
3943  match_found = min(eOverlap,
3944  x_CheckMatch(align, genomic_acc,
3945  desc->GetUser().GetField("SelectGeneLocation")));
3946  }
3947  }
3948  }
3949 
3950  if ((match_found >= eOverlap || !need_location_check) && !keywords.empty())
3951  {
3952  /// Found overlap; add quals to features
3953  x_AddKeywordQuals(rna_feat, keywords);
3954  if (cds_feat) {
3955  x_AddKeywordQuals(*cds_feat, keywords);
3956  }
3957  }
3958 
      /// Ensembl dbxrefs need an exact location match and are suppressed when
      /// MANE markup was dropped.
      /// NOTE(review): with fSkipLocationCheck set, match_found stays eNone,
      /// so no dbxref is ever added -- confirm that is intended
3959  if (match_found == eExact && !drop && !ensembl_match_rna.empty()) {
3960  CRef<CDbtag> rna_ensembl_ref(new CDbtag);
3961  rna_ensembl_ref->SetDb("Ensembl");
3962  rna_ensembl_ref->SetTag().SetStr(ensembl_match_rna);
3963  rna_feat.SetDbxref().push_back(rna_ensembl_ref);
3964  if (cds_feat && !ensembl_match_cds.empty()) {
3965  CRef<CDbtag> cds_ensembl_ref(new CDbtag);
3966  cds_ensembl_ref->SetDb("Ensembl");
3967  cds_ensembl_ref->SetTag().SetStr(ensembl_match_cds);
3968  cds_feat->SetDbxref().push_back(cds_ensembl_ref);
3969  }
3970  }
3971 }
3972 
/// Compare a recorded location (a user field with "seq_id", "from", "to" and
/// "strand" sub-fields) against the genomic row (row 1) of the alignment.
///
/// Returns eExact when id, strand and range all agree, eOverlap when id and
/// strand agree and the ranges intersect, or when the location is on a newer
/// version of the same accession, and eNone otherwise.
///
/// Throws CException (eUnknown) if any of the four sub-fields is missing.
3975  const CSeq_id &genomic_acc,
3976  const CUser_field &loc_field)
3977 {
3978  if (!loc_field.HasField("seq_id") || !loc_field.HasField("from") ||
3979  !loc_field.HasField("to") || !loc_field.HasField("strand"))
3980  {
3981  NCBI_THROW(CException, eUnknown, loc_field.GetLabel().GetStr()
3982  + " doesn't have expected fields");
3983  }
3984 
3985  CSeq_id loc_genomic_acc(loc_field.GetField("seq_id").GetString());
      /// NOTE(review): GetTextseq_Id() results are dereferenced unchecked --
      /// assumes both ids are text seq-ids; confirm
3986  if (loc_genomic_acc.GetTextseq_Id()->GetAccession() ==
3987  genomic_acc.GetTextseq_Id()->GetAccession() &&
3988  loc_genomic_acc.GetTextseq_Id()->GetVersion() >
3989  genomic_acc.GetTextseq_Id()->GetVersion())
3990  {
3991  /// If location is on a newer version than the alignment, this is always
3992  /// considered an overlap, regardless of position and strand
3993  return eOverlap;
3994  }
3995 
3996  ENa_strand loc_strand = loc_field.GetField("strand").GetString() == "-"
3998 
3999  /// Otherwise, this is considered a match only if same seq-id and on same
4000  /// strand
4001  if (!loc_genomic_acc.Match(genomic_acc) || loc_strand != align.GetSeqStrand(1))
4002  {
4003  return eNone;
4004  }
4005 
4006  // Exact match if the ranges are identical; otherwise any intersection with
      // the alignment range counts as an overlap (no percentage threshold is
      // applied by this code)
4007  TSeqRange loc_range(loc_field.GetField("from").GetInt(),
4008  loc_field.GetField("to").GetInt());
4009  return loc_range == align.GetSeqRange(1) ? eExact
4010  : (loc_range.IntersectingWith(align.GetSeqRange(1))
4011  ? eOverlap : eNone);
4012 }
4013 
4015 x_AddKeywordQuals(CSeq_feat &feat, const vector<string> &keywords)
4016 {
      /// Append one qualifier per keyword to feat: each new CGb_qual has the
      /// qualifier name "tag" and the keyword as its value.  Existing
      /// qualifiers on feat are left in place; nothing is de-duplicated.
4017  for (const string &keyword : keywords) {
4018  CRef<CGb_qual> qualifier(new CGb_qual);
4019  qualifier->SetQual("tag");
4020  qualifier->SetVal(keyword);
4021  feat.SetQual().push_back(qualifier);
4022  }
4023 }
4024 
4026 {
      /// Combine loc1 and (when non-null) loc2 into one location, returned
      /// as merged_loc.  Two paths are visible: a plain single-range merge,
      /// and a "cross the origin" path that hands the combined pieces to
      /// FixOrderOfCrossTheOriginSeqloc.
      /// NOTE(review): the signature and parts of the condition choosing
      /// between the two paths (original lines 4025, 4029, 4031) are absent
      /// from this listing.
4027  CRef<CSeq_loc> merged_loc;
4028 
4030  (loc2==NULL ||
4032 
      // Simple path: collapse to a single range, with or without loc2.
4033  if (loc2==NULL)
4034  merged_loc = loc1->Merge(CSeq_loc::fMerge_SingleRange, NULL);
4035  else
4036  merged_loc = loc1->Add(*loc2, CSeq_loc::fMerge_SingleRange, NULL);
4037  } else {
4038  // cross the origin
4039 
      // Debug-only check: when loc2 is given it must intersect loc1.
4040  _ASSERT(loc2 == NULL ||
4041  (loc1->Intersect(*loc2, 0, NULL)->IsNull() == false &&
4042  loc1->Intersect(*loc2, 0, NULL)->IsEmpty() == false));
4043 
4044  CRef<CSeq_id> id(new CSeq_id);
4045  id->Assign(*loc1->GetId());
4046 
      // One-base locations at the last and first position of the sequence,
      // on loc1's strand; they are added so the result touches both sides
      // of the origin.
4047  TSeqPos genomic_size = m_scope->GetSequenceLength(*id);
4048  CRef<CSeq_loc> left_loc(new CSeq_loc(*id, genomic_size-1, genomic_size-1, loc1->GetStrand()));
4049  CRef<CSeq_loc> right_loc(new CSeq_loc(*id, 0, 0, loc1->GetStrand()));
4050 
4051  merged_loc = left_loc;
4052  merged_loc->Add(*right_loc);
4053  merged_loc->Add(*loc1);
4054  if (loc2 != NULL)
4055  merged_loc->Add(*loc2);
4056 
      // x[2], x[3] are loc2's positional start/stop (0 when loc2 is null).
      // NOTE(review): the initialisers of x[0] and x[1] (original lines
      // 4058-4059) are absent from this listing; presumably loc1's
      // positional start/stop - confirm against the full source.
4057  TSeqPos x[] = {
4060  (loc2 ? loc2->GetStart(eExtreme_Positional) : 0),
4061  (loc2 ? loc2->GetStop(eExtreme_Positional) : 0)
4062  };
4063 
      // Unwrap each pair whose start exceeds its stop by pushing the stop
      // one full sequence length forward.
4064  if (x[0] > x[1])
4065  x[1] += genomic_size;
4066  if (x[2] > x[3])
4067  x[3] += genomic_size;
4068 
      // If one pair lies entirely before the other, shift the earlier one
      // forward by a full sequence length as well.
4069  if (x[1] < x[2]) {
4070  x[0] += genomic_size;
4071  x[1] += genomic_size;
4072  } else if (x[3] < x[0]) {
4073  x[2] += genomic_size;
4074  x[3] += genomic_size;
4075  }
4076 
4077 
      // Overall start is the smaller start; overall stop is the larger stop
      // folded back by one sequence length.  The assert requires at least
      // one position strictly between stop and start.
4078  x[0] = min(x[0], x[2]);
4079  x[1] = max(x[1], x[3]) - genomic_size;
4080  _ASSERT( x[0] > x[1] +1 );
4081 
      // The midpoint between stop and start is passed as the "outside
      // point" argument.
      // NOTE(review): the final argument (original line 4084) is absent
      // from this listing.
4082  merged_loc = FixOrderOfCrossTheOriginSeqloc(*merged_loc,
4083  (x[0]+x[1])/2,
4085  }
4086  return merged_loc;
4087 }
4088 
4090 (const CSeq_loc& loc,
4091  TSeqPos outside_point,
4093 {
      /// Rebuild loc as two merged pieces split at outside_point and return
      /// them concatenated.  Ranges whose biological start is greater than
      /// outside_point go to the "left" piece, all others to the "right".
      /// NOTE(review): `flags` is used below but its declaration (original
      /// line 4092) is absent from this listing.
4094  CRef<CSeq_id> id(new CSeq_id);
4095  id->Assign(*loc.GetId());
4096 
4097  TSeqPos genomic_size = m_scope->GetSequenceLength(*id);
4098  CRef<CSeq_loc> left_loc(new CSeq_loc);
4099  CRef<CSeq_loc> right_loc(new CSeq_loc);
4100 
4101  ITERATE(CSeq_loc, it, loc) {
4102  if (it.GetRangeAsSeq_loc()->GetStart(eExtreme_Biological) > outside_point)
4103  left_loc->Add(*it.GetRangeAsSeq_loc());
4104  else
4105  right_loc->Add(*it.GetRangeAsSeq_loc());
4106  }
4107 
4108  left_loc = left_loc->Merge(flags, NULL);
4109  right_loc = right_loc->Merge(flags, NULL);
4110 
      // True when the left piece ends on the last base and the right piece
      // starts on base 0, i.e. the two pieces abut across the origin.
      // Evaluated before the strand-dependent swap below.
4111  bool no_gap_at_origin = (left_loc->GetStop(eExtreme_Positional) == genomic_size-1 &&
4112  right_loc->GetStart(eExtreme_Positional) == 0);
4113 
      // On the reverse strand the pieces are concatenated in the opposite
      // order.
4114  if (loc.IsReverseStrand()) {
4115  swap(left_loc, right_loc);
4116  }
4117  left_loc->Add(*right_loc);
4118 
      // When the pieces abut at the origin, mark interval ends that touch
      // position 0 or the last base with eLim_circle fuzz.
4119  if (no_gap_at_origin) {
4120  left_loc->ChangeToPackedInt();
4121  NON_CONST_ITERATE(CPacked_seqint::Tdata, it,left_loc->SetPacked_int().Set()) {
4122  CSeq_interval& interval = **it;
4123  if (interval.GetFrom() == 0) {
4124  interval.SetFuzz_from().SetLim(CInt_fuzz::eLim_circle);
4125  }
4126  if (interval.GetTo() == genomic_size-1) {
4127  interval.SetFuzz_to().SetLim(CInt_fuzz::eLim_circle);
4128  }
4129  }
4130  }
4131 
4132  return left_loc;
4133 }
4134 
4137 {
      /// Report whether a spliced alignment refers to more than one distinct
      /// genomic seq-id.  Ids are gathered from the spliced-seg itself and
      /// from every exon that carries its own genomic id.  Alignments whose
      /// segs are not spliced yield false.
4138  set<CSeq_id_Handle> genomic_ids;
4139 
4140  if (!align.GetSegs().IsSpliced()) {
4141  return false;
4142  }
4143 
4144  const CSpliced_seg& sps = align.GetSegs().GetSpliced();
4145  if(sps.CanGetGenomic_id())
4146  genomic_ids.insert(CSeq_id_Handle::GetHandle(sps.GetGenomic_id()));
4147 
      // spliced_seg is obtained by the same call as sps above.
4148  const CSpliced_seg& spliced_seg = align.GetSegs().GetSpliced();
4149  ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
4150  const CSpliced_exon& exon = **it;
4151  if (exon.CanGetGenomic_id()) {
4152  genomic_ids.insert(CSeq_id_Handle::GetHandle(exon.GetGenomic_id()));
4153  }
4154  }
4155 
      // The set de-duplicates handles, so size > 1 means mixed ids.
4156  return genomic_ids.size() > 1;
4157 }
4158 
4160  CRef<CSeq_loc>& edited_sequence_seqloc,
4161  CSeq_id& genomic_seqid,
4162  int& region_begin,
4163  int& region_end,
4164  int& offset,
4165  CRef<CSeq_loc>& insert,
4166  const int k_gap_length,
4167  const int next_exon_start)
4168 {
      /// Flush a pending insert into edited_sequence_seqloc.
      ///
      /// If the insert mix holds exactly one location, the following pieces
      /// are appended in order: the genomic region
      /// [region_begin, region_end) extended by copy_length bases, an
      /// optional filler, the insert itself, and a second optional filler.
      /// copy_length is k_gap_length capped at half the distance from
      /// region_end to next_exon_start; each filler is k_gap_length minus
      /// copy_length bases long.
      ///
      /// Updates through the reference parameters: region_end grows by
      /// copy_length, region_begin is set to the new region_end, offset
      /// grows by the length of each filler, and insert is replaced with a
      /// fresh CSeq_loc.  With an empty insert mix nothing is changed.
      ///
      /// Throws CException (eUnknown) if the insert mix holds more than one
      /// location.
4169  if (insert->SetMix().Set().size() > 1) {
4170  NCBI_THROW(CException, eUnknown, "spliced-seq with several insert exons in a row not supported");
4171  }
4172 
4173  if (insert->SetMix().Set().size() > 0) {
4174  int half_intron_length = (next_exon_start - region_end)/2;
4175  int copy_length = min(k_gap_length, half_intron_length);
4176  region_end += copy_length;
4177 
      // Emit the genomic stretch only when it is non-empty.
4178  if (region_begin < region_end) {
4179  CRef<CSeq_loc> genome_loc(new CSeq_loc(genomic_seqid,
4180  region_begin,
4181  region_end -1));
4182  edited_sequence_seqloc->SetMix().Set().push_back(genome_loc);
4183  }
      // Filler before the insert, needed when fewer than k_gap_length
      // genomic bases could be copied.
4184  if (copy_length < k_gap_length) {
4185  int gap_length = k_gap_length - copy_length;
4186  // fill gap with sequence from the genome itself for simplicity
4187  // do not bother creating nonexisting sequence
4188  CRef<CSeq_loc> gap_loc(new CSeq_loc(genomic_seqid, 0, gap_length-1));
4189  edited_sequence_seqloc->SetMix().Set().push_back(gap_loc);
4190  offset += gap_length;
4191  }
4192 
      // The insert itself is appended, then the caller's reference is
      // pointed at a fresh, empty location.
4193  edited_sequence_seqloc->SetMix().Set().push_back(insert);
4194  insert.Reset(new CSeq_loc);
4195 
      // Matching filler after the insert, same length as the one before.
4196  if (copy_length < k_gap_length) {
4197  int gap_length = k_gap_length - copy_length;
4198  CRef<CSeq_loc> gap_loc(new CSeq_loc(genomic_seqid, 0, gap_length-1));
4199  edited_sequence_seqloc->SetMix().Set().push_back(gap_loc);
4200  offset += gap_length;
4201  }
4202 
4203  region_begin = region_end;
4204  }
4205 }
4206 
4210  CSeq_annot& annot,
4211  CBioseq_set& seqs,
4212  Int8 gene_id,
4213  const CSeq_feat* cds_feat_on_query_mrna_ptr,
4214  bool call_on_align_list)
4215 {
      /// Convert a spliced alignment whose exons lie on more than one genomic
      /// sequence.  Each pass starts from a fresh copy of input_align:
      ///  1. Build a temporary "edited" genomic bioseq in which exons from
      ///     other sequences are spliced into the main genomic sequence,
      ///     remap the exons onto it and run ConvertAlignToAnnot.  Bioseqs
      ///     from this pass (seqs_tmp) are kept; its features are discarded.
      ///  2. Drop the exons that lie on other sequences and run
      ///     ConvertAlignToAnnot again.  Features from this pass are kept
      ///     and its return value is this function's return value.
      ///  3. Put a copy of the original alignment into the first
      ///     inst.hist.assembly found, move the bioseqs into seqs,
      ///     post-process the features and move them into annot.
      /// Throws CException (eUnknown) for inputs this code rejects; see the
      /// NCBI_THROW sites below.
      /// NOTE(review): the signature start (original lines 4207-4209) is
      /// absent from this listing; input_align is used below but declared
      /// there.
4216 
4217  CRef<CSeq_align> align(new CSeq_align);
4218  align->Assign(input_align);
4219 
4220  CRef<CSeq_loc> edited_sequence_seqloc(new CSeq_loc);
4221 
4222 
      // The spliced-seg level genomic id identifies the main genomic
      // sequence; exons carrying a different id are treated as inserts.
4223  CSpliced_seg& spliced_seg = align->SetSegs().SetSpliced();
4224  if (!spliced_seg.CanGetGenomic_id()) {
4225  NCBI_THROW(CException, eUnknown, "Mixed-genomic spliced-seq does not have spliced-seg.genomic_id");
4226  }
4227  CRef<CSeq_id> genomic_seqid(new CSeq_id);
4228  genomic_seqid->Assign(spliced_seg.GetGenomic_id());
      // Strand: taken from the spliced-seg if present, otherwise from the
      // first exon on the main genomic sequence that has one; defaults to
      // plus.
4229  ENa_strand genomic_strand = eNa_strand_plus;
4230  if (spliced_seg.CanGetGenomic_strand()) {
4231  genomic_strand = spliced_seg.GetGenomic_strand();
4232  } else {
4233  ITERATE(CSpliced_seg::TExons, it, spliced_seg.GetExons()) {
4234  const CSpliced_exon& exon = **it;
4235  if ((!exon.CanGetGenomic_id() || exon.GetGenomic_id().Match(*genomic_seqid)) &&
4236  exon.CanGetGenomic_strand()) {
4237  genomic_strand = exon.GetGenomic_strand();
4238  break;
4239  }
4240  }
4241  }
4242 
4243  CSeq_id_Handle idh= CSeq_id_Handle::GetHandle(*genomic_seqid);
4244  CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh);
4245  TSeqPos genomic_length = bsh.GetBioseqLength();
4246 
4247  { // collect genomic seqlocs for virtual sequence and map exons to it
4248 
      // Padding placed around each insert: 1000 bases, or the whole
      // genomic length if that is shorter.
4249  const int k_gap_length = min(1000, int(genomic_length));
4250 
4251  if (genomic_strand == eNa_strand_minus) {
4252  // reverse exons to process them same way as plus strand
4253  // will reverse back in the end
4254  spliced_seg.SetExons().reverse();
4255  }
      // offset accumulates how far coordinates on the edited sequence are
      // shifted relative to the original genomic coordinates.
4256  int region_begin = 0; //included endpoint
4257  int region_end = 0; //not included endpoint
4258  int offset = 0;
4259  CRef<CSeq_loc> insert(new CSeq_loc);
4260  NON_CONST_ITERATE(CSpliced_seg::TExons, it, spliced_seg.SetExons()) {
4261  CSpliced_exon& exon = **it;
4262  CSeq_id& seqid = exon.CanGetGenomic_id() ? exon.SetGenomic_id() : *genomic_seqid;
4263 
4264  int exon_start = exon.GetGenomic_start(); // included endpoint
4265  int exon_stop = exon.GetGenomic_end(); // included endpoint
4266 
4267  if (!seqid.Match(*genomic_seqid)) {
4268 
      // Exon on another sequence: queue its location as a pending insert
      // (strand flipped when the main strand is minus) and give the exon
      // provisional coordinates ending k_gap_length past region_end.
4269  ENa_strand strand = exon.CanGetGenomic_strand() ? exon.GetGenomic_strand() : genomic_strand;
4270  CRef<CSeq_loc> loc(new CSeq_loc(seqid, exon_start, exon_stop, strand));
4271  if (genomic_strand == eNa_strand_minus) {
4272  loc->FlipStrand();
4273  }
4274  insert->SetMix().Set().push_back(loc);
4275 
4276  int exon_length = exon_stop - exon_start +1;
4277  exon_stop = region_end + k_gap_length -1;
4278  exon_start = region_end + k_gap_length - exon_length;
4279  offset += exon_length;
4280  } else {
      // Exon on the main genomic sequence: must agree in strand and come
      // after everything processed so far; any pending insert is flushed
      // before it.
4281  if (exon.CanGetGenomic_strand() && exon.GetGenomic_strand() != genomic_strand) {
4282  NCBI_THROW(CException, eUnknown, "spliced-seq with mixed genomic strands not supported");
4283  }
4284  if (!(region_end <= exon_start)) {
4285  NCBI_THROW(CException, eUnknown, "spliced-seq with exons out of order not supported");
4286  }
4287 
4288  AddInsertWithGaps(edited_sequence_seqloc,
4289  *genomic_seqid,
4290  region_begin,
4291  region_end,
4292  offset,
4293  insert,
4294  k_gap_length,
4295  exon_start);
4296 
4297  region_end = exon_stop +1;
4298  }
4299 
      // Every exon is rewritten to live on the edited sequence: per-exon
      // id and strand are cleared and coordinates shifted by offset.
4300  exon.ResetGenomic_id();
4301  exon.ResetGenomic_strand();
4302 
4303  exon.SetGenomic_start(exon_start + offset);
4304  exon.SetGenomic_end(exon_stop + offset);
4305  }
4306 
      // Flush an insert left pending after the last exon, using the end of
      // the genomic sequence as the "next exon" boundary.
4307  AddInsertWithGaps(edited_sequence_seqloc,
4308  *genomic_seqid,
4309  region_begin,
4310  region_end,
4311  offset,
4312  insert,
4313  k_gap_length,
4314  genomic_length);
4315 
      // Remainder of the genomic sequence after the last emitted region.
4316  if (region_begin < (int)genomic_length) {
4317  CRef<CSeq_loc> genome_loc(new CSeq_loc(*genomic_seqid,
4318  region_begin,
4319  genomic_length -1));
4320  edited_sequence_seqloc->SetMix().Set().push_back(genome_loc);
4321  }
4322 
4323  if (genomic_strand == eNa_strand_minus) {
4324  // reverse exons back
4325  spliced_seg.SetExons().reverse();
4326  }
4327  spliced_seg.SetGenomic_strand(genomic_strand);
4328  }
4329 
      // Wrap the assembled location in a bioseq, copying topology and any
      // Source / Org descriptor from the real genomic bioseq.
4330  edited_sequence_seqloc->ChangeToPackedInt();
4331  CRef<CBioseq> bioseq(new CBioseq(*edited_sequence_seqloc));
4332  CRef<CSeq_entry> seqentry(new CSeq_entry);
4333  seqentry->SetSeq(*bioseq);
4334 
4335  bioseq->SetInst().SetTopology() = bsh.GetCompleteBioseq()->GetInst().GetTopology();
4336  {{
4337  CSeqdesc_CI desc(bsh, CSeqdesc::e_Source);
4338  if (desc) {
4339  CRef<CSeqdesc> seq_desc(new CSeqdesc);
4340  seq_desc->Assign(*desc);
4341  bioseq->SetDescr().Set().push_back(seq_desc);
4342  }
4343  }}
4344  {{
4345  CSeqdesc_CI desc(bsh, CSeqdesc::e_Org);
4346  if (desc) {
4347  CRef<CSeqdesc> seq_desc(new CSeqdesc);
4348  seq_desc->Assign(*desc);
4349  bioseq->SetDescr().Set().push_back(seq_desc);
4350  }
4351  }}
4352 
      // The edited bioseq lives in the scope only for the duration of the
      // first conversion; it is removed again right after.
4353  CBioseq_Handle bioseq_handle = m_scope->AddBioseq(*bioseq);
4354 
4355  CRef<CSeq_id> bioseq_id(new CSeq_id);
4356  bioseq_id->Assign(*bioseq->GetFirstId());
4357  spliced_seg.SetGenomic_id(*bioseq_id);
4358 
      // Take any existing entry for gene_id out of `genes` and remember it,
      // so it can be put back after the first conversion.
      // NOTE(review): `genes` is not declared in this listing.
4359  CRef<CSeq_feat> gene_feat;
4360  if (gene_id) {
4361  TGeneMap::iterator gene = genes.find(gene_id);
4362  if (gene != genes.end()) {
4363  gene_feat = gene->second;
4364  genes.erase(gene);
4365  }
4366  }
4367 
      // Pass 1: convert against the edited sequence.  Only seqs_tmp is kept
      // from this pass; the feature table is cleared below.
4368  CSeq_annot annot_local;
4369  CBioseq_set seqs_tmp;
4370  ConvertAlignToAnnot(*align, annot_local, seqs_tmp, gene_id, cds_feat_on_query_mrna_ptr,
4371  call_on_align_list);
4372 
4373  m_scope->RemoveBioseq(bioseq_handle);
4374  annot_local.SetData().SetFtable().clear();
4375 
      // Restore the saved entry, or erase whatever pass 1 stored for
      // gene_id if there was none before.
4376  if (gene_id) {
4377  if (gene_feat) {
4378  genes[gene_id] = gene_feat;
4379  } else {
4380  genes.erase(gene_id);
4381  }
4382  }
4383 
4384  set<CSeq_id_Handle> insert_ids;
4385  TSeqPos insert_length = 0;
4386  TSeqPos cds_insert_length = 0;
4387 
      // Pass 2: start again from the original alignment and remove every
      // exon on another sequence, recording its id, its genomic length and
      // how much of its product range falls inside the CDS (when a CDS
      // feature was supplied).
4388  align.Reset(new CSeq_align);
4389  align->Assign(input_align);
4390  {
4391  CSpliced_seg& spliced_seg = align->SetSegs().SetSpliced();
4392  ERASE_ITERATE(CSpliced_seg::TExons, it, spliced_seg.SetExons()) {
4393  CSpliced_exon& exon = **it;
4394  CSeq_id& seqid = exon.CanGetGenomic_id() ? exon.SetGenomic_id() : *genomic_seqid;
4395 
4396  if (!seqid.Match(*genomic_seqid)) {
4397  insert_ids.insert(CSeq_id_Handle::GetHandle(seqid));
4398  insert_length += exon.GetGenomic_end()-exon.GetGenomic_start()+1;
4399 
4400  if (cds_feat_on_query_mrna_ptr) {
      // Overlap length of [product start, product end] with the CDS
      // location; non-positive values mean no overlap and are ignored.
4401  int cds_intersection_len =
4402  min(exon.GetProduct_end().GetNucpos(),
4403  cds_feat_on_query_mrna_ptr->GetLocation().GetStop(eExtreme_Positional)) -
4404  max(exon.GetProduct_start().GetNucpos(),
4405  cds_feat_on_query_mrna_ptr->GetLocation().GetStart(eExtreme_Positional))
4406  +1;
4407  if (cds_intersection_len > 0) {
4408  cds_insert_length += cds_intersection_len;
4409  }
4410  }
4411 
4412  spliced_seg.SetExons().erase(it);
4413  }
4414  }
4415  }
4416 
      // Bioseqs from this pass are thrown away; its features and return
      // value are kept.
4417  CBioseq_set seqs_discard;
4418  CRef<CSeq_feat> feat =
4419  ConvertAlignToAnnot(*align, annot_local, seqs_discard,
4420  gene_id, cds_feat_on_query_mrna_ptr,
4421  call_on_align_list);
4422 
4423  // inst.hist.assembly = input annot
4424  align.Reset(new CSeq_align);
4425  align->Assign(input_align);
      // NOTE(review): the loop header that declares `it` (original line
      // 4426) is absent from this listing; presumably it walks the entries
      // of seqs_tmp - confirm against the full source.
4427  CSeq_entry& entry = **it;
4428  if (entry.IsSeq() &&
4429  entry.GetSeq().IsSetInst() &&
4430  entry.GetSeq().GetInst().IsSetHist() &&
4431  entry.GetSeq().GetInst().GetHist().IsSetAssembly()) {
4432 
      // Only the first entry with an assembly is touched: its first
      // assembly alignment is replaced by the copy of the input alignment.
4433  entry.SetSeq().SetInst().SetHist().SetAssembly().front() =
4434  align;
4435  break;
4436  }
4437  }
4438  if (seqs_tmp.IsSetClass()) {
4439  seqs.SetClass(seqs_tmp.GetClass());
4440  }
4441  seqs.SetSeq_set().splice(seqs.SetSeq_set().end(), seqs_tmp.SetSeq_set());
4442 
      // Post-process the pass-2 features in reverse order.  Gene features
      // are skipped, and so are CDS features when no inserted exon
      // overlapped the CDS.
4443  for (list<CRef<CSeq_feat> >::reverse_iterator it = annot_local.SetData().SetFtable().rbegin();
4444  it != annot_local.SetData().SetFtable().rend(); ++it) {
4445  CSeq_feat& f = **it;
4446  if (f.GetData().IsGene()) {
4447  continue;
4448  }
4449 
4450  if (f.GetData().IsCdregion() && cds_insert_length==0) {
4451  continue;
4452  }
4453 
      // NOTE(review): original lines 4454 and 4457 (including the body of
      // the loop over insert_ids) are absent from this listing.
4455  _ASSERT(insert_ids.size() > 0);
4456  NON_CONST_ITERATE (set<CSeq_id_Handle>, id, insert_ids) {
4458  }
      // The comment helper receives the CDS overlap length for CDS features
      // and the total inserted length for all others.
4459  x_SetCommentForGapFilledModel(f, f.GetData().IsCdregion() ? cds_insert_length : insert_length);
4460  }
4461 
4462  annot.SetData().SetFtable().splice(annot.SetData().SetFtable().end(),
4463  annot_local.SetData().SetFtable());
4464 
4465  return feat;
4466 }
4467 
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAlign_CI –.
Definition: align_ci.hpp:63
size_t GetSize(void) const
CAnnot_CI –.
Definition: annot_ci.hpp:59
CAtomicCounter –.
Definition: ncbicntr.hpp:71
CBioseq_Handle –.
const CSeq_id * GetFirstId() const
Definition: Bioseq.cpp:271
CCdregion –.
Definition: Cdregion.hpp:66
CCode_break –.
Definition: Code_break.hpp:66
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
CFeat_CI –.
Definition: feat_ci.hpp:64
CFeat_id –.
Definition: Feat_id.hpp:66
TFeatureGeneratorFlags GetFlags() const
Definition: gene_model.cpp:200
CRef< objects::CSeq_feat > ConvertAlignToAnnot(const objects::CSeq_align &align, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, Int8 gene_id=0, const objects::CSeq_feat *cdregion_on_mrna=NULL)
Convert an alignment to an annotation.
unique_ptr< SImplementation > m_impl
Definition: gene_model.hpp:232
void RecomputePartialFlags(objects::CSeq_annot &annot)
Recompute the correct partial states for all features in this annotation.
Definition: gene_model.cpp:374
void SetMinIntron(TSeqPos)
Definition: gene_model.cpp:210
void SetFeatureExceptions(objects::CSeq_feat &feat, const objects::CSeq_align *align=NULL)
Correctly mark exceptions on a feature.
Definition: gene_model.cpp:360
void SetFlags(TFeatureGeneratorFlags)
Definition: gene_model.cpp:195
void SetAllowedUnaligned(TSeqPos)
Definition: gene_model.cpp:215
CFeatureGenerator(CRef< objects::CScope > scope)
CConstRef< objects::CSeq_align > CleanAlignment(const objects::CSeq_align &align)
Clean an alignment according to our best guess of its biological representation.
Definition: gene_model.cpp:221
void SetIntronStitchThresholdFlags(EIntronStitchThresholdFlags)
Definition: gene_model.cpp:205
void SetPartialFlags(CRef< objects::CSeq_feat > gene_feat, CRef< objects::CSeq_feat > mrna_feat, CRef< objects::CSeq_feat > cds_feat)
Mark the correct partial states for a set of features.
Definition: gene_model.cpp:367
void ConvertLocToAnnot(const objects::CSeq_loc &loc, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, objects::CCdregion::EFrame frame=objects::CCdregion::eFrame_one, CRef< objects::CSeq_id > prot_id=CRef< objects::CSeq_id >(), CRef< objects::CSeq_id > rna_id=CRef< objects::CSeq_id >())
Convert genomic location to an annotation.
Definition: gene_model.cpp:243
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
static const CTrans_table & GetTransTable(int id)
static void SetFeatureExceptions(objects::CSeq_feat &feat, objects::CScope &scope, const objects::CSeq_align *align=NULL)
Correctly mark exceptions on a feature.
Definition: gene_model.cpp:115
static void CreateGeneModelFromAlign(const objects::CSeq_align &align, objects::CScope &scope, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, TGeneModelCreateFlags flags=fDefaults, TSeqPos allowed_unaligned=10)
Create a gene model from an alignment this will optionally promote all features through the alignment...
Definition: gene_model.cpp:86
static void SetPartialFlags(objects::CScope &scope, CRef< objects::CSeq_feat > gene_feat, CRef< objects::CSeq_feat > mrna_feat, CRef< objects::CSeq_feat > cds_feat)
Definition: gene_model.cpp:132
static void CreateGeneModelsFromAligns(const list< CRef< objects::CSeq_align > > &aligns, objects::CScope &scope, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, TGeneModelCreateFlags flags=fDefaults, TSeqPos allowed_unaligned=10)
Definition: gene_model.cpp:101
int TGeneModelCreateFlags
Definition: gene_model.hpp:252
static void RecomputePartialFlags(objects::CScope &scope, objects::CSeq_annot &annot)
Definition: gene_model.cpp:141
CMappedFeat –.
Definition: mapped_feat.hpp:59
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
position_type GetTo() const
Definition: range_coll.hpp:132
size_type size() const
Definition: range_coll.hpp:98
bool empty() const
Definition: range_coll.hpp:102
position_type GetFrom() const
Definition: range_coll.hpp:120
position_type GetCoveredLength(void) const
Returns total length covered by ranges in this collection, i.e.
Definition: range_coll.hpp:157
bool Empty() const
Definition: range_coll.hpp:138
CScope –.
Definition: scope.hpp:92
double GetPercentCoverage(CScope &scope, const CSeq_align &align, unsigned query=0)
Compute percent coverage of the query (sequence 0) (range 0-100)
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
ESubtype GetSubtype(void) const
CSeqFeatXref –.
Definition: SeqFeatXref.hpp:66
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
@ e_Ncbi8na
Definition: sequtil.hpp:52
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_Ncbi8aa
Definition: sequtil.hpp:56
@ e_Ncbi4na
Definition: sequtil.hpp:50
@ e_Ncbi2na
Definition: sequtil.hpp:48
@ e_Ncbistdaa
Definition: sequtil.hpp:58
CSeqVector –.
Definition: seq_vector.hpp:65
CRange< TSeqPos > GetSeqRange(TDim row) const
GetSeqRange NB: On a Spliced-seg, in case the product-type is protein, these only return the amin par...
Definition: Seq_align.cpp:153
CRef< CSeq_loc > CreateRowSeq_loc(TDim row) const
Definition: Seq_align.cpp:2028
TDim CheckNumRows(void) const
Validators.
Definition: Seq_align.cpp:73
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CSeq_feat_EditHandle –.
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
void AddExt(CRef< CUser_object > ext, TAddExt add_flags=0)
Add an extension by type in exts container.
Definition: Seq_feat.cpp:631
void SetPartialStart(bool val, ESeqLocExtremes ext)
void SetPartialStop(bool val, ESeqLocExtremes ext)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CSeq_loc_Mapper –.
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CSpliced_exon_chunk –.
CSeq_feat_Handle GetFeatureWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:635
CTime –.
Definition: ncbitime.hpp:296
char GetStartResidue(int state) const
char GetCodonResidue(int state) const
static int SetCodonState(unsigned char ch1, unsigned char ch2, unsigned char ch3)
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
int GetInt(void) const
get value
Definition: User_field.hpp:327
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user field.
Definition: User_field.cpp:211
const string & GetString(void) const
Definition: User_field.hpp:348
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
Definition: User_field.cpp:393
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
void erase(iterator pos)
Definition: map.hpp:167
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
static uch flags
CMappedFeat GetCdsOnMrna(const objects::CSeq_id &rna_id, CScope &scope)
#define false
Definition: bool.h:36
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
USING_SCOPE(objects)
static bool s_Contains(const TSeqRange &range1, const TSeqRange &range2)
Check whether range1 contains range2.
bool IsProteinAlign(const CSeq_align &align)
Definition: gene_model.cpp:601
void AddInsertWithGaps(CRef< CSeq_loc > &edited_sequence_seqloc, CSeq_id &genomic_seqid, int &region_begin, int &region_end, int &offset, CRef< CSeq_loc > &insert, const int k_gap_length, const int next_exon_start)
const char * k_except_text_for_gap_filled_gnomon_model
Definition: gene_model.cpp:123
const char * k_cds_comment
Definition: gene_model.cpp:128
void AddCodeBreak(CSeq_feat &feat, CSeq_loc &loc, char ncbieaa)
static void s_TransformToNucpos(CProduct_pos &pos)
Definition: gene_model.cpp:584
const char * k_rna_comment
Definition: gene_model.cpp:125
void AddLiteral(CSeq_inst &inst, const string &seq, CSeq_inst::EMol mol_class)
string ExtractGnomonModelNum(const CSeq_id &seq_id)
Definition: gene_model.cpp:590
void RenameGeneratedBioseqs(const CSeq_id &query_rna_id, CSeq_id &transcribed_rna_id, CRef< CSeq_feat > cds_feat_on_query_mrna, CRef< CSeq_feat > cds_feat_on_genome_with_translated_product)
Definition: gene_model.cpp:753
bool IsContinuous(const CSeq_loc &loc)
static string s_Count(unsigned num, const string &item_name)
static CRef< CSeq_loc > s_MapSingleAA(TSeqPos pos, CRef< CSeq_id > mapped_protein_id, const CRangeCollection< TSeqPos > &product_ranges, CRef< CSeq_loc_Mapper > to_mrna, CRef< CSeq_loc_Mapper > to_genomic)
static const CMolInfo * s_GetMolInfo(const CBioseq_Handle &handle)
Return the mol-info object for a given sequence.
Definition: gene_model.cpp:152
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
TValue Add(int delta) THROWS_NONE
Atomically add value (=delta), and return new counter value.
Definition: ncbicntr.hpp:278
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
CSeq_id::EAccessionInfo IdentifyAccession(void) const
CSeq_id & Set(const CTempString &the_id, TParseFlags flags=fParse_AnyRaw)
Reassign based on flat specifications; arguments interpreted as with constructors.
Definition: Seq_id.cpp:2457
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
@ eAcc_refseq_mrna
Definition: Seq_id.hpp:415
@ eAcc_refseq_ncrna
Definition: Seq_id.hpp:416
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
void ChangeToPackedInt(void)
Works only if location is currently an interval, point, packed-int (handled trivially),...
Definition: Seq_loc.cpp:3670
bool IsReverseStrand(void) const
Return true if all ranges have reverse strand.
Definition: Seq_loc.hpp:995
void FlipStrand(void)
Flip the strand (e.g. plus to minus)
Definition: Seq_loc.cpp:3969
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
CRef< CSeq_loc > Merge(TOpFlags flags, ISynonymMapper *syn_mapper) const
All functions create and return a new seq-loc object.
Definition: Seq_loc.cpp:5037
const_iterator end(void) const
Definition: Seq_loc.cpp:1034
const_iterator begin(void) const
Definition: Seq_loc.cpp:1028
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
int Compare(const CSeq_loc &loc) const
Definition: Seq_loc.cpp:590
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
Definition: Seq_loc.cpp:3280
CRef< CSeq_loc > Intersect(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper) const
Find the intersection with the seq-loc, merge/sort resulting ranges depending on flags.
Definition: Seq_loc.cpp:5183
int TOpFlags
Definition: Seq_loc.hpp:336
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5196
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3313
void ChangeToMix(void)
Definition: Seq_loc.cpp:3633
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ eOrder_Biological
Iterate sub-locations in biological order.
Definition: Seq_loc.hpp:462
@ fMerge_SingleRange
Definition: Seq_loc.hpp:332
@ fCompare_Strand
Definition: Seq_loc.hpp:246
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
Definition: sequence.cpp:284
const COrg_ref * GetOrg_refOrNull(const CBioseq_Handle &handle)
Return the pointer to org-ref associated with a given sequence or null if there is no org-ref associa...
Definition: sequence.cpp:245
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
Definition: sequence.cpp:264
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
@ fIs5PrimePartial
= 0x4 Translate first codon even if not start codon (because sequence is 5' partial)
Definition: sequence.hpp:984
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
@ eGetId_ForceAcc
return only an accession based seq-id
Definition: sequence.hpp:100
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeq_annot_Handle AddSeq_annot(CSeq_annot &annot, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add Seq-annot, return its CSeq_annot_Handle.
Definition: scope.cpp:538
CSeq_loc_Mapper_Base & SetMergeAll(void)
Merge any abutting or overlapping intervals.
@ eProductToLocation
Map from the feature's product to location.
@ eLocationToProduct
Map from the feature's location to product.
@ fAlign_Dense_seg_TotalRange
Ignore internal dense-seg structure - map each dense-seg according to the total ranges involved.
vector< CSeq_id_Handle > TId
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
bool IsSetDbxref(void) const
virtual CConstRef< CSeq_feat > GetSeq_feat(void) const
const CSeqFeatData & GetData(void) const
Definition: