NCBI C++ ToolKit
adjust_consensus_splicesite.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: adjust_consensus_splicesite.cpp 47202 2022-11-02 19:05:06Z asztalos $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Colleen Bollin
27  */
28 
29 
30 #include <ncbi_pch.hpp>
32 
33 #include <objmgr/seq_vector.hpp>
34 #include <objmgr/util/sequence.hpp>
35 #include <objmgr/seqdesc_ci.hpp>
37 
42 
45 
46 
47 //////////////////////////////////////////////////////////////////
48 /// AdjustForConsensusSpliceSite
49 /// Adjust internal intervals of a CDS (and its associated mRNA feature) to
50 /// position the interval endpoints at consensus splice sites (GT-AG), as long as the
51 /// repositioning does not alter the contents of the translated protein.
52 /// @param cds Coding region feature
53 /// @param scope The scope in which adjustments are to be made (if necessary)
54 ///
56 {
58  CRef<CSeq_feat> new_cds(new CSeq_feat);
59  new_cds->Assign(cds);
60  bool changed = AdjustCDS(*new_cds); // update the location
61 
62  if (changed) {
63  CRef<CCmdComposite> command(new CCmdComposite("Adjust CDS for consensus splice sites"));
64 
65  CSeq_feat_Handle fh = m_Scope->GetSeq_featHandle(cds);
66  CIRef<IEditCommand> chgFeat(new CCmdChangeSeq_feat(fh, *new_cds));
67  command->AddCommand(*chgFeat);
68 
69  CRef<CCmdComposite> update_cmd = AdjustmRNAandExonFeatures(*new_cds, cds);
70  if (update_cmd) {
71  command->AddCommand(*update_cmd);
72  }
73  return command;
74  }
75 
76  return CRef<CCmdComposite>(NULL);
77 }
78 
79 namespace {
80  string GetProteinSeq(const CSeq_feat& cds, CScope& scope)
81  {
82  string new_prot_seq;
83  CSeqTranslator::Translate(cds, scope, new_prot_seq);
84  if (*(new_prot_seq.end() - 1) == '*')
85  new_prot_seq.erase(new_prot_seq.end() - 1);
86  return new_prot_seq;
87  }
88 }
89 
91 {
93  CRef<CSeq_feat> new_cds(new CSeq_feat);
94  new_cds->Assign(cds);
95  bool changed = AdjustCDSEnds(*new_cds); // update the location
96 
97  if (changed) {
98  CRef<CCmdComposite> command(new CCmdComposite("Adjust CDS ends for consensus splice sites"));
99 
100  CSeq_feat_Handle fh = m_Scope->GetSeq_featHandle(cds);
101  CIRef<IEditCommand> chgFeat(new CCmdChangeSeq_feat(fh, *new_cds));
102  command->AddCommand(*chgFeat);
103 
104  if (m_ProtChanged) {
105  CBioseq_Handle prot = m_Scope->GetBioseqHandle(cds.GetProduct());
106  CRef<CSeq_inst> new_inst(new CSeq_inst);
107  new_inst->Assign(prot.GetInst());
108  if (new_inst->GetMol() == CSeq_inst::eMol_aa && new_inst->GetRepr() == CSeq_inst::eRepr_raw) {
109 
110  string new_protein = GetProteinSeq(*new_cds, *m_Scope);
111  new_inst->SetSeq_data().SetNcbieaa().Set(new_protein);
112  new_inst->SetLength(TSeqPos(new_protein.length()));
113  CIRef<IEditCommand> chgProt(new CCmdChangeBioseqInst(prot, *new_inst));
114  command->AddCommand(*chgProt);
115 
116  for (CFeat_CI prot_it(prot, SAnnotSelector(CSeqFeatData::e_Prot)); prot_it; ++prot_it) {
117  if (!prot_it->GetData().GetProt().IsSetProcessed()) {
118  CRef<CSeq_feat> new_prot(new CSeq_feat);
119  new_prot->Assign(prot_it->GetOriginalFeature());
120  new_prot->SetLocation().SetInt().SetTo(TSeqPos(new_protein.length() - 1));
121  CIRef<IEditCommand> chgFeat(new CCmdChangeSeq_feat(prot_it->GetSeq_feat_Handle(), *new_prot));
122  command->AddCommand(*chgFeat);
123  }
124 
125  }
126  }
127  }
128 
129  CRef<CCmdComposite> update_cmd = AdjustmRNAandExonEnds(*new_cds, cds);
130  if (update_cmd) {
131  command->AddCommand(*update_cmd);
132  }
133  return command;
134  }
135 
136  return CRef<CCmdComposite>(NULL);
137 }
138 
140 {
141  _ASSERT(m_Scope);
142  if (!cds.GetData().IsCdregion() || !cds.IsSetLocation() || !cds.IsSetProduct()) {
143  return false;
144  }
145  if ((!cds.GetLocation().IsMix() && !cds.GetLocation().IsPacked_int()) ||
146  cds.GetLocation().GetId() == nullptr) {
147  return false;
148  }
149 
150  // not going to handle mixed-strand exons
151  if (cds.GetLocation().GetStrand() == eNa_strand_other) {
152  return false;
153  }
154 
155  CBioseq_Handle product = m_Scope->GetBioseqHandle(cds.GetProduct());
156  if (!product || !product.IsProtein()) {
157  return false;
158  }
159 
160  // obtaining the original protein sequence
163  string orig_prot_seq;
164  prot_vec.GetSeqData(0, prot_vec.size(), orig_prot_seq);
165  if (NStr::IsBlank(orig_prot_seq)) {
166  return false;
167  }
168 
169  // assumed that the coding region is on one sequence
170  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(cds.GetLocation());
171  if (!bsh) {
172  return false;
173  }
174 
175  x_InitRanges(cds);
176  m_Strand = cds.GetLocation().GetStrand();
177  TSeqPos length = bsh.GetBioseqLength() - 1;
179 
180  // update the location of the new cds and constantly check whether the protein is still the same as the original one
182  CSeq_loc_CI iter = iter_prev;
183  ++iter;
184 
185  TLocs::iterator range_it_prev = m_New_Ranges.begin();
186  TLocs::iterator range_it = range_it_prev;
187  ++range_it;
188 
189  bool changed = false;
190  size_t index = 1;
191  while (iter && range_it != m_New_Ranges.end() && index < m_New_Ranges.size()) {
192 
193  CConstRef<CSeq_loc> loc_prev = iter_prev.GetRangeAsSeq_loc();
195  TSeqRange range_prev0 = *range_it_prev;
196  TSeqRange range0 = *range_it;
197 
198  if (x_IntronLength(*range_it_prev, *range_it) > 9 &&
200  (loc->IsInt() || loc->IsPnt()) && (loc_prev->IsInt() || loc_prev->IsPnt())) {
201 
202  TSeqPos start = range_it_prev->GetFrom();
203  TSeqPos stop = range_it->GetTo();
204  if (m_Strand == eNa_strand_minus) {
205  start = length - range_it_prev->GetTo();
206  stop = length - range_it->GetFrom();
207  }
208 
209  string seqdata = kEmptyStr;
210  seq_vec.GetSeqData(start, stop, seqdata);
211 
212  TSeqPos intron_start = range_it_prev->GetTo() + 1 - start; // donor splice site
213  TSeqPos intron_stop = range_it->GetFrom() -1 - start; // acceptor splice site
214  if (m_Strand == eNa_strand_minus) {
215  intron_start = length - range_it_prev->GetFrom() + 1 - start;
216  intron_stop = length - range_it->GetTo() -1 -start;
217  }
218  bool match = true;
219  if (s_IsAdjustedSpliceSitePairOK(seqdata, intron_start, intron_stop)) {
220  // do nothing
221  } else {
222  match = false;
223 
224  // search forward:
225  if ((loc_prev->IsInt() || loc_prev->IsPnt()) && loc->IsInt()) {
226  TSeqPos offset = 1;
227  TSeqPos exon_length = range_it->GetTo() - range_it->GetFrom() + 1;
228  while (offset < exon_length && !match && offset < 4) {
229  if (s_IsAdjustedSpliceSitePairOK(seqdata, intron_start + offset, intron_stop + offset)) {
230  match = true;
231  } else {
232  offset++;
233  }
234  }
235  if (match) {
236  x_ShiftExonPairForward(*range_it_prev, *range_it, offset);
237  }
238  }
239 
240  if (match) {
241  if (x_HasProteinChanged(cds, orig_prot_seq)) {
242  // undo the changes to the previous and the current range
243  swap(range_prev0, *range_it_prev);
244  swap(range0, *range_it);
245  match = false;
246  } else {
247  changed = true;
248  }
249  }
250 
251 
252  // search backward:
253  if (!match && loc_prev->IsInt() && (loc->IsPnt() || loc->IsInt())) {
254  TSeqPos offset = 1;
255  TSeqPos prev_exon_length = range_it_prev->GetTo() - range_it_prev->GetFrom() + 1;
256  while (offset < prev_exon_length && !match && offset < 4) {
257  if (s_IsAdjustedSpliceSitePairOK(seqdata, intron_start - offset, intron_stop - offset)) {
258  match = true;
259  } else {
260  offset++;
261  }
262  }
263  if (match) {
264  x_ShiftExonPairBackward(*range_it_prev, *range_it, offset);
265  if (x_HasProteinChanged(cds, orig_prot_seq)) {
266  // undo the changes to the previous and the current range
267  swap(range_prev0, *range_it_prev);
268  swap(range0, *range_it);
269  } else {
270  changed = true;
271  }
272  }
273  }
274  }
275  }
276 
277  ++range_it_prev;
278  ++range_it;
279 
280  ++iter_prev;
281  ++iter;
282  }
283 
284  if (changed) {
285  CRef<CSeq_loc> new_loc = x_UpdateLocation(cds);
286  cds.ResetLocation();
287  cds.SetLocation(new_loc.GetObject());
288  }
289 
290  return changed;
291 }
292 
294 {
295  return x_AlsoAdjustmRNA(mrna, edit_cds.GetLocation());
296 }
297 
299 {
300  _ASSERT(m_Scope);
301  CRef<CCmdComposite> cmd(new CCmdComposite("Update mRNA and exons for consensus splice sites"));
302 
304  if ( mrna ) {
305  CRef<CSeq_feat> new_mrna(new CSeq_feat());
306  new_mrna->Assign(*mrna);
307  if (AdjustmRNAToMatchCDS(new_cds, *new_mrna)) {
308  CSeq_feat_Handle mrnah = m_Scope->GetSeq_featHandle(mrna.GetObject());
309  CIRef<IEditCommand> chgmRNA(new CCmdChangeSeq_feat(mrnah, *new_mrna));
310  cmd->AddCommand(*chgmRNA);
311  }
312  }
313 
314  // update the exon features if there were any
315  x_UpdateExonFeatures(cmd.GetPointer(), orig_cds, new_cds);
316  return cmd;
317 }
318 
320 {
321  if (!cmd)
322  return;
323 
326  for ( ; origloc_ci && newloc_ci; ++origloc_ci, ++newloc_ci) {
327  // has the range been updated?
328  if (origloc_ci.GetRange().GetFrom() != newloc_ci.GetRange().GetFrom() ||
329  origloc_ci.GetRange().GetTo() != newloc_ci.GetRange().GetTo()) {
330  // we are only interested in exons that are right on this interval
331 
332  CConstRef<CSeq_loc> orig_subloc = origloc_ci.GetRangeAsSeq_loc();
334  if (exon_ci) {
335  // check whether the exon is right on the original interval
336  const CSeq_feat& exon = exon_ci->GetMappedFeature();
337  if (exon.IsSetLocation()) {
338  TSeqPos exon_start = exon.GetLocation().GetStart(eExtreme_Positional);
339  TSeqPos exon_stop = exon.GetLocation().GetStop(eExtreme_Positional);
340  if (exon_start == origloc_ci.GetRange().GetFrom() &&
341  exon_stop == origloc_ci.GetRange().GetTo()) {
342  // adjust the exon for this interval
343  // make a copy of this exon and update the location of the exon
344  TSeqPos new_start = newloc_ci.GetRange().GetFrom();
345  TSeqPos new_stop = newloc_ci.GetRange().GetTo();
346  CRef<CSeq_feat> exon(new CSeq_feat);
347  exon->Assign(exon_ci->GetMappedFeature());
348  if (exon->GetLocation().IsInt()) {
349  exon->SetLocation().SetInt().SetFrom(new_start);
350  exon->SetLocation().SetInt().SetTo(new_stop);
351  CSeq_feat_Handle exon_fh = m_Scope->GetSeq_featHandle(exon_ci->GetMappedFeature());
352  CIRef<IEditCommand> chgexon(new CCmdChangeSeq_feat(exon_fh, *exon));
353  cmd->AddCommand(*chgexon);
354  }
355  }
356  }
357  }
358  }
359  }
360 }
361 
363 {
364  // selection criteria for sequences where adjusting consensus splice sites can be applied
365  bool accepted = s_IsBioseqGood_Relaxed(bsh);
366  if (accepted) {
367  CSeqdesc_CI desc(bsh, CSeqdesc::e_Source);
368  if (!desc)
369  return false;
370 
371  const CBioSource& bsrc = desc->GetSource();
372  if (bsrc.IsSetLineage() && NStr::FindNoCase(bsrc.GetLineage(), "viruses") != NPOS) {
373  return false;
374  } else if (bsrc.IsSetOrg() && bsrc.GetOrg().GetTaxId() == ZERO_TAX_ID) {
375  return false;
376  }
377  }
378 
379  return accepted;
380 }
381 
383 {
384  // it won't work on sequences that meet any of the following criteria:
385  CSeqdesc_CI desc(bsh, CSeqdesc::e_Source);
386  if (!desc)
387  return false;
388 
389  const CBioSource& bsrc = desc->GetSource();
390  if (bsrc.GetGenome() != CBioSource::eGenome_genomic &&
392  return false;
393  }
394  return true;
395 }
396 
397 bool CAdjustForConsensusSpliceSite::s_IsBioseqGood_AdjustEnds(const objects::CBioseq_Handle& bsh)
398 {
399  // must be genomic DNA
400  CConstRef<CSeqdesc> molinfo = bsh.GetCompleteBioseq()->GetClosestDescriptor(CSeqdesc::e_Molinfo);
401  bool set_genomic = molinfo && molinfo->GetMolinfo().IsSetBiomol() && molinfo->GetMolinfo().GetBiomol() == CMolInfo::eBiomol_genomic;
402  if (!set_genomic)
403  return false;
404 
405  if (bsh.GetInst().GetMol() != CSeq_inst::eMol_dna)
406  return false;
407 
408  // no organelle
409  if (validator::IsOrganelle(bsh))
410  return false;
411 
412  CSeqdesc_CI desc(bsh, CSeqdesc::e_Source);
413  if (!desc)
414  return false;
415 
416  const CBioSource& bsrc = desc->GetSource();
417  // must have a taxID
418  if (!bsrc.IsSetOrg())
419  return false;
420  if (bsrc.GetOrg().GetTaxId() == ZERO_TAX_ID)
421  return false;
422 
423  return true;
424 }
425 
namespace {
    /// Test for an acceptor site: 'A' at intron_stop - 1 followed by 'G' at intron_stop.
    ///
    /// @param seqdata     Nucleotide string being inspected
    /// @param intron_stop Index into seqdata of the last intron base
    /// @return true when the two bases read "AG"; false for empty seqdata
    /// @throws std::out_of_range (from string::at) when an inspected index is
    ///         outside seqdata
    bool s_IsAcceptorSpliceSiteOK(const string& seqdata, TSeqPos intron_stop)
    {
        if (seqdata.empty()) {
            return false;
        }
        // the second base is only looked at when the first one matched
        if (seqdata.at(intron_stop - 1) != 'A') {
            return false;
        }
        return seqdata.at(intron_stop) == 'G';
    }

    /// Test for a donor site: 'G' at intron_start followed by 'T' or 'C'.
    ///
    /// @param seqdata      Nucleotide string being inspected
    /// @param intron_start Index into seqdata of the first intron base
    /// @return true when the two bases read "GT" or "GC"; false for empty seqdata
    /// @throws std::out_of_range (from string::at) when an inspected index is
    ///         outside seqdata
    bool s_IsDonorSpliceSiteOK(const string& seqdata, TSeqPos intron_start)
    {
        if (seqdata.empty()) {
            return false;
        }
        // the second base is only looked at when the first one matched
        if (seqdata.at(intron_start) != 'G') {
            return false;
        }
        const char second_base = seqdata.at(intron_start + 1);
        return second_base == 'T' || second_base == 'C';
    }
}
439 
440 bool CAdjustForConsensusSpliceSite::s_IsAdjustedSpliceSitePairOK(const string& seqdata, TSeqPos intron_start, TSeqPos intron_stop)
441 {
442  if (seqdata.empty()) return false;
443  try {
444  return s_IsAcceptorSpliceSiteOK(seqdata, intron_stop) && s_IsDonorSpliceSiteOK(seqdata, intron_start);
445  } catch (const CException& e) {
446  LOG_POST(Error << "Lookup of splice sites failed: " << e.GetMsg());
447  }
448  catch (const exception& e) {
449  LOG_POST(Error << "Lookup of splice sites failed: " << e.what());
450  }
451  return false;
452 }
453 
454 
455 
457 {
458  // don't change the end points of mrna, but the internal intervals should be the same
459  TSeqPos cds_start = loc.GetTotalRange().GetFrom();
460  TSeqPos cds_stop = loc.GetTotalRange().GetTo();
461 
462  TSeqPos mRNA_start = mrna.GetLocation().GetStart(eExtreme_Positional);
463  TSeqPos mRNA_stop = mrna.GetLocation().GetStop(eExtreme_Positional);
464 
465  // if the endpoints of mrna are the same as the cds endpoints, just update the mrna with the new location
466  // also update the partialness of the endpoints
467  if (mRNA_start == cds_start && mRNA_stop == cds_stop) {
468  CRef<CSeq_loc> new_loc(new CSeq_loc);
469  new_loc->Assign(loc);
470 
472  new_loc->SetPartialStart(true, eExtreme_Positional);
473  }
475  new_loc->SetPartialStop(true, eExtreme_Positional);
476  }
477  // not setting the strand, as it should be the same as the CDS's strand
478  // not setting the id either, for the same reason
479 
480  mrna.ResetLocation();
481  mrna.SetLocation(new_loc.GetObject());
482  return true;
483  } else {
484  TLocs mrna_orig_ranges;
485  for (CSeq_loc_CI loc_iter(mrna.GetLocation(), CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Positional); loc_iter; ++loc_iter) {
486  mrna_orig_ranges.push_back(loc_iter.GetRange());
487  }
488 
489  // update the ranges
490  TLocs mrna_ranges;
491  ITERATE(TLocs, iter, m_New_Ranges) {
492  mrna_ranges.push_back(*iter);
493  }
494 
495  TLocs::iterator range_it = mrna_ranges.begin(); // or update it with the orig_ranges
496  if (mRNA_start != range_it->GetFrom())
497  range_it->SetFrom(mRNA_start);
498  range_it = mrna_ranges.end() - 1 ;
499  if (mRNA_stop != range_it->GetTo())
500  range_it->SetTo(mRNA_stop);
501 
502  CRef<CSeq_loc> new_loc(new CSeq_loc);
503  if (mrna.GetLocation().IsMix()) {
504  CSeq_loc::TMix& mix_locs = new_loc->SetMix();
505 
506  TLocs::iterator range_it = mrna_ranges.begin();
507  TLocs::iterator orig_rng_it = mrna_orig_ranges.begin();
509 
510  while (loc_iter && range_it != mrna_ranges.end() && orig_rng_it != mrna_orig_ranges.end()) {
511  CConstRef<CSeq_loc> subloc = loc_iter.GetRangeAsSeq_loc();
512  if (range_it->GetFrom() != orig_rng_it->GetFrom() || range_it->GetTo() != orig_rng_it->GetTo()) {
513  // update the subloc
514  const CSeq_id* seqid = subloc->GetId();
515  if (seqid)
516  mix_locs.AddInterval(*seqid, range_it->GetFrom(), range_it->GetTo(), subloc->GetStrand());
517  } else {
518  if (subloc->IsPnt() || subloc->IsInt()) {
519  mix_locs.AddSeqLoc(*subloc);
520  } // other types are not allowed
521  }
522  ++range_it;
523  ++orig_rng_it;
524  ++loc_iter;
525  }
526  } else if (mrna.GetLocation().IsPacked_int()) {
527  CRef<CPacked_seqint> packed(new CPacked_seqint(const_cast<CSeq_id&>(*(mrna.GetLocation().GetId())), mrna_ranges, mrna.GetLocation().GetStrand()));
528  new_loc->SetPacked_int(packed.GetObject());
529  }
530 
532  new_loc->SetPartialStart(true, eExtreme_Positional);
533  }
535  new_loc->SetPartialStop(true, eExtreme_Positional);
536  }
537  // not setting the strand, as it should be the same as the CDS's strand
538  // not setting the id either, for the same reason
539 
540  mrna.ResetLocation();
541  mrna.SetLocation(new_loc.GetObject());
542  return true;
543  }
544  return false;
545 }
546 
547 
549 {
550  CRef<CSeq_loc> new_loc(new CSeq_loc);
551 
552  if (cds.GetLocation().IsMix()) {
553 
554  CSeq_loc::TMix& mix_locs = new_loc->SetMix();
555 
556  TLocs::iterator range_it = m_New_Ranges.begin();
557  TLocs::iterator orig_rng_it = m_Orig_Ranges.begin();
559 
560  while (loc_iter && range_it != m_New_Ranges.end() && orig_rng_it != m_Orig_Ranges.end()) {
561  CConstRef<CSeq_loc> subloc = loc_iter.GetRangeAsSeq_loc();
562  if (range_it->GetFrom() != orig_rng_it->GetFrom() || range_it->GetTo() != orig_rng_it->GetTo()) {
563  // update the subloc
564  const CSeq_id* seqid = subloc->GetId();
565  if (seqid)
566  mix_locs.AddInterval(*seqid, range_it->GetFrom(), range_it->GetTo(), subloc->GetStrand());
567  } else {
568  if (subloc->IsPnt() || subloc->IsInt()) {
569  mix_locs.AddSeqLoc(*subloc);
570  } // other types are not allowed
571  }
572  ++range_it;
573  ++orig_rng_it;
574  ++loc_iter;
575  }
576  } else if (cds.GetLocation().IsPacked_int()) {
577  CRef<CPacked_seqint> packed(new CPacked_seqint(const_cast<CSeq_id&>(*(cds.GetLocation().GetId())), m_New_Ranges, cds.GetLocation().GetStrand()));
578  new_loc->SetPacked_int(packed.GetObject());
579  }
580  else if (cds.GetLocation().IsInt()) {
581  TLocs::iterator range_it = m_New_Ranges.begin();
582  CRef<CSeq_interval> interval(new CSeq_interval(const_cast<CSeq_id&>(*(cds.GetLocation().GetId())), range_it->GetFrom(), range_it->GetTo(), cds.GetLocation().GetStrand()));
583  new_loc->SetInt(*interval);
584  }
585 
587  new_loc->SetPartialStart(true, eExtreme_Biological);
588  }
590  new_loc->SetPartialStop(true, eExtreme_Biological);
591  }
592 
593  return new_loc;
594 }
595 
597 {
598  m_Orig_Ranges.clear();
599  m_New_Ranges.clear();
600  for (CSeq_loc_CI loc_iter(cds.GetLocation(), CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Biological); loc_iter; ++loc_iter) {
601  m_Orig_Ranges.push_back(loc_iter.GetRange());
602  m_New_Ranges.push_back(loc_iter.GetRange());
603  }
604 }
605 
607 {
608  CRef<CSeq_loc> new_loc(new CSeq_loc);
609 
610  // make/update new location from the new location ranges:
611  if (cds.GetLocation().IsMix()) {
612  CSeq_loc::TLocations& mix_locs = new_loc->SetMix().Set();
613  x_UpdateMixLocations(cds.GetLocation(), mix_locs);
614  }
615  else if (cds.GetLocation().IsPacked_int()) {
616  new_loc->Assign(cds.GetLocation());
617  CSeq_loc::TIntervals& int_locs = new_loc->SetPacked_int().Set();
618  x_UpdateIntervals(int_locs);
619  }
620  else if (cds.GetLocation().IsInt()) {
621  new_loc->Assign(cds.GetLocation());
622  x_UpdateInterval(new_loc->SetInt());
623  }
624  return new_loc;
625 }
626 
627 bool CAdjustForConsensusSpliceSite::x_HasProteinChanged(const CSeq_feat& cds, const string& orig_prot_seq)
628 {
629  CRef<CSeq_feat> new_cds(new CSeq_feat);
630  new_cds->Assign(cds);
631  new_cds->ResetLocation();
632  CRef<CSeq_loc> new_loc = x_CreateNewLocation(cds);
633  new_cds->SetLocation(*new_loc);
634 
635  string new_prot_seq = GetProteinSeq(*new_cds, *m_Scope);
636  return !NStr::EqualNocase(new_prot_seq, orig_prot_seq);
637 }
638 
640 {
641  // A similar function to x_HasProteinChanged() function,
642  // Note that the cds might change, specifically its frame, as the new protein might differ
643  // from the original in one amino acid when adjusting the frame
644 
645  CRef<CSeq_feat> new_cds(new CSeq_feat);
646  new_cds->Assign(cds);
647  new_cds->ResetLocation();
648  CRef<CSeq_loc> new_loc = x_CreateNewLocation(cds);
649  new_cds->SetLocation(*new_loc);
650 
651  string new_prot_seq = GetProteinSeq(*new_cds, *m_Scope);
652  bool changed = !NStr::EqualNocase(new_prot_seq, orig_prot_seq);
653 
654  if (changed) {
655  // the original frame of the CDS
657  if (cds.GetData().GetCdregion().IsSetFrame()) {
658  orig_frame = cds.GetData().GetCdregion().GetFrame();
659  }
660 
661  // check whether by changing the frame, the protein remains the same (might be longer by 1 amino acid)
662  for (int enumI = CCdregion::eFrame_one; enumI < CCdregion::eFrame_three + 1; ++enumI) {
663  CCdregion::EFrame fr = (CCdregion::EFrame) (enumI);
664  new_cds->SetData().SetCdregion().SetFrame(fr);
665 
666  string prot_seq = GetProteinSeq(*new_cds, *m_Scope);
667  if (NStr::EqualNocase(prot_seq, orig_prot_seq)) {
668  if (fr != orig_frame) {
669  cds.SetData().SetCdregion().SetFrame(fr);
670  }
671  changed = false;
672  }
673  else {
674  prot_seq.erase(prot_seq.begin());
675  if (NStr::EqualNocase(prot_seq, orig_prot_seq)) {
676  if (fr != orig_frame) {
677  cds.SetData().SetCdregion().SetFrame(fr);
678  }
679  changed = false;
680  m_ProtChanged = true;
681  }
682  }
683  }
684  }
685  return changed;
686 }
687 
689 {
690  // the new protein might be shorter than the original one
691  CRef<CSeq_feat> new_cds(new CSeq_feat);
692  new_cds->Assign(cds);
693  new_cds->ResetLocation();
694  CRef<CSeq_loc> new_loc = x_CreateNewLocation(cds);
695  new_cds->SetLocation(*new_loc);
696 
697  const string new_prot_seq = GetProteinSeq(*new_cds, *m_Scope);
698  bool changed = !NStr::EqualNocase(new_prot_seq, orig_prot_seq);
699 
700  if (changed) {
701  string modified_prot = new_prot_seq.substr(0, new_prot_seq.length() - 1);
702  if (NStr::EqualNocase(modified_prot, orig_prot_seq)) {
703  changed = false;
704  m_ProtChanged = true;
705  }
706  else {
707  string shorter_orig_prot = orig_prot_seq.substr(0, orig_prot_seq.length() - 1);
708  if (NStr::EqualNocase(new_prot_seq, shorter_orig_prot)) {
709  changed = false;
710  m_ProtChanged = true;
711  }
712  }
713 
714  }
715  return changed;
716 }
717 
719 {
720  if ( ! orig_loc.IsMix())
721  return;
722 
723  locs.clear();
725  TLocs::iterator range_it = m_New_Ranges.begin();
726  TLocs::iterator orig_rng_it = m_Orig_Ranges.begin();
727 
728  while ( loc_iter && range_it != m_New_Ranges.end() && orig_rng_it != m_Orig_Ranges.end()) {
729  CConstRef<CSeq_loc> subloc = loc_iter.GetRangeAsSeq_loc();
730  if (subloc->IsPnt()) {
731  const CSeq_point& pnt = subloc->GetPnt();
732  CRef<CSeq_loc> new_subloc(new CSeq_loc);
733  if (range_it->GetFrom() != orig_rng_it->GetFrom() || range_it->GetTo() != orig_rng_it->GetTo()) {
734  // make a new interval
735  new_subloc->SetInt().SetFrom(pnt.GetPoint());
736  new_subloc->SetInt().SetTo(pnt.GetPoint());
737  new_subloc->SetInt().SetStrand(m_Strand);
738  CRef<CSeq_id> new_id(new CSeq_id);
739  new_id->Assign(pnt.GetId());
740  new_subloc->SetInt().SetId(*new_id);
741 
742  } else {
743  // just copy the old point
744  new_subloc->SetPnt().Assign(pnt);
745  }
746  locs.push_back(new_subloc);
747 
748  } else if (subloc->IsInt()) {
749  CRef<CSeq_loc> new_subloc(new CSeq_loc);
750  new_subloc->Assign(*subloc);
751 
752  if (range_it->GetFrom() != orig_rng_it->GetFrom()) {
753  new_subloc->SetInt().SetFrom(range_it->GetFrom());
754  }
755  if (range_it->GetTo() != orig_rng_it->GetTo()) {
756  new_subloc->SetInt().SetTo(range_it->GetTo());
757  }
758  locs.push_back(new_subloc);
759 
760  } else {
761  // other cases should not be encountered
762  }
763  ++loc_iter;
764  ++range_it;
765  ++orig_rng_it;
766  }
767 
768 }
769 
771 {
772  TLocs::iterator range_it = m_New_Ranges.begin();
773  TLocs::iterator orig_rng_it = m_Orig_Ranges.begin();
774  if (range_it != m_New_Ranges.end() && orig_rng_it != m_Orig_Ranges.end()) {
775  if (range_it->GetFrom() != orig_rng_it->GetFrom()) {
776  interval.SetFrom(range_it->GetFrom());
777  }
778  if (range_it->GetTo() != orig_rng_it->GetTo()) {
779  interval.SetTo(range_it->GetTo());
780  }
781  }
782 }
783 
785 {
786  CSeq_loc::TIntervals::iterator it = locs.begin();
787  TLocs::iterator range_it = m_New_Ranges.begin();
788  TLocs::iterator orig_rng_it = m_Orig_Ranges.begin();
789 
790  while (it != locs.end() && range_it != m_New_Ranges.end() && orig_rng_it != m_Orig_Ranges.end()) {
791  CSeq_interval& interval = **it;
792  if (range_it->GetFrom() != orig_rng_it->GetFrom()) {
793  interval.SetFrom(range_it->GetFrom());
794  }
795  if (range_it->GetTo() != orig_rng_it->GetTo()) {
796  interval.SetTo(range_it->GetTo());
797  }
798  ++it;
799  ++range_it;
800  ++orig_rng_it;
801  }
802 }
803 
804 
806 {
807  if (range.GetFrom() >= range.GetTo())
808  return;
809  if (m_Strand == eNa_strand_minus) {
810  range_prev.SetFrom(range_prev.GetFrom() - offset);
811  range.SetTo(range.GetTo() - offset);
812  } else {
813  range_prev.SetTo(range_prev.GetTo() + offset);
814  range.SetFrom(range.GetFrom() + offset);
815  }
816 }
817 
819 {
820  if (range_prev.GetFrom() >= range_prev.GetTo())
821  return;
822  if (m_Strand == eNa_strand_minus) {
823  range.SetTo(range.GetTo() + offset);
824  range_prev.SetFrom(range_prev.GetFrom() + offset);
825  } else {
826  range.SetFrom(range.GetFrom() - offset);
827  range_prev.SetTo(range_prev.GetTo() - offset);
828  }
829 }
830 
832 {
833  if (rng1.GetFrom() == rng2.GetFrom() && rng1.GetTo() == rng2.GetTo())
834  return 0;
835 
836  TSeqPos intron_len = 0;
837  if (m_Strand == eNa_strand_minus) {
838  intron_len = rng1.GetFrom() - rng2.GetTo() - 1;
839  } else {
840  intron_len = rng2.GetFrom() - rng1.GetTo() - 1;
841  }
842  return intron_len;
843 }
844 
845 
847 {
848  _ASSERT(m_Scope);
849  if (!cds.GetData().IsCdregion() || !cds.IsSetLocation() || !cds.IsSetProduct()) {
850  return false;
851  }
852  if (cds.GetLocation().GetId() == nullptr) {
853  return false;
854  }
855 
856  // not going to handle mixed-strand exons
857  if (cds.GetLocation().GetStrand() == eNa_strand_other) {
858  return false;
859  }
860 
861  // no CDS exceptions set
862  if ((cds.IsSetExcept() && cds.GetExcept()) || cds.IsSetExcept_text())
863  return false;
864 
865  // assumed that the coding region is on one sequence
866  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(cds.GetLocation());
867  if (!bsh) {
868  return false;
869  }
870 
871  CBioseq_Handle product = m_Scope->GetBioseqHandle(cds.GetProduct());
872  if (!product || !product.IsProtein()) {
873  return false;
874  }
875 
876  // obtaining the original protein sequence
879  string orig_prot_seq;
880  prot_vec.GetSeqData(0, prot_vec.size(), orig_prot_seq);
881  if (orig_prot_seq.empty()) {
882  return false;
883  }
884 
885  x_InitRanges(cds);
886  m_Strand = cds.GetLocation().GetStrand();
887 
888  bool changed = false;
889  if (orig_prot_seq.front() != 'M' &&
891  !edit::CLocationEditPolicy::Is5AtEndOfSeq(cds.GetLocation(), bsh)) {
892 
893  changed = x_AdjustCDS5End(cds, orig_prot_seq);
894  }
895 
896  if (orig_prot_seq.back() != '*' &&
898  !edit::CLocationEditPolicy::Is3AtEndOfSeq(cds.GetLocation(), bsh)) {
899 
900  changed |= x_AdjustCDS3End(cds, orig_prot_seq);
901  }
902  if (changed) {
903  CRef<CSeq_loc> new_loc = x_UpdateLocation(cds);
904  cds.ResetLocation();
905  cds.SetLocation(new_loc.GetObject());
906  }
907 
908  return changed;
909 }
910 
911 namespace {
912  // check whether the location is 1 or two bases away from the end of sequence
913  bool IsCloseTo5EndOfSeq(const CSeq_loc& loc, CBioseq_Handle bsh)
914  {
915  if (!bsh) return false;
916 
917  ENa_strand strand = loc.GetStrand();
918  const auto start = loc.GetStart(eExtreme_Biological);
919 
920  const auto seq_length = bsh.GetInst_Length() - 1;
921  bool rval = false;
922 
923  if (strand == eNa_strand_minus) {
924  if (seq_length - start == 1 || seq_length - start == 2) {
925  rval = true;
926  }
927  }
928  else {
929  if (start == 1 || start == 2) {
930  rval = true;
931  }
932  }
933  return rval;
934  }
935 
936  bool IsCloseTo3EndOfSeq(const CSeq_loc& loc, CBioseq_Handle bsh)
937  {
938  if (!bsh) return false;
939 
940  ENa_strand strand = loc.GetStrand();
941  const auto stop = loc.GetStop(eExtreme_Biological);
942 
943  const auto seq_length = bsh.GetInst_Length() - 1;
944  bool rval = false;
945 
946  if (strand == eNa_strand_minus) {
947  if (stop == 1 || stop == 2) {
948  rval = true;
949  }
950  }
951  else {
952  if (seq_length - stop == 1 || seq_length - stop == 2) {
953  rval = true;
954  }
955  }
956  return rval;
957  }
958 }
959 
/// Try to move the 5' end of the first range in m_New_Ranges by one or two
/// bases so that an acceptor splice site ("AG") immediately precedes it.
/// A move is kept only when x_HasProteinChangedAt5End() reports no change;
/// otherwise the range is put back. If no splice site can be used and the
/// location is close to the 5' end of the sequence, extending the range
/// towards the sequence end is tried as a last resort.
///
/// @param cds            Coding region being adjusted (x_HasProteinChangedAt5End
///                       takes it by non-const reference)
/// @param orig_prot_seq  Original protein sequence used for comparison
/// @return true when the first range in m_New_Ranges was changed and accepted
960 bool CAdjustForConsensusSpliceSite::x_AdjustCDS5End(CSeq_feat& cds, const string& orig_prot_seq)
961 {
962  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(cds.GetLocation());
// index of the last base of the sequence (length - 1)
963  TSeqPos length = bsh.GetBioseqLength() - 1;
965 
966  // update the location of the new cds and constantly check whether the protein is still the same as the original one
968 
969  TLocs::iterator range_it = m_New_Ranges.begin();
// snapshot of the first range, used below to undo rejected changes
970  TSeqRange range_orig = *range_it;
971 
972  bool changed = false;
973 
975 
976  if (loc->IsInt() || loc->IsPnt()) {
977 
// sequence data is fetched from position 0 up to the far end of the first exon;
// on the minus strand the coordinates are mirrored with `length`
// (presumably seq_vec is oriented along the CDS strand -- confirm)
978  TSeqPos start = 0;
979  TSeqPos stop = range_it->GetTo();
980  if (m_Strand == eNa_strand_minus) {
981  stop = length - range_it->GetFrom();
982  }
983 
984  string seqdata = kEmptyStr;
985  seq_vec.GetSeqData(start, stop, seqdata);
986 
987  TSeqPos intron_stop = range_it->GetFrom() - 1 - start; // acceptor splice site
988  if (m_Strand == eNa_strand_minus) {
989  intron_stop = length - range_it->GetTo() - 1 - start;
990  }
991 
// TSeqPos arithmetic is presumably unsigned: an index that wrapped below zero
// becomes a huge value and fails the `< seqdata.size()` tests
992  if (intron_stop < seqdata.size() &&
993  intron_stop - 1 < seqdata.size() && // for plus strand
994  s_IsAcceptorSpliceSiteOK(seqdata, intron_stop)) {
995  // no adjustment is needed
996  return changed;
997  }
998 
// try offsets 1 and 2: first extending the exon start, then trimming it
999  TSeqPos offset = 1;
1000  while (offset < 3 && !changed) {
1001  if (intron_stop - offset < seqdata.size() &&
1002  intron_stop - offset - 1 < seqdata.size() &&
1003  s_IsAcceptorSpliceSiteOK(seqdata, intron_stop - offset)) {
1004 
// NOTE(review): the return value of x_ExtendStartOfExon is ignored here, unlike
// in the end-of-sequence loop below; if the range could not be extended the
// unchanged range is still evaluated and may be reported as changed -- confirm
1005  x_ExtendStartOfExon(*range_it, offset, cds.GetLocation(), bsh);
1006  if (x_HasProteinChangedAt5End(cds, orig_prot_seq)) {
1007  // undo the changes to the previous and the current range
// NOTE(review): assuming this resolves to std::swap, range_orig holds the
// *modified* range after this call; any later "undo" in this function would
// then swap the modified value back in. A plain assignment
// (*range_it = range_orig) looks like the intended operation -- confirm
1008  swap(range_orig, *range_it);
1009  }
1010  else {
1011  changed = true;
1012  }
1013  }
1014 
1015  if (!changed &&
1016  intron_stop + offset < seqdata.size() &&
1017  intron_stop + offset + 1 < seqdata.size() &&
1018  s_IsAcceptorSpliceSiteOK(seqdata, intron_stop + offset)) {
1019 
1020  x_TrimStartOfExon(*range_it, offset);
1021  if (x_HasProteinChangedAt5End(cds, orig_prot_seq)) {
1022  // undo the changes to the previous and the current range
1023  swap(range_orig, *range_it);
1024  }
1025  else {
1026  changed = true;
1027  }
1028  }
1029 
1030  if (!changed) ++offset;
1031  }
1032 
1033  // try extending it to the end of sequence
1034  if (!changed && IsCloseTo5EndOfSeq(cds.GetLocation(), bsh)) {
// larger extension first: offset 2, then offset 1
1035  offset = 2;
1036  while (offset > 0 && !changed) {
1037  if (x_ExtendStartOfExon(*range_it, offset, cds.GetLocation(), bsh)) {
1038  if (x_HasProteinChangedAt5End(cds, orig_prot_seq)) {
1039  // undo the changes to the previous and the current range
1040  swap(range_orig, *range_it);
1041  }
1042  else {
1043  changed = true;
1044  }
1045  }
1046  if (!changed) --offset;
1047  }
1048  }
1049  }
1050  return changed;
1051 }
1052 
1053 
1055 {
1056  bool extended = false;
1057  const auto seq_stop = bsh.GetInst_Length() - 1;
1058 
1059  if (m_Strand == eNa_strand_minus) {
1060  if (range.GetTo() + offset <= seq_stop) {
1061  range.SetTo(range.GetTo() + offset);
1062  extended = true;
1063  }
1064  }
1065  else {
1066  if ((int)range.GetFrom() - (int)offset >= 0) {
1067  range.SetFrom(range.GetFrom() - offset);
1068  extended = true;
1069  }
1070  }
1071  return extended;
1072 }
1073 
1075 {
1076  if (m_Strand == eNa_strand_minus) {
1077  range.SetTo(range.GetTo() - offset);
1078  }
1079  else {
1080  range.SetFrom(range.GetFrom() + offset);
1081  }
1082 }
1083 
// Tries to move the 3' end of the CDS (the last entry of m_New_Ranges) by at
// most two bases so that it is followed by a donor splice site, accepting a
// move only when x_HasProteinChangedAt3End() reports that the protein is
// unchanged.
//
// cds            coding region whose location is inspected / re-evaluated
// orig_prot_seq  protein sequence the adjusted CDS is compared against
// Returns true if the last range was modified and the modification was kept,
// false otherwise (including when the donor site is already acceptable).
//
// NOTE: listing lines 1088, 1091 and 1099 are absent from this listing; they
// presumably declare `seq_vec`, `iter` and `loc` used below - TODO confirm
// against the real source file.
1084 bool CAdjustForConsensusSpliceSite::x_AdjustCDS3End(CSeq_feat& cds, const string& orig_prot_seq)
1085 {
1086  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(cds.GetLocation());
 // despite its name, `length` holds the sequence length minus one
1087  TSeqPos length = bsh.GetBioseqLength() - 1;
1089 
1090  // update the location of the new cds and constantly check whether the protein is still the same as the original one
 // position the location iterator on its last interval
1092  iter.SetPos(iter.GetSize() - 1);
1093 
 // the range being adjusted is the last one; keep a copy for rolling back
1094  TLocs::iterator range_it = m_New_Ranges.end() - 1;
1095  TSeqRange range_orig = *range_it;
1096 
1097  bool changed = false;
1098 
1100 
1101  if (loc->IsInt() || loc->IsPnt()) {
 // window of sequence data: from the start of the last exon (mirrored as
 // length - To on the minus strand) up to `stop`
 // NOTE(review): GetSeqData() fills the half-open interval [start, stop), so
 // with stop = (sequence length - 2) the last two bases are not fetched -
 // confirm this is intended.
1102  TSeqPos start = range_it->GetFrom();
1103  TSeqPos stop = length - 1;
1104  if (m_Strand == eNa_strand_minus) {
1105  start = length - range_it->GetTo();
1106  }
1107 
1108  string seqdata = kEmptyStr;
1109  seq_vec.GetSeqData(start, stop, seqdata);
1110 
 // index, relative to `seqdata`, of the first base after the exon
1111  TSeqPos intron_start = range_it->GetTo() + 1 - start; // donor splice site
1112  if (m_Strand == eNa_strand_minus) {
1113  intron_start = length - range_it->GetFrom() + 1 - start;
1114  }
1115 
1116  if (intron_start < seqdata.size() &&
1117  intron_start + 1 < seqdata.size() && // for plus strand
1118  s_IsDonorSpliceSiteOK(seqdata, intron_start)) {
1119  // no adjustment is needed
1120  return changed;
1121  }
1122 
1123 
 // try offsets 1 and 2: first extending the exon, then trimming it.
 // TSeqPos is unsigned, so `intron_start - offset` wraps to a huge value when
 // it would be negative and is then rejected by the size checks.
1124  TSeqPos offset = 1;
1125  while (offset < 3 && !changed) {
1126  if (intron_start + offset < seqdata.size() &&
1127  intron_start + offset + 1 < seqdata.size() &&
1128  s_IsDonorSpliceSiteOK(seqdata, intron_start + offset)) {
1129 
 // NOTE(review): the return value is ignored here; if the extension is
 // refused the range is unchanged and only the protein check decides.
1130  x_ExtendStopOfExon(*range_it, offset, cds.GetLocation(), bsh);
1131  if (x_HasProteinChangedAt3End(cds, orig_prot_seq)) {
1132  // undo the changes to the previous and the current range
 // NOTE(review): swap() also stores the modified range in range_orig, so a
 // later rollback would swap that modified range back into *range_it instead
 // of the original one; a plain assignment (*range_it = range_orig) looks
 // like what is intended - confirm.
1133  swap(range_orig, *range_it);
1134  }
1135  else {
1136  changed = true;
1137  }
1138  }
1139 
1140  if (!changed &&
1141  intron_start - offset < seqdata.size() &&
1142  intron_start - offset + 1 < seqdata.size() &&
1143  s_IsDonorSpliceSiteOK(seqdata, intron_start - offset)) {
1144 
1145  x_TrimStopOfExon(*range_it, offset);
1146  if (x_HasProteinChangedAt3End(cds, orig_prot_seq)) {
1147  // undo the changes to the previous and the current range
1148  swap(range_orig, *range_it);
1149  }
1150  else {
1151  changed = true;
1152  }
1153  }
1154 
1155  if (!changed) ++offset;
1156  }
1157 
1158  // try extending it to the end of sequence
 // fallback without a splice-site check: extend by 2 bases, then by 1
1159  if (!changed && IsCloseTo3EndOfSeq(cds.GetLocation(), bsh)) {
1160  offset = 2;
1161  while (offset > 0 && !changed) {
1162  if (x_ExtendStopOfExon(*range_it, offset, cds.GetLocation(), bsh)) {
1163  if (x_HasProteinChangedAt3End(cds, orig_prot_seq)) {
1164  // undo the changes to the previous and the current range
1165  swap(range_orig, *range_it);
1166  }
1167  else {
1168  changed = true;
1169  }
1170  }
1171  if (!changed) --offset;
1172  }
1173  }
1174  }
1175  return changed;
1176 }
1177 
1179 {
1180  bool extended = false;
1181  const auto seq_stop = bsh.GetInst_Length() - 1;
1182 
1183  if (m_Strand == eNa_strand_minus) {
1184  if ((int)range.GetFrom() - (int)offset >= 0) {
1185  range.SetFrom(range.GetFrom() - offset);
1186  extended = true;
1187  }
1188  }
1189  else {
1190  if (range.GetTo() + offset <= seq_stop) {
1191  range.SetTo(range.GetTo() + offset);
1192  extended = true;
1193  }
1194  }
1195  return extended;
1196 }
1197 
1199 {
1200  if (m_Strand == eNa_strand_minus) {
1201  range.SetFrom(range.GetFrom() + offset);
1202  }
1203  else {
1204  range.SetTo(range.GetTo() - offset);
1205  }
1206 }
1207 
// Body of AdjustmRNAandExonEnds(new_cds, orig_cds); the signature line is
// absent from this listing.
//
// Builds a composite command that (a) changes the mRNA feature so that its
// ends match the ends of new_cds, when such a change is needed, and
// (b) updates the exon features via x_UpdateExonFeatures().
// Returns the composite command; it is returned even if nothing was added
// to it by this function.
1209 {
1210  _ASSERT(m_Scope);
1211  CRef<CCmdComposite> cmd(new CCmdComposite("Update mRNA and exons for consensus splice sites at the ends"));
1212 
 // NOTE: listing line 1213, which declares `mrna`, is absent from this
 // listing; presumably it looks up the mRNA feature belonging to the CDS -
 // TODO confirm against the real source file.
1214  if (mrna) {
 // work on a copy so the original feature is only changed through the command
1215  CRef<CSeq_feat> new_mrna(new CSeq_feat());
1216  new_mrna->Assign(*mrna);
 // only register a change command when the mRNA location actually changed
1217  if (AdjustmRNAToMatchCDSEnds(new_cds, *new_mrna)) {
1218  CSeq_feat_Handle mrnah = m_Scope->GetSeq_featHandle(mrna.GetObject());
1219  CIRef<IEditCommand> chgmRNA(new CCmdChangeSeq_feat(mrnah, *new_mrna));
1220  cmd->AddCommand(*chgmRNA);
1221  }
1222  }
1223 
1224  // update the exon features if there were any
1225  x_UpdateExonFeatures(cmd.GetPointer(), orig_cds, new_cds);
1226  return cmd;
1227 }
1228 
1230 {
1231  const auto& cds_loc = edit_cds.GetLocation();
1232  TSeqPos cds_start = cds_loc.GetStart(eExtreme_Biological);
1233  TSeqPos cds_stop = cds_loc.GetStop(eExtreme_Biological);
1234 
1235  const auto& orig_mrna_loc = mrna.GetLocation();
1236  TSeqPos mrna_start = orig_mrna_loc.GetStart(eExtreme_Biological);
1237  TSeqPos mrna_stop = orig_mrna_loc.GetStop(eExtreme_Biological);
1238 
1239  CRef<CSeq_loc> new_loc(new CSeq_loc);
1240  new_loc->Assign(mrna.GetLocation());
1241 
1242  CSeq_loc_I loc_it(*new_loc);
1243  // adjust the first sub-interval
1244  if (cds_start != mrna_start) {
1245  if (loc_it.IsSetStrand() && loc_it.GetStrand() == eNa_strand_minus) {
1246  loc_it.SetTo(cds_start);
1247  }
1248  else {
1249  loc_it.SetFrom(cds_start);
1250  }
1251  }
1252 
1253  // adjust the last sub-interval
1254  if (cds_stop != mrna_stop) {
1255  auto num_intervals = loc_it.GetSize();
1256  loc_it.SetPos(num_intervals - 1);
1257  if (loc_it.IsSetStrand() && loc_it.GetStrand() == eNa_strand_minus) {
1258  loc_it.SetFrom(cds_stop);
1259  }
1260  else {
1261  loc_it.SetTo(cds_stop);
1262  }
1263  }
1264 
1265  if (loc_it.HasChanges()) {
1266  CRef<CSeq_loc> loc = loc_it.MakeSeq_loc();
1267  mrna.SetLocation(*loc);
1268  return true;
1269  }
1270  return false;
1271 }
1272 
1273 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
USING_SCOPE(objects)
TSeqPos x_IntronLength(const TSeqRange &rng_prev, const TSeqRange &rng)
bool x_HasProteinChanged(const objects::CSeq_feat &cds, const string &orig_prot_seq)
static bool s_IsBioseqGood_AdjustEnds(const objects::CBioseq_Handle &bsh)
void x_InitRanges(const objects::CSeq_feat &cds)
bool AdjustCDS(objects::CSeq_feat &cds)
bool x_HasProteinChangedAt3End(objects::CSeq_feat &cds, const string &orig_prot_seq)
bool x_ExtendStartOfExon(TSeqRange &range, TSeqPos offset, const objects::CSeq_loc &loc, objects::CBioseq_Handle bsh)
CRef< objects::CSeq_loc > x_CreateNewLocation(const objects::CSeq_feat &cds)
void x_UpdateMixLocations(const objects::CSeq_loc &orig_loc, objects::CSeq_loc::TLocations &locs)
void x_UpdateInterval(objects::CSeq_interval &interval)
CRef< CCmdComposite > AdjustmRNAandExonEnds(const objects::CSeq_feat &new_cds, const objects::CSeq_feat &orig_cds)
static bool s_IsBioseqGood_Strict(const objects::CBioseq_Handle &bsh)
static bool s_IsBioseqGood_Relaxed(const objects::CBioseq_Handle &bsh)
void x_ShiftExonPairBackward(TSeqRange &range_prev, TSeqRange &range, TSeqPos offset)
bool x_AlsoAdjustmRNA(objects::CSeq_feat &mrna, const objects::CSeq_loc &loc)
bool x_HasProteinChangedAt5End(objects::CSeq_feat &cds, const string &orig_prot_seq)
CRef< CCmdComposite > GetCommandToAdjustCDSEnds(const objects::CSeq_feat &cds)
void x_TrimStopOfExon(TSeqRange &range, TSeqPos offset)
bool x_ExtendStopOfExon(TSeqRange &range, TSeqPos offset, const objects::CSeq_loc &loc, objects::CBioseq_Handle bsh)
bool x_AdjustCDS3End(objects::CSeq_feat &cds, const string &orig_prot_seq)
bool AdjustmRNAToMatchCDS(const objects::CSeq_feat &edit_cds, objects::CSeq_feat &mrna)
bool x_AdjustCDS5End(objects::CSeq_feat &cds, const string &orig_prot_seq)
void x_TrimStartOfExon(TSeqRange &range, TSeqPos offset)
void x_ShiftExonPairForward(TSeqRange &range_prev, TSeqRange &range, TSeqPos offset)
CRef< CCmdComposite > GetCommand(const objects::CSeq_feat &cds)
AdjustForConsensusSpliceSite Adjust internal intervals of a CDS (and its associated mRNA feature) to ...
static bool s_IsAdjustedSpliceSitePairOK(const string &seqdata, TSeqPos intron_start, TSeqPos intron_stop)
bool AdjustmRNAToMatchCDSEnds(const objects::CSeq_feat &edit_cds, objects::CSeq_feat &mrna)
void x_UpdateIntervals(objects::CSeq_loc::TIntervals &locs)
CRef< objects::CSeq_loc > x_UpdateLocation(const objects::CSeq_feat &cds)
void x_UpdateExonFeatures(CCmdComposite *cmd, const objects::CSeq_feat &orig_cds, const objects::CSeq_feat &new_cds)
bool AdjustCDSEnds(objects::CSeq_feat &cds)
CRef< CCmdComposite > AdjustmRNAandExonFeatures(const objects::CSeq_feat &new_cds, const objects::CSeq_feat &orig_cds)
const string & GetLineage(void) const
Definition: BioSource.cpp:360
bool IsSetLineage(void) const
Definition: BioSource.cpp:355
CBioseq_Handle –.
CFeat_CI –.
Definition: feat_ci.hpp:64
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:593
void AddInterval(const CSeq_id &id, TSeqPos from, TSeqPos to, ENa_strand strand=eNa_strand_unknown)
void AddSeqLoc(const CSeq_loc &other)
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static CS_COMMAND * cmd
Definition: ct_dynamic.c:26
int offset
Definition: replacements.h:160
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CRef< CSeq_loc > MakeSeq_loc(EMakeType make_type=eMake_CompactType) const
return constructed CSeq_loc with all changes
Definition: Seq_loc.cpp:2946
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetFrom(TSeqPos from)
Set the range from position.
Definition: Seq_loc.cpp:2818
void SetMix(TMix &v)
Definition: Seq_loc.hpp:987
void SetTo(TSeqPos to)
Set the range to position.
Definition: Seq_loc.cpp:2829
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
CSeq_loc_mix_Base::Tdata TLocations
Definition: Seq_loc.hpp:99
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
Definition: Seq_loc.cpp:2585
void SetPnt(TPnt &v)
Definition: Seq_loc.hpp:985
bool IsSetStrand(void) const
Get strand.
Definition: Seq_loc.hpp:1049
bool HasChanges(void) const
return true if any part was changed since initialization
Definition: Seq_loc.cpp:2706
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
CPacked_seqint_Base::Tdata TIntervals
Definition: Seq_loc.hpp:98
void SetPos(size_t pos)
Set iterator's position.
Definition: Seq_loc.cpp:2642
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
size_t GetSize(void) const
Get number of ranges.
Definition: Seq_loc.cpp:2636
TRange GetRange(void) const
Get the range.
Definition: Seq_loc.hpp:1042
void SetPartialStart(bool val, ESeqLocExtremes ext)
set / remove e_Lim fuzz on start or stop (lt/gt - indicating partial interval)
Definition: Seq_loc.cpp:3280
ENa_strand GetStrand(void) const
Definition: Seq_loc.hpp:1056
void SetPartialStop(bool val, ESeqLocExtremes ext)
Definition: Seq_loc.cpp:3313
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ eOrder_Biological
Iterate sub-locations in positional order.
Definition: Seq_loc.hpp:462
@ eOrder_Positional
Definition: Seq_loc.hpp:461
CConstRef< CSeq_feat > GetmRNAforCDS(const CSeq_feat &cds, CScope &scope)
GetmRNAforCDS A function to find a CSeq_feat representing the appropriate mRNA for a given CDS.
Definition: sequence.cpp:1261
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
TSeqPos GetBioseqLength(void) const
bool IsProtein(void) const
TInst_Length GetInst_Length(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Ncbi
Set coding to binary coding (Ncbi4na or Ncbistdaa)
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
const CSeq_feat & GetMappedFeature(void) const
Feature mapped to the master sequence.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetCoding(TCoding coding)
TObjectType & GetObject(void)
Get object.
Definition: ncbiobj.hpp:1011
TObjectType & GetObject(void) const
Get object.
Definition: ncbiobj.hpp:1697
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define kEmptyStr
Definition: ncbistr.hpp:123
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
bool IsCdregion(void) const
Check if variant Cdregion is selected.
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
TFrame GetFrame(void) const
Get the Frame member data.
Definition: Cdregion_.hpp:534
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
bool IsSetExcept_text(void) const
explain if except=TRUE Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
const TCdregion & GetCdregion(void) const
Get the variant data.
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
void ResetLocation(void)
Reset Location data member.
Definition: Seq_feat_.cpp:122
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
bool IsSetProduct(void) const
product of process Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
Definition: Cdregion_.hpp:509
bool IsSetLocation(void) const
feature made from Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
@ eFrame_not_set
not set, code uses one
Definition: Cdregion_.hpp:95
@ eFrame_three
reading frame
Definition: Cdregion_.hpp:98
void SetTo(TTo value)
Assign a value to To data member.
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const TPnt & GetPnt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:238
TPoint GetPoint(void) const
Get the Point member data.
Definition: Seq_point_.hpp:303
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_point_.hpp:390
void SetFrom(TFrom value)
Assign a value to From data member.
bool IsPacked_int(void) const
Check if variant Packed_int is selected.
Definition: Seq_loc_.hpp:534
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
bool IsPnt(void) const
Check if variant Pnt is selected.
Definition: Seq_loc_.hpp:540
@ eNa_strand_other
Definition: Na_strand_.hpp:70
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
range(_Ty, _Ty) -> range< _Ty >
const char * command
bool IsOrganelle(int genome)
Definition: utilities.cpp:2831
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
SAnnotSelector –.
#define _ASSERT
Modified on Fri Apr 12 17:19:11 2024 by modify_doxy.py rev. 669887