NCBI C++ ToolKit
variation_util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: variation_util.cpp 55004 2012-07-03 15:27:43Z astashya $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Description:
27  * Sample library
28  *
29  */
30 
31 #include <ncbi_pch.hpp>
32 #include <corelib/ncbiargs.hpp>
33 
36 
43 
44 
46 
53 
54 
56 
58 #include <objects/seq/Seq_data.hpp>
60 #include <objects/seq/Num_ref.hpp>
64 #include <objects/seq/Seq_inst.hpp>
68 
70 
71 #include <serial/iterator.hpp>
72 #include <objmgr/util/sequence.hpp>
73 #include <objmgr/bioseq_handle.hpp>
74 #include <objmgr/seq_vector.hpp>
76 
77 
79 
80 namespace variation_ref {
81 
82 
84 
85 #if 0
// Disabled (compiled out by the surrounding "#if 0") check of the sequence
// asserted by a variation feature against an actual sequence.
//  variation_feat - feature to test; anything that is not a variation yields eNotApplicable
//  asserted_out   - optional; receives the asserted sequence on the first mismatch
//  actual_out     - optional; receives the actual sequence on the first mismatch
// Returns eNotApplicable when no asserted sequence was found, otherwise ePass or eFail.
86 CVariationUtil::ETestStatus CVariationUtil::CheckAssertedAllele(
87  const CSeq_feat& variation_feat,
88  string* asserted_out,
89  string* actual_out)
90 {
    // NOTE(review): this unconditional return makes everything below unreachable.
91  return eNotApplicable;
92  if(!variation_feat.GetData().IsVariation()) {
93  return eNotApplicable;
94  }
95 
    // Work on a copy; fall back to the feature location when the variation has none.
96  CVariation_ref vr;
97  vr.Assign(variation_feat.GetData().GetVariation());
98  if(!vr.IsSetLocation()) {
99  vr.SetLocation().Assign(variation_feat.GetLocation());
100  }
102 
103 
104  bool have_asserted_seq = false;
105  bool is_ok = true;
    // Visit the variation and every nested CVariation_ref.
106  for(CTypeIterator<CVariation_ref> it1(Begin(vr)); it1; ++it1) {
107  const CVariation_ref& vr1 = *it1;
    // NOTE(review): the remainder of this condition (listing lines 109-110) is missing from this copy.
108  if(vr1.GetData().IsInstance()
111  {
112  string asserted_seq;
113  const CSeq_literal& literal = vr1.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral();
114  if(literal.GetSeq_data().IsIupacna()) {
115  asserted_seq = literal.GetSeq_data().GetIupacna();
116  have_asserted_seq = true;
117  } else if(literal.GetSeq_data().IsNcbieaa()) {
118  asserted_seq = literal.GetSeq_data().GetNcbieaa();
119  have_asserted_seq = true;
120  }
121 
122  //an asserted sequence may be of the form "A..BC", where ".." is to be interpreted as a
123  //gap of arbitrary length - we need to match prefix and suffix separately
124  string prefix, suffix;
125  string str_tmp = NStr::Replace(asserted_seq, "..", "\t"); //SplitInTwo's delimiter must be single-character
126  NStr::SplitInTwo(str_tmp, "\t", prefix, suffix);
127 
    // NOTE(review): the declaration of `v` (listing line 128) is missing from this copy;
    // presumably a sequence vector over the variation's location - confirm.
129  string actual_seq;
130  v.GetSeqData(v.begin(), v.end(), actual_seq);
131 
    // Fail when a non-empty prefix or suffix does not match the actual sequence.
132  if( prefix.size() > 0 && !NStr::StartsWith(actual_seq, prefix)
133  || suffix.size() > 0 && !NStr::EndsWith(actual_seq, suffix))
134  {
135  is_ok = false;
136  if(asserted_out) {
137  *asserted_out = asserted_seq;
138  }
139  if(actual_out) {
140  *actual_out = actual_seq;
141  }
142  break;
143  }
144  }
145  }
146 
147  return !have_asserted_seq ? eNotApplicable : is_ok ? ePass : eFail;
148 }
149 #endif
150 
151 
152 /*!
153  * if variation-feat is not intronic, or alignment is not spliced-seg -> eNotApplicable
154  * else if variation is intronic but location is not at exon boundary -> eFail
155  * else -> ePass
156  */
// NOTE(review): the function header (listing line 157) is missing from this copy; the body
// reads `variation_feat` and `aln`, which are presumably its parameters.
158 {
159  if(!variation_feat.GetData().IsVariation() || !aln.GetSegs().IsSpliced()) {
160  return eNotApplicable;
161  }
162 
    // Work on a copy; fall back to the feature location when the variation has none.
163  CVariation_ref vr;
164  vr.Assign(variation_feat.GetData().GetVariation());
165  if(!vr.IsSetLocation()) {
166  vr.SetLocation().Assign(variation_feat.GetLocation());
167  }
169 
    // Collect the product start and end of each exon (nucleotide position, or the
    // amino-acid index for protein positions).
    // NOTE(review): the loop header over the exons (listing line 171) is missing from this copy.
170  set<TSeqPos> exon_terminal_pts;
172  const CSpliced_exon& exon = **it;
173  exon_terminal_pts.insert(exon.GetProduct_start().IsNucpos() ?
174  exon.GetProduct_start().GetNucpos() :
175  exon.GetProduct_start().GetProtpos().GetAmin());
176  exon_terminal_pts.insert(exon.GetProduct_end().IsNucpos() ?
177  exon.GetProduct_end().GetNucpos() :
178  exon.GetProduct_end().GetProtpos().GetAmin());
179  }
180 
181  bool is_intronic = false;
182  bool is_ok = true;
    // Visit the variation and every nested CVariation_ref.
183  for(CTypeIterator<CVariation_ref> it1(Begin(vr)); it1; ++it1) {
184  const CVariation_ref& vr1 = *it1;
185  if(!vr1.GetData().IsInstance()) {
186  continue;
187  }
    // Only instances whose location has a single id equal to alignment row 0 are checked.
188  const CSeq_id* id1 = vr1.GetLocation().GetId();
189  if(!id1 || !aln.GetSeq_id(0).Equals(*id1)) {
190  continue;
191  }
192 
193  if(vr1.GetData().GetInstance().GetDelta().size() == 0) {
194  continue;
195  }
196 
    // An offset action on the first/last delta item marks the instance as intronic.
197  const CDelta_item& first_delta = *vr1.GetData().GetInstance().GetDelta().front();
198  const CDelta_item& last_delta = *vr1.GetData().GetInstance().GetDelta().back();
199 
200  //check intronic offsets for bio-start
201  if(first_delta.IsSetAction() && first_delta.GetAction() == CDelta_item::eAction_offset) {
202  is_intronic = true;
203  if(exon_terminal_pts.find(vr1.GetLocation().GetStart(eExtreme_Biological)) == exon_terminal_pts.end()) {
204  is_ok = false;
205  }
206  }
207 
208  //check intronic offsets for bio-stop
209  if(last_delta.IsSetAction() && last_delta.GetAction() == CDelta_item::eAction_offset) {
210  is_intronic = true;
211  if(exon_terminal_pts.find(vr1.GetLocation().GetStop(eExtreme_Biological)) == exon_terminal_pts.end()) {
212  is_ok = false;
213  }
214  }
215 
    // Stop at the first instance that fails the check.
216  if(!is_ok) {
217  break;
218  }
219  }
220 
221  return !is_intronic ? eNotApplicable : is_ok ? ePass : eFail;
222 }
223 
224 
// For a variation set `v`: replaces v's location with the merged union of its direct
// members' locations, then clears the location of every member whose location equals
// that union. Does nothing when `v` is not a set.
// NOTE(review): the function header (listing line 225) and listing line 235 are missing
// from this copy.
226 {
227  if(!v.GetData().IsSet()) {
228  return;
229  }
230 
231  //round-1: calculate this loc as union of the members
232  CRef<CSeq_loc> aggregate_loc(new CSeq_loc(CSeq_loc::e_Mix));
233  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
234  CVariation_ref& vr = **it;
236  if(vr.IsSetLocation()) {
237  aggregate_loc->Add(vr.GetLocation());
238  }
239  }
    // Sort and merge the collected pieces into the final location.
240  aggregate_loc = aggregate_loc->Merge(CSeq_loc::fSortAndMerge_All, NULL);
241  v.SetLocation(*aggregate_loc);
242 
243  //round-2: reset the set-member locations if they are the same as this
244  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
245  CVariation_ref& vr = **it;
246  if(vr.IsSetLocation() && vr.GetLocation().Equals(v.GetLocation())) {
247  vr.ResetLocation();
248  }
249  }
250 }
251 
// For a variation set `v`: copies v's location into every direct member that has no
// location of its own. Does nothing when `v` is not a set.
// NOTE(review): the function header (listing line 252) and listing line 263 are missing
// from this copy.
253 {
254  if(!v.GetData().IsSet()) {
255  return;
256  }
257 
258  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
259  CVariation_ref& vr = **it;
260  if(!vr.IsSetLocation()) {
261  vr.SetLocation().Assign(v.GetLocation());
262  }
264  }
265 }
266 
// Folds offset-type delta items into the variation's location: each offset delta at the
// front/back of an instance shifts the location and is then removed from the delta list.
// Sets are processed recursively, passing down the effective location (own location if
// set, otherwise `parent_variation_loc`).
// NOTE(review): the function header (listing line 267) is missing from this copy.
268 {
269  const CSeq_loc& variation_loc = v.IsSetLocation() ? v.GetLocation() : parent_variation_loc;
270 
271  if(v.GetData().IsSet()) {
272  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
273  s_ResolveIntronicOffsets(**it, variation_loc);
274  }
275  } else if(v.GetData().IsInstance()) {
    // NOTE(review): front() is taken without checking that the delta list is non-empty.
276  const CDelta_item& delta_first = *v.GetData().GetInstance().GetDelta().front();
277 
278  if(variation_loc.IsPnt() && delta_first.IsSetAction() && delta_first.GetAction() == CDelta_item::eAction_offset) {
279  if(!v.IsSetLocation()) {
280  v.SetLocation().Assign(variation_loc);
281  }
    // offset = literal length * multiplier (default 1), negated on the minus strand
282  int offset = delta_first.GetSeq().GetLiteral().GetLength()
283  * (delta_first.IsSetMultiplier() ? delta_first.GetMultiplier() : 1)
284  * (v.GetLocation().GetStrand() == eNa_strand_minus ? -1 : 1);
285  v.SetLocation().SetPnt().SetPoint() += offset;
286  v.SetData().SetInstance().SetDelta().pop_front();
287  } else {
288  //If the location is not a point, then the offset(s) apply to start and/or stop individually
289  if(delta_first.IsSetAction() && delta_first.GetAction() == CDelta_item::eAction_offset) {
290  if(!v.IsSetLocation()) {
291  v.SetLocation().Assign(variation_loc);
292  }
    // NOTE(review): the declaration of `range_loc` (listing line 293) is missing from this copy.
    // The biological start is the interval's "to" on the minus strand, otherwise its "from".
294  TSeqPos& bio_start = range_loc->GetStrand() == eNa_strand_minus ? range_loc->SetInt().SetTo() : range_loc->SetInt().SetFrom();
295  int offset = delta_first.GetSeq().GetLiteral().GetLength()
296  * (delta_first.IsSetMultiplier() ? delta_first.GetMultiplier() : 1)
297  * (range_loc->GetStrand() == eNa_strand_minus ? -1 : 1);
298  bio_start += offset;
299  v.SetLocation().Assign(*range_loc);
300  v.SetData().SetInstance().SetDelta().pop_front();
301  }
302 
    // NOTE(review): back() is taken without a non-empty check; if the pop_front() above
    // removed the only item, the list is empty here - confirm callers exclude that case.
303  const CDelta_item& delta_last = *v.GetData().GetInstance().GetDelta().back();
304  if(delta_last.IsSetAction() && delta_last.GetAction() == CDelta_item::eAction_offset) {
305  if(!v.IsSetLocation()) {
306  v.SetLocation().Assign(variation_loc);
307  }
    // NOTE(review): the declaration of `range_loc` (listing line 308) is missing from this copy.
    // The biological end is the interval's "from" on the minus strand, otherwise its "to".
309  TSeqPos& bio_end = range_loc->GetStrand() == eNa_strand_minus ? range_loc->SetInt().SetFrom() : range_loc->SetInt().SetTo();
310  int offset = delta_last.GetSeq().GetLiteral().GetLength()
311  * (delta_last.IsSetMultiplier() ? delta_last.GetMultiplier() : 1)
312  * (range_loc->GetStrand() == eNa_strand_minus ? -1 : 1);
313  bio_end += offset;
314  v.SetLocation().Assign(*range_loc);
315  v.SetData().SetInstance().SetDelta().pop_back();
316  }
317  }
318  }
319 }
320 
321 
// Moves the start/stop of an instance's location onto a nearby exon boundary of the
// spliced-seg `ss` and records the displacement as offset delta items. Sets are processed
// recursively, passing down the effective location (own location if set, otherwise
// `parent_variation_loc`).
// Throws CArgException when the location's id is not the spliced-seg's genomic id.
322 void CVariationUtil::s_AddIntronicOffsets(CVariation_ref& v, const CSpliced_seg& ss, const CSeq_loc& parent_variation_loc)
323 {
324  const CSeq_loc& vloc = v.IsSetLocation() ? v.GetLocation() : parent_variation_loc;
325 
326  if(v.GetData().IsSet()) {
327  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
328  s_AddIntronicOffsets(**it, ss, vloc);
329  }
330  } else if(v.GetData().IsInstance()) {
331  if(!vloc.GetId() || !vloc.GetId()->Equals(ss.GetGenomic_id()))
332  {
333  NCBI_THROW(CArgException, eInvalidArg, "Expected genomic_id in the variation to be the same as in spliced-seg");
334  }
335 
336  long start = vloc.GetStart(eExtreme_Positional);
337  long stop = vloc.GetStop(eExtreme_Positional);
338 
    // NOTE(review): both candidates start at 0, so position 0 competes as a "boundary";
    // for a location closer to 0 than to any exon end the result stays 0 - confirm intended.
339  long closest_start = 0; //closest-exon-boundary for bio-start of variation location
340  long closest_stop = 0; //closest-exon-boundary for bio-stop of variation location
341 
    // NOTE(review): the loop header over the exons (listing line 342) is missing from this copy.
343  const CSpliced_exon& se = **it;
344 
345  if(se.GetGenomic_end() >= start && se.GetGenomic_start() <= start) {
346  closest_start = start; //start is within exon - use itself.
347  } else {
    // otherwise keep whichever exon end is nearer than the current candidate
348  if(abs((long)se.GetGenomic_end() - start) < abs(closest_start - start)) {
349  closest_start = (long)se.GetGenomic_end();
350  }
351  if(abs((long)se.GetGenomic_start() - start) < abs(closest_start - start)) {
352  closest_start = (long)se.GetGenomic_start();
353  }
354  }
355 
356  if(se.GetGenomic_end() >= stop && se.GetGenomic_start() <= stop) {
357  closest_stop = stop; //end is within exon - use itself.
358  } else {
359  if(abs((long)se.GetGenomic_end() - stop) < abs(closest_stop - stop)) {
360  closest_stop = (long)se.GetGenomic_end();
361  }
362  if(abs((long)se.GetGenomic_start() - stop) < abs(closest_stop - stop)) {
363  closest_stop = (long)se.GetGenomic_start();
364  }
365  }
366  }
367 
368  //adjust location
369  if(start != closest_start || stop != closest_stop) {
    // NOTE(review): the declaration of `loc` (listing line 370) is missing from this copy.
371  loc->SetInt().SetFrom(closest_start);
372  loc->SetInt().SetTo(closest_stop);
373  v.SetLocation().Assign(*loc);
374  }
375 
376  //add offsets
377  if(start != closest_start) {
378  int offset = start - closest_start;
    // NOTE(review): creation of `delta` (listing lines 379-380) is missing from this copy.
381  delta->SetSeq().SetLiteral().SetLength(abs(offset));
382 
    // The literal holds |offset|; direction is carried by a multiplier of -1 when the
    // strand-adjusted offset is negative.
383  int sign = (v.GetLocation().GetStrand() == eNa_strand_minus ? -1 : 1) * (offset < 0 ? -1 : 1);
384  if(sign < 0) {
385  delta->SetMultiplier(-1);
386  }
    // The positional start is the biological stop on the minus strand, so its offset is
    // appended at the back there and at the front otherwise.
387  if(v.GetLocation().GetStrand() == eNa_strand_minus) {
388  v.SetData().SetInstance().SetDelta().push_back(delta);
389  } else {
390  v.SetData().SetInstance().SetDelta().push_front(delta);
391  }
392  }
393 
    // A second offset is added only when start and stop differ.
394  if(stop != closest_stop && start != stop) {
395  int offset = stop - closest_stop;
    // NOTE(review): creation of `delta` (listing lines 396-397) is missing from this copy.
398  delta->SetSeq().SetLiteral().SetLength(abs(offset));
399  int sign = (v.GetLocation().GetStrand() == eNa_strand_minus ? -1 : 1) * (offset < 0 ? -1 : 1);
400  if(sign < 0) {
401  delta->SetMultiplier(-1);
402  }
403  if(v.GetLocation().GetStrand() == eNa_strand_minus) {
404  v.SetData().SetInstance().SetDelta().push_front(delta);
405  } else {
406  v.SetData().SetInstance().SetDelta().push_back(delta);
407  }
408  }
409  }
410 }
411 
412 
// Returns true when, ignoring strand, subtracting `bb` from `aa` leaves nothing: the
// difference is either an unset location or has zero length.
// NOTE(review): the declarations of `a` and `b` (listing lines 415 and 419) are missing
// from this copy; they receive strand-less copies of `aa` and `bb`.
413 bool IsFirstSubsetOfSecond(const CSeq_loc& aa, const CSeq_loc& bb)
414 {
416  a->Assign(aa);
417  a->ResetStrand();
418 
420  b->Assign(bb);
421  b->ResetStrand();
422 
423  CRef<CSeq_loc> sub_loc = a->Subtract(*b, CSeq_loc::fSortAndMerge_All, NULL, NULL);
424  return !sub_loc->Which() || sequence::GetLength(*sub_loc, NULL) == 0;
425 }
426 
427 
428 void CVariationUtil::s_Remap(CVariation_ref& vr, CSeq_loc_Mapper& mapper, const CSeq_loc& parent_variation_loc)
429 {
430  const CSeq_loc& variation_loc = vr.IsSetLocation() ? vr.GetLocation() : parent_variation_loc;
431 
432  if(vr.GetData().IsSet()) {
433  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, vr.SetData().SetSet().SetVariations()) {
434  s_Remap(**it, mapper, variation_loc);
435  }
436  } else if(vr.GetData().IsInstance()) {
437  //remap inst: process inst's locations in delta that are subset of the variation-loc.
438  NON_CONST_ITERATE(CVariation_inst::TDelta, it, vr.SetData().SetInstance().SetDelta()) {
439  CDelta_item& di = **it;
440  if(!di.IsSetSeq() || !di.GetSeq().IsLoc() || !IsFirstSubsetOfSecond(di.GetSeq().GetLoc(), variation_loc)) {
441  continue;
442  }
443  CRef<CSeq_loc> mapped_loc = mapper.Map(di.GetSeq().GetLoc());
444  CRef<CSeq_loc> merged_mapped_loc = sequence::Seq_loc_Merge(*mapped_loc, CSeq_loc::fSortAndMerge_All, NULL);
445  di.SetSeq().SetLoc().Assign(*merged_mapped_loc);
446  }
447  }
448 
449  //remap the location.
450  if(vr.IsSetLocation()) {
451  CRef<CSeq_loc> mapped_loc = mapper.Map(vr.GetLocation());
452  CRef<CSeq_loc> merged_mapped_loc = sequence::Seq_loc_Merge(*mapped_loc, CSeq_loc::fSortAndMerge_All, NULL);
453  vr.SetLocation().Assign(*merged_mapped_loc);
454  }
455 }
456 
457 
// Returns a copy of `variation_feat` remapped through the alignment `aln`.
// The feature location is copied into the root variation for the duration of the
// remapping and moved back to the feature before returning. The pre-mapping location is
// recorded in the root variation's ext-locs under the id "mapped-from".
// Throws CArgException when the feature is not a variation or its location has no unique
// seq-id, and CException when no alignment row is selected for that seq-id.
// NOTE(review): several statements of this function are missing from this copy; see the
// notes below.
458 CRef<CSeq_feat> CVariationUtil::Remap(const CSeq_feat& variation_feat, const CSeq_align& aln)
459 {
460  CRef<CSeq_feat> feat(new CSeq_feat);
461  feat->Assign(variation_feat);
462 
463  if(!feat->GetData().IsVariation()) {
464  NCBI_THROW(CArgException, eInvalidArg, "feature must be of variation-feat type");
465  }
466 
467  CVariation_ref& vr = feat->SetData().SetVariation();
468 
469  //copy the feature's location to root variation's for remapping (will move back when done)
470  vr.SetLocation().Assign(feat->GetLocation());
471  if(!vr.GetLocation().GetId()) {
472  NCBI_THROW(CArgException, eInvalidArg, "Can't get unique seq-id for location");
473  }
474 
475  //todo: propagation and factoring of locs later in this method is required for
476  //proper processing of intronic offsets. perhaps this can be addressed in the respective *IntronicOffsets
477  //functions.
    // NOTE(review): listing line 478 is missing from this copy.
479 
    // Source is the genomic side of a spliced alignment.
    // NOTE(review): the body of this branch (listing line 481) is missing from this copy.
480  if(aln.GetSegs().IsSpliced() && aln.GetSegs().GetSpliced().GetGenomic_id().Equals(*vr.GetLocation().GetId())) {
482  }
483 
    // Choose the target row as the opposite of the row examined in the loop.
    // NOTE(review): the condition inside the loop (listing line 486) is missing from this copy.
484  CSeq_align::TDim target_row = -1;
485  for(int i = 0; i < 2; i++) {
487  target_row = 1 - i;
488  }
489  }
490  if(target_row == -1) {
491  NCBI_THROW(CException, eUnknown, "The alignment has no row for seq-id " + vr.GetLocation().GetId()->AsFastaString());
492  }
493 
494  CRef<CSeq_loc_Mapper> mapper(new CSeq_loc_Mapper(aln, target_row, m_scope));
495 
496  //save the original in ext-locs (for root variation only)
    // NOTE(review): the definition of TExtLoc (listing line 497) is missing from this copy.
498  TExtLoc ext_loc(new TExtLoc::TObjectType);
499  ext_loc->SetId().SetStr("mapped-from");
500  ext_loc->SetLocation().Assign(vr.GetLocation());
501  vr.SetExt_locs().push_back(ext_loc);
502 
503  s_Remap(vr, *mapper, vr.GetLocation());
504 
    // Result is on the genomic side of a spliced alignment.
    // NOTE(review): the body of this branch (listing line 509) is missing from this copy.
505  if(vr.GetLocation().GetId()
506  && aln.GetSegs().IsSpliced()
507  && aln.GetSegs().GetSpliced().GetGenomic_id().Equals(*vr.GetLocation().GetId()))
508  {
510  }
511 
512  //Note that at this point, if we started with a genomic variation in an intron,
513  //if we remapped to cDNA, the remapped location for the root variation set that
514  //has no offsets applied will be NULL, but the inst-specific subvariations will
515  //have locs adjusted into exon and offsets applied. After factoring-out the root
516  //location will inherit the exonic base-locations.
    // NOTE(review): listing line 517 is missing from this copy.
518 
519  //transfer the root location of the variation back to feat.location
520  feat->SetLocation(feat->SetData().SetVariation().SetLocation());
521  feat->SetData().SetVariation().ResetLocation();
522 
523  return feat;
524 }
525 
526 
527 
// Builds the upstream and downstream flanking locations of `loc`, each reaching up to
// `len` positions beyond it and clamped to [0, max_pos]. A flank becomes a Null location
// when `loc` already touches that end of the sequence.
// NOTE(review): the function header (listing line 528) and the declarations of `start`,
// `stop`, `bsh` and `first` (listing lines 530-533, 547) are missing from this copy.
529 {
532 
    // last valid position on the sequence
534  TSignedSeqPos max_pos = bsh.GetInst_Length() - 1;
535 
    // Start with one interval covering [start - len, stop + len], clamped to the sequence.
536  SFlankLocs flanks;
537  flanks.upstream.Reset(new CSeq_loc);
538  flanks.upstream->SetInt().SetId().Assign(sequence::GetId(loc, NULL));
539  flanks.upstream->SetInt().SetStrand(sequence::GetStrand(loc, NULL));
540  flanks.upstream->SetInt().SetTo(min(max_pos, stop + (TSignedSeqPos)len));
541  flanks.upstream->SetInt().SetFrom(max((TSignedSeqPos)0, start - (TSignedSeqPos)len));
542 
543  flanks.downstream.Reset(new CSeq_loc);
544  flanks.downstream->Assign(*flanks.upstream);
545 
    // `second` is the flank at higher coordinates: upstream on the minus strand,
    // downstream otherwise.
546  CSeq_loc& second = sequence::GetStrand(loc, NULL) == eNa_strand_minus ? *flanks.upstream : *flanks.downstream;
548 
    // Trim `first` to end just before `loc`; nothing is left when loc starts at 0.
549  if(start == 0) {
550  first.SetNull();
551  } else {
552  first.SetInt().SetTo(start - 1);
553  }
554 
    // Trim `second` to begin just after `loc`; nothing is left when loc ends at max_pos.
555  if(stop == max_pos) {
556  second.SetNull();
557  } else {
558  second.SetInt().SetFrom(stop + 1);
559  }
560 
561  return flanks;
562 }
563 
564 
565 
566 ///////////////////////////////////////////////////////////////////////////////
567 //
568 // Methods and functions pertaining to converting protein variation in precursor coords
569 //
570 ///////////////////////////////////////////////////////////////////////////////
571 void CVariationUtil::s_UntranslateProt(const string& prot_str, vector<string>& codons)
572 {
573  if(prot_str.size() != 1) {
574  NCBI_THROW(CException, eUnknown, "Expected prot_str of length 1");
575  }
576 
577  static const char* alphabet = "ACGT";
578  string codon = "AAA";
579  CSeqTranslator translator;
580  for(size_t i0 = 0; i0 < 4; i0++) {
581  codon[0] = alphabet[i0];
582  for(size_t i1 = 0; i1 < 4; i1++) {
583  codon[1] = alphabet[i1];
584  for(size_t i2 = 0; i2 < 4; i2++) {
585  codon[2] = alphabet[i2];
586  string prot("");
587  translator.Translate(codon, prot, CSeqTranslator::fIs5PrimePartial);
588  NStr::ReplaceInPlace(prot, "*", "X"); //Conversion to IUPAC produces "X", but Translate produces "*"
589 
590  //LOG_POST(">>>" << codon << " " << prot << " " << prot_str);
591  if(prot == prot_str) {
592  codons.push_back(codon);
593  }
594  }
595  }
596  }
597 }
598 
599 size_t CVariationUtil::s_CountMatches(const string& a, const string& b)
600 {
601  size_t count(0);
602  for(size_t i = 0; i < min(a.size(), b.size()); i++ ) {
603  if(a[i] == b[i]) {
604  count++;
605  }
606  }
607  return count;
608 }
609 
// Chooses, among all codons that translate to `prot_to`, those sharing the most bases with
// `codon_from`, excluding `codon_from` itself. `codons_to` is cleared and then filled with
// every candidate tied for the best match count.
// NOTE(review): the first header line (listing line 610) is missing from this copy.
611  const string& codon_from, //codon on cDNA
612  const string& prot_to, //missense/nonsense AA
613  vector<string>& codons_to) //calculated variation-codons
614 {
615  vector<string> candidates1;
616  size_t max_matches(0);
617  s_UntranslateProt(prot_to, candidates1);
618  codons_to.clear();
619 
620  ITERATE(vector<string>, it1, candidates1) {
621  size_t matches = s_CountMatches(codon_from, *it1);
622 
623 // LOG_POST("CalcPrecursorVariationCodon:" << codon_from << " " << prot_to << " " << *it1 << " " << matches);
624  if(matches == 3) {
625  //all three bases in a codon matched - we must be processing a silent mutation.
626  //in this case we want to consider candidate codons other than itself.
627  continue;
628  }
629 
    // A strictly better candidate discards the previous ones; ties accumulate.
630  if(matches >= max_matches) {
631  if(matches > max_matches) {
632  codons_to.clear();
633  }
634  codons_to.push_back(*it1);
635  max_matches = matches;
636  }
637  }
638 }
639 
640 string CVariationUtil::s_CollapseAmbiguities(const vector<string>& seqs)
641 {
642  string collapsed_seq;
643 
644  vector<int> bits; //4-bit bitmask denoting whether a nucleotide occurs at this pos at any seq
645 
646  typedef const vector<string> TConstStrs;
647  ITERATE(TConstStrs, it, seqs) {
648  const string& seq = *it;
649  if(seq.size() > bits.size()) {
650  bits.resize(seq.size());
651  }
652 
653  for(size_t i= 0; i < seq.size(); i++) {
654  char nt = seq[i];
655  int m = (nt == 'T' ? 1
656  : nt == 'G' ? 2
657  : nt == 'C' ? 4
658  : nt == 'A' ? 8 : 0);
659  if(!m) {
660  NCBI_THROW(CException, eUnknown, "Expected [ACGT] alphabet");
661  }
662 
663  bits[i] |= m;
664  }
665  }
666 
667  static const char* iupac_bases = "NTGKCYSBAWRDMHVN";
668  collapsed_seq.resize(bits.size());
669  for(size_t i = 0; i < collapsed_seq.size(); i++) {
670  collapsed_seq[i] = iupac_bases[bits[i]];
671  }
672  return collapsed_seq;
673 }
674 
675 
677 {
678  if(v.GetData().IsSet()) {
679  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
680  CVariation_ref& v2 = **it;
682  }
683  } else if(v.GetData().IsInstance()) {
684  if(!v.GetData().GetInstance().GetDelta().size() == 1) {
685  NCBI_THROW(CArgException, eInvalidArg, "Expected single-element delta");
686  }
687 
688  const CDelta_item& delta = *v.GetData().GetInstance().GetDelta().front();
689  if(delta.IsSetAction() && delta.GetAction() != CDelta_item::eAction_morph) {
690  NCBI_THROW(CArgException, eInvalidArg, "Expected morph action");
691  }
692 
693  if(!delta.IsSetSeq() || !delta.GetSeq().IsLiteral() || delta.GetSeq().GetLiteral().GetLength() != 1) {
694  NCBI_THROW(CArgException, eInvalidArg, "Expected literal of length 1 in inst.");
695  }
696 
697  CSeq_data variant_prot_seq;
698  CSeqportUtil::Convert(delta.GetSeq().GetLiteral().GetSeq_data(), &variant_prot_seq, CSeq_data::e_Iupacaa);
699 
700  if(sequence::GetLength(v.GetLocation(), NULL) != 1) {
701  NCBI_THROW(CArgException, eInvalidArg, "Expected single-aa location");
702  }
703 
705  //note: sel by product; location is prot; found feature is mrna having this prot as product
706  CRef<CSeq_loc_Mapper> prot2precursor_mapper;
707  for(CFeat_CI ci(*m_scope, v.GetLocation(), sel); ci; ++ci) {
708  prot2precursor_mapper.Reset(new CSeq_loc_Mapper(ci->GetMappedFeature(), CSeq_loc_Mapper::eProductToLocation, m_scope));
709  break;
710  }
711 
712  if(!prot2precursor_mapper) {
713  NCBI_THROW(CException, eUnknown, "Can't create prot2precursor mapper. Is this a prot?");
714  }
715 
716  CRef<CSeq_loc> nuc_loc = prot2precursor_mapper->Map(v.GetLocation());
717  if(!nuc_loc->IsInt() || sequence::GetLength(*nuc_loc, NULL) != 3) {
718  NCBI_THROW(CException, eUnknown, "AA does not remap to a single codon.");
719  }
720 
722 
723  string original_allele_codon; //nucleotide allele on the sequence
724  seqv.GetSeqData(seqv.begin(), seqv.end(), original_allele_codon);
725 
726  vector<string> variant_codons;
727  s_CalcPrecursorVariationCodon(original_allele_codon, variant_prot_seq.GetIupacaa(), variant_codons);
728 
729  string variant_codon = s_CollapseAmbiguities(variant_codons);
730 
731  //If the original and variant codons have terminal bases shared, we can truncate the variant codon and location accordingly.
732  while(variant_codon.length() > 1 && variant_codon.at(0) == original_allele_codon.at(0)) {
733  variant_codon = variant_codon.substr(1);
734  original_allele_codon = variant_codon.substr(1);
735  if(nuc_loc->GetStrand() == eNa_strand_minus) {
736  nuc_loc->SetInt().SetTo()--;
737  } else {
738  nuc_loc->SetInt().SetFrom()++;
739  }
740  }
741  while(variant_codon.length() > 1 &&
742  variant_codon.at(variant_codon.length() - 1) == original_allele_codon.at(original_allele_codon.length() - 1))
743  {
744  variant_codon.resize(variant_codon.length() - 1);
745  original_allele_codon.resize(variant_codon.length() - 1);
746  //Note: normally given a protein, the parent will be a mRNA and the CDS location
747  //will have plus strand; however, the parent could be MT, so we can't assume plus strand
748  if(nuc_loc->GetStrand() == eNa_strand_minus) {
749  nuc_loc->SetInt().SetFrom()++;
750  } else {
751  nuc_loc->SetInt().SetTo()--;
752  }
753  }
754 
755  CRef<CDelta_item> delta2(new CDelta_item);
756  delta2->SetSeq().SetLiteral().SetLength(variant_codon.length());
757  delta2->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set(variant_codon);
758 
760 
761  //merge loc to convert int of length 1 to a pnt as necessary
762  v2->SetLocation(*sequence::Seq_loc_Merge(*nuc_loc, CSeq_loc::fSortAndMerge_All, NULL));
763  CVariation_inst& inst2 = v2->SetData().SetInstance();
764  inst2.SetType(variant_codon.length() == 1 ? CVariation_inst::eType_snv : CVariation_inst::eType_mnp);
765  inst2.SetDelta().push_back(delta2);
766 
767  if(v.GetData().GetInstance().IsSetObservation()) {
769  }
770 
771  v.Assign(*v2);
772  }
773 }
774 
775 //vr must be a prot missense or nonsense (inst) with location set; inst must not have offsets.
// Returns a new variation feature holding the nucleotide-level equivalent of
// `prot_variation_feat`. The feature location is copied into the variation, propagated to
// its members, converted with x_ProtToPrecursor, factored out again and finally moved back
// to the feature.
// Throws CArgException when the input feature is not a variation.
// NOTE(review): the function header (listing line 776) is missing from this copy.
777 {
778  if(!prot_variation_feat.GetData().IsVariation()) {
779  NCBI_THROW(CArgException, eInvalidArg, "Expected variation-feature");
780  }
781 
782  CRef<CSeq_feat> nuc_feat(new CSeq_feat);
783  CVariation_ref& nuc_vr = nuc_feat->SetData().SetVariation();
784 
785  nuc_vr.Assign(prot_variation_feat.GetData().GetVariation());
786  nuc_vr.SetLocation().Assign(prot_variation_feat.GetLocation());
787  s_PropagateLocsInPlace(nuc_vr);
788  x_ProtToPrecursor(nuc_vr);
789  s_FactorOutLocsInPlace(nuc_vr);
    // The location belongs on the feature, not on the root variation.
790  nuc_feat->SetLocation().Assign(nuc_vr.GetLocation());
791  nuc_vr.ResetLocation();
792  return nuc_feat;
793 }
794 
795 
// Returns a seq-literal holding the sequence read from `v`: NCBIeaa data when `v` is
// protein, IUPACna data when it is nucleotide. In any other case only the length is set.
// NOTE(review): the function header (listing line 796) and the declarations of `v` and
// `literal` (listing lines 798 and 801) are missing from this copy; `v` is presumably a
// sequence vector over the requested location - confirm.
797 {
799  string seq;
800  v.GetSeqData(v.begin(), v.end(), seq);
802  literal->SetLength(seq.length());
803  if(v.IsProtein()) {
804  literal->SetSeq_data().SetNcbieaa().Set(seq);
805  } else if (v.IsNucleotide()) {
806  literal->SetSeq_data().SetIupacna().Set(seq);
807  }
808  return literal;
809 }
810 
// Returns a new literal `c` representing `a` followed by `b`. When either input has
// length 0 the result is a copy of the other one. Otherwise the lengths are added and, if
// either input carries a fuzz, the result's fuzz is set to eLim_unk.
// NOTE(review): the function header (listing line 811), the declaration of `c` (listing
// line 813) and the start of the call that combines the seq-data (listing line 820) are
// missing from this copy.
812 {
814 
815  if(b.GetLength() == 0) {
816  c->Assign(a);
817  } else if(a.GetLength() == 0) {
818  c->Assign(b);
819  } else {
    // trailing arguments of the missing call: full extent of a, then full extent of b
821  a.GetSeq_data(), 0, a.GetLength(),
822  b.GetSeq_data(), 0, b.GetLength());
823 
824  c->SetLength(a.GetLength() + b.GetLength());
825 
826  if(a.IsSetFuzz() || b.IsSetFuzz()) {
827  c->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
828  }
829  }
830  return c;
831 }
832 
833 
834 
835 /*!
836  * Convert any simple nucleotide variation to delins form, if possible; throw if not.
837  * Precondition: location must be set.
838  */
// Sets are converted member by member. For an instance the single delta item is rewritten
// so that it holds an explicit literal with no multiplier; an "ins-before" action is turned
// into a replacement of the location by inserted sequence + original sequence.
// Throws CArgException for deltas with more than one item and for negative multipliers.
// NOTE(review): the function header (listing line 839) and listing lines 841, 848, 851,
// 862, 870, 873 and 895 are missing from this copy.
840 {
842  if(v.GetData().IsSet()) {
843  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
844  ChangeToDelins(**it);
845  }
846  } else if(v.GetData().IsInstance()) {
847  CVariation_inst& inst = v.SetData().SetInstance();
849 
    // An empty delta gets an explicit empty literal.
    // NOTE(review): the declaration of `di` for this branch (listing line 851) is missing.
850  if(inst.GetDelta().size() == 0) {
852  di->SetSeq().SetLiteral().SetLength(0);
853  di->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set("");
854  inst.SetDelta().push_back(di);
855  } else if(inst.GetDelta().size() > 1) {
856  NCBI_THROW(CArgException, eInvalidArg, "Deltas of length >1 are not supported");
857  } else {
858  CDelta_item& di = *inst.SetDelta().front();
859 
860 
861  //convert 'del' to 'replace-with-empty-literal'
    // NOTE(review): the condition opening this block (listing line 862) is missing.
863  di.ResetAction();
864  di.SetSeq().SetLiteral().SetLength(0);
865  di.SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set("");
866  }
867 
868  //convert 'loc' or 'this'-based deltas to literals
    // NOTE(review): the declarations of `literal` in both branches (listing lines 870, 873)
    // are missing from this copy.
869  if(di.GetSeq().IsLoc()) {
871  di.SetSeq().SetLiteral(*literal);
872  } else if(di.GetSeq().IsThis()) {
874  di.SetSeq().SetLiteral(*literal);
875  }
876 
877  //expand multipliers.
878  if(di.IsSetMultiplier()) {
879  if(di.GetMultiplier() < 0) {
880  NCBI_THROW(CArgException, eInvalidArg, "Encountered negative multiplier");
881  } else {
    // Repeat the IUPACna literal `multiplier` times and update its length.
882  CSeq_literal& literal = di.SetSeq().SetLiteral();
883  string str_kernel = literal.GetSeq_data().GetIupacna().Get();
884  literal.SetSeq_data().SetIupacna().Set("");
885  for(int i = 0; i < di.GetMultiplier(); i++) {
886  literal.SetSeq_data().SetIupacna().Set() += str_kernel;
887  }
888  literal.SetLength(literal.GetSeq_data().GetIupacna().Get().size());
889  if(literal.IsSetFuzz()) {
890  literal.SetFuzz().SetLim(CInt_fuzz::eLim_unk);
891  }
892 
893  di.ResetMultiplier();
    // NOTE(review): the body of this branch (listing line 895) is missing from this copy.
894  if(di.IsSetMultiplier_fuzz()) {
896  }
897  }
898  }
899 
900  //Convert ins-X-before-loc to 'replace seq@loc with X + seq@loc'
    // Actions other than morph and ins-before are left unchanged by this chain.
901  if(!di.IsSetAction() || di.GetAction() == CDelta_item::eAction_morph) {
902  ; //already done
903  } else if(di.GetAction() == CDelta_item::eAction_ins_before) {
904  di.ResetAction();
905  CRef<CSeq_literal> suffix_literal = x_GetLiteralAtLoc(v.GetLocation());
906  CRef<CSeq_literal> cat_literal = s_CatLiterals(di.GetSeq().GetLiteral(), *suffix_literal);
907  di.SetSeq().SetLiteral(*cat_literal);
908  }
909  }
910  }
911 }
912 
913 
914 
915 /*!
916  * Extend or truncate delins to specified location
917  * (truncate or attach suffixes/prefixes to seq-literals as necessary).
918  *
919  * Precondition:
920  * -variation must be a normalized delins (via ChangeToDelins)
921  * -loc must be a superset of variation's location.
922  */
// Sets are adjusted member by member. For an instance the sequence lying between `loc` and
// the variation's own location is attached to the delta literal as prefix and suffix, and
// the variation's location is then replaced by `loc`.
// Throws CArgException when `loc` is not an interval, is not a superset of the variation's
// location, the delta does not have exactly one item, or that item is not a literal.
// NOTE(review): the function header (listing line 923) and listing lines 935, 952, 954,
// 956-958 and 960 are missing from this copy.
924 {
925  if(!loc.IsInt()) {
926  NCBI_THROW(CArgException, eInvalidArg, "Expected Int location");
927  }
928 
929  if(v.GetData().IsSet()) {
930  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
931  AdjustDelinsToInterval(**it, loc);
932  }
933  } else if(v.GetData().IsInstance()) {
934  CVariation_inst& inst = v.SetData().SetInstance();
936 
    // Anything left after subtracting `loc` means the variation sticks out of `loc`.
937  CRef<CSeq_loc> sub_loc = v.GetLocation().Subtract(loc, 0, NULL, NULL);
938  if(sub_loc->Which() && sequence::GetLength(*sub_loc, NULL) > 0) {
939  NCBI_THROW(CArgException, eInvalidArg, "Location must be a superset of the variation's loc");
940  }
941 
    // Fixed: was "!size() == 1", which parses as "(!size()) == 1" and therefore only
    // rejected an empty delta, letting multi-element deltas through.
942  if(inst.GetDelta().size() != 1) {
943  NCBI_THROW(CArgException, eInvalidArg, "Expected single-element delta");
944  }
945 
946  CDelta_item& delta = *inst.SetDelta().front();
947 
948  if(!delta.IsSetSeq() || !delta.GetSeq().IsLiteral()) {
949  NCBI_THROW(CArgException, eInvalidArg, "Expected literal");
950  }
951 
    // NOTE(review): the declarations of `tmp_loc`, `prefix_loc` and `suffix_loc` and the
    // condition guarding the swap below are missing from this copy.
953  tmp_loc->SetInt().SetFrom(sequence::GetStart(v.GetLocation(), NULL, eExtreme_Positional));
955 
959 
961  swap(prefix_loc, suffix_loc);
962  }
963 
964  CRef<CSeq_literal> prefix_literal = x_GetLiteralAtLoc(*prefix_loc);
965  CRef<CSeq_literal> suffix_literal = x_GetLiteralAtLoc(*suffix_loc);
966 
    // new literal = prefix + existing literal + suffix
967  CRef<CSeq_literal> tmp_literal1 = s_CatLiterals(*prefix_literal, delta.SetSeq().SetLiteral());
968  CRef<CSeq_literal> tmp_literal2 = s_CatLiterals(*tmp_literal1, *suffix_literal);
969  delta.SetSeq().SetLiteral(*tmp_literal2);
970  v.SetLocation().Assign(loc);
971  }
972 }
973 
974 
976 {
977  /*
978  * Method:
979  * 1. Normalize variation into a delins form :
980  * location: what is being replaced; del := seq-literal(location)
981  * delta: what replaces the location; ins := seq-literal(delta)
982  * E.g. "ins 'AC' before loc" is expressed as "replace seq@loc with 'AC'+seq@loc".
983  * "del@loc" is expressed as "replace seq@loc with ''".
984  *
985  * 2. Throw if location not completely within CDS.
986  * In the future, might truncate to CDS location:
987  * 1. If the location crosses CDS edge (cds-start, or splice-junction):
988  * 1. If |ins| == 0 or |ins| == |del| - can truncate the variation to part covered by CDS
989  * 2. Else - bail, because we don't know base-for-base correspondence in ins vs del.
990  *
991  * 3. Extend the location and append suffix/prefix literal up to nearest codon boundaries.
992  * Note that the suffix and prefix for ins and del might be different. Account for the fact
993  * that if part of a codon is deleted, the bases from the downstream codon will replace it.
994  *
995  * 3. Translate modified ins and del literals and remap location to prot.
996  *
997  * 4. If translated sequences are the same -> silent variation.
998  * Else, truncate common prefixes and suffixes and adjust location accordingy
999  * (make sure to leave at least one pnt).
1000  *
1001  * 5. Attach frameshift based on (|ins|-|del|) % 3
1002  */
1003 
1004 
1005  bool verbose = false;
1006 
1007  if(verbose) NcbiCerr << "Original variation: " << MSerial_AsnText << nuc_variation_feat;
1008 
1009  if(!nuc_variation_feat.GetData().IsVariation()) {
1010  NCBI_THROW(CArgException, eInvalidArg, "Expected variation-feature");
1011  }
1012 
1013  const CVariation_ref& nuc_v = nuc_variation_feat.GetData().GetVariation();
1014 
1015  if(!nuc_v.GetData().IsInstance()) {
1016  NCBI_THROW(CArgException, eInvalidArg, "Expected variation.inst");
1017  }
1018 
1019  if(!nuc_v.GetData().GetInstance().GetDelta().size() == 1) {
1020  //can't process intronic, etc.
1021  NCBI_THROW(CArgException, eInvalidArg, "Expected single-element delta");
1022  }
1023 
1024  CRef<CSeq_loc_Mapper> nuc2prot_mapper;
1025  CRef<CSeq_loc_Mapper> prot2nuc_mapper;
1026 
1028  for(CFeat_CI ci(*m_scope, nuc_variation_feat.GetLocation(), sel); ci; ++ci) {
1029  nuc2prot_mapper.Reset(new CSeq_loc_Mapper(ci->GetMappedFeature(), CSeq_loc_Mapper::eLocationToProduct, m_scope));
1030  prot2nuc_mapper.Reset(new CSeq_loc_Mapper(ci->GetMappedFeature(), CSeq_loc_Mapper::eProductToLocation, m_scope));
1031  break;
1032  }
1033 
1034  if(!prot2nuc_mapper) {
1035  return CRef<CSeq_feat>(NULL); //not in cds
1036  }
1037 
1039  v->Assign(nuc_variation_feat.GetData().GetVariation());
1040  if(!v->IsSetLocation()) {
1041  v->SetLocation().Assign(nuc_variation_feat.GetLocation());
1042  if(!v->GetLocation().GetId()) {
1043  NCBI_THROW(CArgException, eInvalidArg, "Expected variation's location to have unique seq-id");
1044  }
1045  }
1046 
1047  if(verbose) NcbiCerr << "Original variation: " << MSerial_AsnText << *v;
1048 
1049  ChangeToDelins(*v);
1050 
1051  if(verbose) NcbiCerr << "Normalized variation: " << MSerial_AsnText << *v;
1052 
1053 
1054  const CDelta_item& delta = *v->GetData().GetInstance().GetDelta().front();
1055 
1056  bool have_frameshift = ((long)sequence::GetLength(v->GetLocation(), NULL) - (long)delta.GetSeq().GetLiteral().GetLength()) % 3 != 0;
1057 
1058  CRef<CSeq_loc> prot_loc = nuc2prot_mapper->Map(v->GetLocation());
1059  CRef<CSeq_loc> codons_loc = prot2nuc_mapper->Map(*prot_loc);
1060  codons_loc->SetId(*v->GetLocation().GetId()); //restore the original id, as mapping forward and back may have changed the type
1061 
1062  if(verbose) NcbiCerr << "Prot-loc: " << MSerial_AsnText << *prot_loc;
1063 
1064  if(verbose) NcbiCerr << "Codons-loc: " << MSerial_AsnText << *codons_loc;
1065 
1066  //extend codons-loc by two bases downstream, since a part of the next
1067  //codon may become part of the variation (e.g. 1-base deletion in a codon
1068  //results in first base of the downstream codon becoming part of modified one)
1069  //If, on the other hand, the downstream codon does not participate, there's
1070  //only two bases if it, so it won't get translated.
1071  SFlankLocs flocs = CreateFlankLocs(*codons_loc, 2);
1072  CRef<CSeq_loc> codons_loc_ext = sequence::Seq_loc_Add(*codons_loc, *flocs.downstream, CSeq_loc::fSortAndMerge_All, NULL);
1073 
1074  if(verbose) NcbiCerr << "Codons-loc-ext: " << MSerial_AsnText << *codons_loc_ext;
1075 
1076  AdjustDelinsToInterval(*v, *codons_loc_ext);
1077 
1078  CSeq_literal& literal = v->SetData().SetInstance().SetDelta().front()->SetSeq().SetLiteral();
1079  int prot_literal_len = literal.GetLength() / 3;
1080  literal.SetLength(prot_literal_len * 3); //divide by 3 and multiply by 3 to truncate to codon boundary.
1081  literal.SetSeq_data().SetIupacna().Set().resize(literal.GetLength());
1082 
1083  if(verbose) NcbiCerr << "Adjusted variation: " << MSerial_AsnText << *v;
1084 
1085 
1086  string prot_delta_str("");
1087  CSeqTranslator translator;
1088  translator.Translate(
1089  delta.GetSeq().GetLiteral().GetSeq_data().GetIupacna(),
1090  prot_delta_str,
1092  prot_delta_str.resize(delta.GetSeq().GetLiteral().GetLength() / 3); //Translator may optimistically translate last partial codon
1093  NStr::ReplaceInPlace(prot_delta_str, "*", "X"); //Conversion to IUPAC produces "X", but Translate produces "*"
1094 
1095  literal.SetLength(prot_delta_str.size());
1096  literal.SetSeq_data().SetNcbieaa().Set(prot_delta_str);
1097 
1098 
1099  string prot_ref_str("");
1100  CSeqVector nuc_ref_seqvector(v->GetLocation(), *m_scope, CBioseq_Handle::eCoding_Iupac);
1101  translator.Translate(
1102  nuc_ref_seqvector,
1103  prot_ref_str,
1105  prot_ref_str.resize(sequence::GetLength(v->GetLocation(), NULL) / 3);
1106  NStr::ReplaceInPlace(prot_ref_str, "*", "X");
1107 
1108 
1109  v->SetVariant_prop().SetEffect(0);
1110 
1111  if(literal.GetLength() == 0) {
1112  v->SetData().SetInstance().SetType(CVariation_inst::eType_del);
1113  v->SetData().SetInstance().SetDelta().clear();
1114  } else if(prot_delta_str.size() != prot_ref_str.size()) {
1115  v->SetData().SetInstance().SetType(CVariation_inst::eType_prot_other);
1116  } else {
1117  //sequence of the same length
1118  if(prot_ref_str == prot_delta_str) {
1119  v->SetData().SetInstance().SetType(CVariation_inst::eType_prot_silent);
1120  } else if(NStr::Find(prot_delta_str, "X") != NPOS) {
1121  v->SetData().SetInstance().SetType(CVariation_inst::eType_prot_nonsense);
1122  } else {
1123  v->SetData().SetInstance().SetType(CVariation_inst::eType_prot_missense);
1124  }
1125 
1126  for(size_t i = 0; i < prot_ref_str.size() && i < prot_delta_str.size(); i++) {
1127  if(prot_ref_str[i] == prot_delta_str[i]) {
1128  v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_synonymous;
1129  } else if(prot_ref_str[i] == 'X') {
1130  v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_stop_loss;
1131  } else if(prot_delta_str[i] == 'X') {
1132  v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_stop_gain;
1133  } else {
1134  v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_missense;
1135  }
1136  }
1137  }
1138 
1139  CRef<CSeq_feat> prot_variation_feat(new CSeq_feat);
1140  prot_variation_feat->SetLocation(*prot_loc);
1141  prot_variation_feat->SetData().SetVariation(*v);
1142  v->ResetLocation();
1143 
1144  if(have_frameshift) {
1145  v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_frameshift;
1146  }
1147 
1149  uo->SetType().SetStr("HGVS");
1150  uo->AddField("reference_sequence", prot_ref_str);
1151  v->SetExt(*uo);
1152 
1153 
1154  if(!v->IsSetVariant_prop() || !v->GetVariant_prop().IsSetVersion()) {
1155  v->SetVariant_prop().SetVersion(m_variant_properties_schema_version);
1156  }
1157 
1158  if(v->IsSetVariant_prop() && v->GetVariant_prop().GetEffect() == 0) {
1159  v->SetVariant_prop().ResetEffect();
1160  }
1161 
1162  if(verbose) NcbiCerr << "protein variation:" << MSerial_AsnText << *v;
1163 
1164  if(verbose) NcbiCerr << "Done with protein variation\n";
1165 
1166  return prot_variation_feat;
1167 }
1168 
1169 
1171 {
  // Walks a variation-set and creates or overwrites a "reference observation"
  // instance in it; nested sets are processed recursively.
  //
  // The effective location is v's own location when set, otherwise
  // parent_location.
  // Returns: `ret`, which starts true, is AND-ed with the result of every
  //   recursive call and is set to false in the else-branch below.
  // Throws: CArgException(eInvalidArg) when v's data is not a set.
  //
  // NOTE: several statements (original lines 1180, 1191, 1197, 1212, 1214) are
  // absent from this listing, so the brace structure shown here is incomplete.
1172  const CSeq_loc& loc = v.IsSetLocation() ? v.GetLocation() : parent_location;
1173 
1174  bool ret = true; //True iff we have not encountered a case with offsets for which a reference loc could not be computed
1175 
1176  if(!v.GetData().IsSet()) {
1177  NCBI_THROW(CArgException, eInvalidArg, "Expected variation-set");
1178  }
1179 
1181  bool have_offsets = false;
1182 
1183  CRef<CVariation_ref> observation_vr;
1184 
1185  //try to find existing reference-observation to overwrite.
1186  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
1187  CVariation_ref& vr2 = **it;
1188  if(vr2.GetData().IsSet()) {
  // Nested set: recurse, passing the effective location down as the parent location.
1189  ret = ret && SetReferenceSequence(**it, loc);
1190  } else if(vr2.GetData().IsInstance()) {
  // Remember whether any delta item is an offset action.
1192  const CDelta_item& di = **it2;
1193  have_offsets = have_offsets || di.IsSetAction() && di.GetAction() == CDelta_item::eAction_offset;
1194  }
1195 
  // Second half of this condition is not visible in the listing.
1196  if(vr2.GetData().GetInstance().IsSetObservation()
1198  {
1199  observation_vr.Reset(&vr2);
1200  }
1201  }
1202  }
1203 
1204  if(!have_offsets) {
  // No existing observation found above: append a fresh member to the set.
1205  if(!observation_vr) {
1206  observation_vr.Reset(new CVariation_ref);
1207  v.SetData().SetSet().SetVariations().push_back(observation_vr);
1208  }
1209  observation_vr->SetData().SetInstance().SetObservation(CVariation_inst::eObservation_reference);
1210  observation_vr->SetData().SetInstance().SetType(CVariation_inst::eType_identity);
1211 
1213 
  // Replace whatever delta the observation had with the single item `di` holding `literal`
  // (the construction of both is on lines missing from this listing).
1215  di->SetSeq().SetLiteral(*literal);
1216  observation_vr->SetData().SetInstance().SetDelta().clear();
1217  observation_vr->SetData().SetInstance().SetDelta().push_back(di);
1218  } else {
1219  ret = false;
1220  }
1221  } else {
1222  NON_CONST_ITERATE(CVariation_ref::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) {
1223  CVariation_ref& vr2 = **it;
1224  if(vr2.GetData().IsSet()) {
1225  ret = ret && SetReferenceSequence(**it, loc);
1226  }
1227  }
1228  }
1229 
1230  return ret;
1231 }
1232 
1233 
1234 
1235 ////////////////////////////////////////////////////////////////////////////////
1236 //
1237 // SO-terms calculations
1238 //
1239 ////////////////////////////////////////////////////////////////////////////////
1240 
1241 
1243 {
  // Appends Sequence Ontology term enumerators to `terms`.
  // NOTE: the condition guarding each push_back (one per block below) is on
  // lines that are absent from this listing; only the appended term is visible.
  // The first group covers location-type terms, the second group (after the
  // blank line) covers coding-effect terms.
1245  terms.push_back(eSO_2KB_upstream_variant);
1246  }
1248  terms.push_back(eSO_500B_downstream_variant);
1249  }
1251  terms.push_back(eSO_splice_donor_variant);
1252  }
1254  terms.push_back(eSO_splice_acceptor_variant);
1255  }
1257  terms.push_back(eSO_intron_variant);
1258  }
1260  terms.push_back(eSO_5_prime_UTR_variant);
1261  }
1263  terms.push_back(eSO_3_prime_UTR_variant);
1264  }
1266  terms.push_back(eSO_nc_transcript_variant);
1267  }
1268 
1270  terms.push_back(eSO_frameshift_variant);
1271  }
1273  terms.push_back(eSO_non_synonymous_codon);
1274  }
1276  terms.push_back(eSO_stop_gained);
1277  }
1279  terms.push_back(eSO_stop_lost);
1280  }
1282  terms.push_back(eSO_synonymous_codon);
1283  }
1284 }
1285 
1287 {
  // Maps an SO-term enumerator to its textual name (the enumerator name
  // without the "eSO_" prefix). Any value not listed yields "other_variant".
1288  if(term == eSO_2KB_upstream_variant) {
1289  return "2KB_upstream_variant";
1290  } else if(term == eSO_500B_downstream_variant) {
1291  return "500B_downstream_variant";
1292  } else if(term == eSO_splice_donor_variant) {
1293  return "splice_donor_variant";
1294  } else if(term == eSO_splice_acceptor_variant) {
  // Was "splice_acceptor_varian" (missing final 't').
1295  return "splice_acceptor_variant";
1296  } else if(term == eSO_intron_variant) {
1297  return "intron_variant";
1298  } else if(term == eSO_5_prime_UTR_variant) {
1299  return "5_prime_UTR_variant";
1300  } else if(term == eSO_3_prime_UTR_variant) {
1301  return "3_prime_UTR_variant";
1302  } else if(term == eSO_coding_sequence_variant) {
1303  return "coding_sequence_variant";
1304  } else if(term == eSO_nc_transcript_variant) {
1305  return "nc_transcript_variant";
1306  } else if(term == eSO_synonymous_codon) {
1307  return "synonymous_codon";
1308  } else if(term == eSO_non_synonymous_codon) {
1309  return "non_synonymous_codon";
1310  } else if(term == eSO_stop_gained) {
1311  return "stop_gained";
1312  } else if(term == eSO_stop_lost) {
1313  return "stop_lost";
1314  } else if(term == eSO_frameshift_variant) {
1315  return "frameshift_variant";
1316  } else {
1317  return "other_variant";
1318  }
1319 };
1320 
1321 /// Calculate location-specific categories
/// Fills the gene-location bitmask of `prop` for the location `orig_loc` by
/// inspecting features that overlap it in m_scope. The bitmask is zeroed first
/// if unset and reset again at the end if nothing was recorded.
/// NOTE: the signature and many statements (selector set-up and most of the
/// "|=" right-hand sides) are absent from this listing.
1323 {
1324  if(!prop.IsSetVersion()) {
1326  }
1327 
1328  if(!prop.IsSetGene_location()) {
1329  //need to zero-out the bitmask, otherwise in debug mode it will be preset to a magic value,
1330  //and then modifying it with "|=" will produce garbage.
1331  prop.SetGene_location(0);
1332  }
1333 
  // Work on a private copy so that the strand of orig_loc is never modified.
1334  CRef<CSeq_loc> loc(new CSeq_loc);
1335  loc->Assign(orig_loc);
1336  loc->SetStrand(eNa_strand_plus); //will set to plus temporarily to create flanks such that upstream=high and downstream=low
1337  SFlankLocs flanks = CreateFlankLocs(*loc, 2); //will use 2-nt flanks to check if inside a splice-site
1338 
1339  //Set strand to both on our query locs because we need to consider annotation on both strands
1340  loc->SetStrand(eNa_strand_both);
1343 
1344  SAnnotSelector sel;
1348  sel.SetOverlapTotalRange();
1349  sel.SetIgnoreStrand();
1350 
1351  //the following will indicate total-range overlap
1352  bool overlaps_gene_range = false;
1353  bool overlaps_rna_range = false;
1354  bool overlaps_cds_range = false;
1355 
1356  CBioseq_Handle bsh = m_scope->GetBioseqHandle(*loc);
1357 
1358 
1359  for(CFeat_CI ci(*m_scope, *loc, sel); ci; ++ci) {
  // NOTE(review): there is no "else" between the gene test and the RNA test,
  // so the RNA/CDS chain is also evaluated for gene features - confirm intended.
1360  if(ci->GetData().IsGene()) {
1361  overlaps_gene_range = true;
1362  } if(ci->GetData().IsRna()) {
1363  overlaps_rna_range = true;
1364  } else if(ci->GetData().IsCdregion()) {
1365  overlaps_cds_range = true;
1366  }
1367 
  // The selector matches on total range; this is the true interval-level overlap test.
1368  bool have_overlap = sequence::Compare(ci->GetLocation(), *loc, m_scope) != sequence::eNoOverlap;
1369 
1370  if((ci->GetData().IsRna() || ci->GetData().IsCdregion()) && !have_overlap)
1371  {
1372  //within range, but no overlap - must be intronic or splice-site.
1373 
1374  bool is_minus_strand = (eNa_strand_minus == sequence::GetStrand(ci->GetLocation(), NULL));
1375 
  // The values chosen by the two ternaries below are on lines missing from this listing.
1376  if(sequence::Compare(ci->GetLocation(), *flanks.upstream, m_scope) != sequence::eNoOverlap) {
1377  prop.SetGene_location() |= is_minus_strand ?
1380  } else if(sequence::Compare(ci->GetLocation(), *flanks.downstream, m_scope) != sequence::eNoOverlap) {
1381  prop.SetGene_location() |= is_minus_strand ?
1384 
1385  } else {
1387  }
1388  }
1389 
1390  if(have_overlap && ci->GetData().IsRna() && ci->GetData().GetSubtype() != CSeqFeatData::eSubtype_mRNA) {
1392  }
1393 
1394  if(ci->GetData().IsCdregion()) {
1395 
1396  //check if in start/stop codons. This happens iff the query location, expanded by 2nt, overlaps
1397  //non-partial cds-start/cds-stop
1398 
1399  if( !ci->GetLocation().IsPartialStop(eExtreme_Biological)
1400  && !ci->GetLocation().IsTruncatedStop(eExtreme_Biological)
1401  && ci->GetLocation().GetStop(eExtreme_Biological) + 2 >= loc->GetStart(eExtreme_Positional)
1402  && ci->GetLocation().GetStop(eExtreme_Biological) <= loc->GetStop(eExtreme_Positional) + 2)
1403  {
1405  }
1406 
1407  if( !ci->GetLocation().IsPartialStart(eExtreme_Biological)
1408  && !ci->GetLocation().IsTruncatedStart(eExtreme_Biological)
1409  && ci->GetLocation().GetStart(eExtreme_Biological) + 2 >= loc->GetStart(eExtreme_Positional)
1410  && ci->GetLocation().GetStart(eExtreme_Biological) <= loc->GetStop(eExtreme_Positional) + 2)
1411  {
1413  }
1414  }
1415  }
1416 
1417  //We checked for noncoding RNA cases above, but if the location is on a transcript there will be
1418  //no mRNA feature - we need to check either by mol subtype or by looking for annotated CDS.
1419  if(bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
1420  bool found_any_cds = false;
1421  for(CFeat_CI ci(bsh, sel); ci; ++ci) {
1422  if(ci->GetData().IsCdregion()) {
1423  found_any_cds = true;
1424  break;
1425  }
1426  }
1427  if(!found_any_cds) {
1429  }
1430  }
1431 
1433  && !overlaps_rna_range
1434  && !overlaps_cds_range
1435  && overlaps_gene_range)
1436  {
1438  }
1439 
1440  //Check if in UTR:
1441  if((bsh.GetBioseqMolType() == CSeq_inst::eMol_rna || overlaps_rna_range) && !overlaps_cds_range) {
1442  for(CFeat_CI ci(*m_scope, *loc, sel); ci; ++ci) {
1444 
  // On an RNA bioseq use the CDS feature itself; otherwise look up the best CDS for the RNA feature.
1445  if(bsh.GetBioseqMolType() == CSeq_inst::eMol_rna && ci->GetData().IsCdregion()) {
1446  cds.Reset(&ci->GetMappedFeature());
1447  } else if(ci->GetData().IsRna()) {
1448  cds = sequence::GetBestCdsForMrna(ci->GetMappedFeature(), *m_scope);
1449  }
1450 
1451  if(cds) {
1452  bool is_minus_strand = (eNa_strand_minus == sequence::GetStrand(cds->GetLocation(), NULL));
1453 
1455  prop.SetGene_location() |= is_minus_strand ?
1458  }
1459 
1461  prop.SetGene_location() |= is_minus_strand ?
1464  }
1465  }
1466  }
1467  }
1468 
1469 
1470  //check if in neighborhood
1471  if(!overlaps_rna_range
1472  && !overlaps_cds_range
1473  && !overlaps_gene_range
1475  {
  // Search for features within 2000 bases on either side of the query location.
1476  SFlankLocs flanks2k = CreateFlankLocs(*loc, 2000);
1477  CRef<CSeq_loc> neighborhood_loc = sequence::Seq_loc_Add(*flanks2k.upstream, *flanks2k.downstream, CSeq_loc::fMerge_SingleRange, NULL);
1478  neighborhood_loc->SetStrand(eNa_strand_both);
1479 
1480  SAnnotSelector gene_sel;
1482  gene_sel.SetIgnoreStrand();
1483 
1484  bool found_in_neighborhood = false;
1485  for(CFeat_CI ci(*m_scope, *neighborhood_loc, gene_sel); ci; ++ci) {
1486  const CSeq_loc& gene_loc = ci->GetLocation();
1487  SFlankLocs flanks500 = CreateFlankLocs(gene_loc, 500);
1488  SFlankLocs flanks2000 = CreateFlankLocs(gene_loc, 2000);
1489 
1490  if(sequence::Compare(*loc, *flanks500.downstream, m_scope) != sequence::eNoOverlap) {
1491  found_in_neighborhood = true;
1493  }
1494 
1495  if(sequence::Compare(*loc, *flanks2000.upstream, m_scope) != sequence::eNoOverlap) {
1496  found_in_neighborhood = true;
1498  }
1499  }
1500 
1501  //if there's any gene feature on the bioseq, we must be in an intergenic region;
1502  if(!found_in_neighborhood) {
1503  for(CFeat_CI ci(m_scope->GetBioseqHandle(*loc), gene_sel); ci; ++ci) {
1505  break;
1506  }
1507  }
1508  }
1509 
  // Nothing was recorded: leave the field unset rather than storing an empty mask.
1510  if(prop.GetGene_location() == 0) {
1511  prop.ResetGene_location();
1512  }
1513 }
1514 
1515 
1516 
1517 
1519 {
  // Classifies an offset-based (intronic-style) position relative to `loc` on
  // the bioseq `bsh` and records the result in p's gene-location bitmask.
  // Three cases are distinguished: past the last base of the sequence with a
  // positive offset, before the first base with a negative offset, and
  // everything else (within 2 bases of the anchor vs. further away).
  // NOTE: the statement executed in each branch is on lines that are absent
  // from this listing, so the exact flags set cannot be stated here.
1520  if(!p.IsSetGene_location()) {
1521  //need to zero-out the bitmask, otherwise in debug mode it will be preset to a magic value,
1522  //and then modifying it with "|=" will produce garbage.
1523  p.SetGene_location(0);
1524  }
1525 
1526  if(loc.GetStop(eExtreme_Positional) + 1 >= bsh.GetInst_Length() && offset > 0) {
1527  //at the 3'-end; check if near-gene or intergenic
1528  if(offset <= 500) {
1530  } else {
1532  }
1533  } else if(loc.GetStart(eExtreme_Positional) == 0 && offset < 0) {
1534  //at the 5'-end; check if near-gene or intergenic
1535  if(offset >= -2000) {
1537  } else {
1539  }
1540  } else {
1541  //intronic or splice
1542  if(offset < 0 && offset >= -2) {
1544  } else if(offset > 0 && offset <= 2) {
1546  } else {
1548  }
1549  }
1550 
  // Nothing was recorded: leave the field unset rather than storing an empty mask.
1551  if(p.GetGene_location() == 0) {
1552  p.ResetGene_location();
1553  }
1554 }
1555 
1557 {
  // Computes variant properties `p` for one variation instance `vi` at `loc`:
  // location-specific terms for offset-based deltas on RNA bioseqs, and the
  // coding effect inherited from the protein-level variation.
  // NOTE: two statements (original lines 1559 and 1561, the latter presumably
  // where `bsh` is obtained) are absent from this listing.
1558  if(!p.IsSetVersion()) {
1560  }
1562 
1563  //if variation is cDNA/intronic, we need to calculate location-specific terms as well (intron, splice-site, etc.)
1564 
1565  if(bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
  // NOTE(review): front()/back() are taken without an emptiness check - confirm
  // that vi always has at least one delta item on this path.
1566  const CDelta_item& first_delta = *vi.GetDelta().front();
1567  const CDelta_item& last_delta = *vi.GetDelta().back();
1568 
  // NOTE(review): the literal's length is passed as the signed `offset`
  // argument; confirm how upstream (negative) offsets are meant to be conveyed.
1569  if(first_delta.IsSetAction()
1570  && first_delta.GetAction() == CDelta_item::eAction_offset
1571  && first_delta.GetSeq().IsLiteral())
1572  {
1573  x_SetVariantPropertiesForIntronic(p, first_delta.GetSeq().GetLiteral().GetLength(), loc, bsh);
1574  }
1575  if(last_delta.IsSetAction()
1576  && last_delta.GetAction() == CDelta_item::eAction_offset
1577  && last_delta.GetSeq().IsLiteral())
1578  {
1579  x_SetVariantPropertiesForIntronic(p, last_delta.GetSeq().GetLiteral().GetLength(), loc, bsh);
1580  }
1581  }
1582 
1583 
1584  //Calculate protein variation and inherit the prot-variation's properties
1585  if(bsh.IsAa()) {
1586  ; //nothing to do here
1587  } else if(vi.GetDelta().size() <= 1) { //can only process simple deltas
  // Wrap the instance in a temporary feature so that PrecursorToProt can be reused.
1588  CRef<CSeq_feat> nuc_vf(new CSeq_feat);
1589  nuc_vf->SetLocation().Assign(loc);
1590  nuc_vf->SetData().SetVariation().SetData().SetInstance().Assign(vi);
1591  CRef<CSeq_feat> prot_variation = this->PrecursorToProt(*nuc_vf);
1592 
  // PrecursorToProt may return a null CRef; copy the effect only when it is present.
1593  if(prot_variation
1594  && prot_variation->GetData().GetVariation().GetVariant_prop().IsSetEffect()
1595  ) {
1596  p.SetEffect() = prot_variation->GetData().GetVariation().GetVariant_prop().GetEffect();
1597  }
1598  }
1599 }
1600 
1601 
1603 {
  // Visits every CVariation_ref reachable from `vr` in two passes.
  // NOTE: the signature and most statements of this function are absent from
  // this listing; only the iteration skeleton is visible, so the work done per
  // instance and per located variation cannot be stated here.
1605 
1606  bool found_inst = false;
  // Pass 1: consider only variation-refs whose data is an instance.
1607  for(CTypeIterator<CVariation_ref> it(Begin(vr)); it; ++it) {
1608  CVariation_ref& vr2 = *it;
1609  if(!vr2.GetData().IsInstance()) {
1610  continue;
1611  }
1612  CVariation_inst& inst = vr2.SetData().SetInstance();
1614  continue;
1615  }
1616 
1617  found_inst = true;
1619  }
1620 
1622 
  // Pass 2: act on variation-refs that carry their own location.
1623  for(CTypeIterator<CVariation_ref> it(Begin(vr)); it; ++it) {
1624  CVariation_ref& vr2 = *it;
1625  if(vr2.IsSetLocation()) {
1627  }
1628  }
1629 }
1630 
1631 };
1632 
1634 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CArgException –.
Definition: ncbiargs.hpp:120
CBioseq_Handle –.
CFeat_CI –.
Definition: feat_ci.hpp:64
CSeqVector –.
Definition: seq_vector.hpp:65
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_loc_Mapper –.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
static TSeqPos Append(CSeq_data *out_seq, const CSeq_data &in_seq1, TSeqPos uBeginIdx1, TSeqPos uLength1, const CSeq_data &in_seq2, TSeqPos uBeginIdx2, TSeqPos uLength2)
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
CVariation_inst –.
const TLocation & GetLocation(void) const
void SetLocation(TLocation &value)
bool IsSetLocation(void) const
NOTE: THESE ARE GOING AWAY SOON!!
TExt_locs & SetExt_locs(void)
void ResetLocation(void)
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
void ChangeToDelins(CVariation_ref &v)
static const int m_variant_properties_schema_version
static void s_AddIntronicOffsets(CVariation_ref &v, const CSpliced_seg &ss, const CSeq_loc &parent_variation_loc)
void AdjustDelinsToInterval(CVariation_ref &delins_variation, const CSeq_loc &int_loc)
CRef< CSeq_literal > x_GetLiteralAtLoc(const CSeq_loc &loc)
void s_CalcPrecursorVariationCodon(const string &codon_from, const string &prot_to, vector< string > &codons_to)
static void s_FactorOutLocsInPlace(CVariation_ref &v)
CRef< CSeq_feat > ProtToPrecursor(const CSeq_feat &prot_variation_feat)
Convert protein-variation (single-AA missense/nonsense) to nuc-variation on the parent.
CRef< CSeq_feat > PrecursorToProt(const CSeq_feat &prot_variation_feat)
Convert to nuc-variation on the parent to protein-variation (single-AA missense/nonsense) Only a subs...
static void s_PropagateLocsInPlace(CVariation_ref &v)
Propagate parent variation location to the members of set, unless they have their own location set.
CRef< CSeq_feat > Remap(const CSeq_feat &variation_feat, const CSeq_align &aln)
static void s_UntranslateProt(const string &prot_str, vector< string > &codons)
static void s_Remap(CVariation_ref &vr, CSeq_loc_Mapper &mapper, const CSeq_loc &parent_variation_loc)
static string s_CollapseAmbiguities(const vector< string > &seqs)
static CRef< CSeq_literal > s_CatLiterals(const CSeq_literal &a, const CSeq_literal &b)
void x_SetVariantPropertiesForIntronic(CVariantProperties &p, int offset, const CSeq_loc &loc, CBioseq_Handle &bsh)
ETestStatus CheckExonBoundary(const CSeq_feat &variation_feat, const CSeq_align &aln)
bool SetReferenceSequence(CVariation_ref &vr, const CSeq_loc &location)
void AsSOTerms(const CVariantProperties &p, TSOTerms &terms)
void SetVariantProperties(CVariantProperties &prop, const CSeq_loc &orig_loc)
Set location-specific variant-properties.
static string AsString(ESOTerm term)
static void s_ResolveIntronicOffsets(CVariation_ref &v, const CSeq_loc &parent_variation_loc)
SFlankLocs CreateFlankLocs(const CSeq_loc &loc, TSeqPos len)
void x_ProtToPrecursor(CVariation_ref &v)
static size_t s_CountMatches(const string &a, const string &b)
void x_SetVariantProperties(CVariantProperties &p, const CVariation_inst &vi, const CSeq_loc &loc)
static ulg bb
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
int offset
Definition: replacements.h:160
const TResidue codons[4][4]
Definition: gnomon_seq.cpp:76
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const CVect2< U > & v2
Definition: globals.hpp:440
@ eUnknown
Definition: app_popup.hpp:72
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
CRef< CSeq_loc > Merge(TOpFlags flags, ISynonymMapper *syn_mapper) const
All functions create and return a new seq-loc object.
Definition: Seq_loc.cpp:5037
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
CRef< CSeq_loc > Subtract(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper, ILengthGetter *len_getter) const
Subtract seq-loc from this, merge/sort resulting ranges depending on flags.
Definition: Seq_loc.cpp:5087
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5196
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ fSortAndMerge_All
Definition: Seq_loc.hpp:334
@ fMerge_SingleRange
Definition: Seq_loc.hpp:332
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CMappedFeat GetBestCdsForMrna(const CMappedFeat &mrna_feat, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3360
TSeqPos GetStop(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the stop of the location.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
TSeqPos GetStart(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the start of the location.
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
CRef< CSeq_loc > Seq_loc_Subtract(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Subtract the second seq-loc from the first one.
CRef< CSeq_loc > Seq_loc_Merge(const CSeq_loc &loc, CSeq_loc::TOpFlags flags, CScope *scope)
Merge ranges in the seq-loc.
CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Add two seq-locs.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
@ eNoOverlap
CSeq_locs do not overlap or abut.
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
@ fIs5PrimePartial
= 0x4 Translate first codon even if not start codon (because sequence is 5' partial)
Definition: sequence.hpp:984
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
@ eProductToLocation
Map from the feature's product to location.
@ eLocationToProduct
Map from the feature's location to product.
bool IsAa(void) const
TInst_Length GetInst_Length(void) const
TMol GetBioseqMolType(void) const
Get some values from core:
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
SAnnotSelector & SetOverlapTotalRange(void)
Check overlapping only of total ranges.
SAnnotSelector & IncludeFeatType(TFeatType type)
Include feature type in the search.
SAnnotSelector & SetIgnoreStrand(bool value=true)
Ignore strand when testing for range overlap.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
bool IsProtein(void) const
Definition: seq_vector.hpp:350
const_iterator begin(void) const
Definition: seq_vector.hpp:298
bool IsNucleotide(void) const
Definition: seq_vector.hpp:357
const_iterator end(void) const
Definition: seq_vector.hpp:305
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NcbiCerr
Definition: ncbistre.hpp:544
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3314
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
void SetType(TType &value)
Assign a value to Type data member.
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
const TProtpos & GetProtpos(void) const
Get the variant data.
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
TAmin GetAmin(void) const
Get the Amin member data.
Definition: Prot_pos_.hpp:220
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
bool IsNucpos(void) const
Check if variant Nucpos is selected.
TNucpos GetNucpos(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
bool IsVariation(void) const
Check if variant Variation is selected.
const TVariation & GetVariation(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_loc_.hpp:475
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
bool IsPnt(void) const
Check if variant Pnt is selected.
Definition: Seq_loc_.hpp:540
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
const TIupacaa & GetIupacaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:530
void SetLength(TLength value)
Assign a value to Length data member.
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
TLength GetLength(void) const
Get the Length member data.
void SetFuzz(TFuzz &value)
Assign a value to Fuzz data member.
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
const TInstance & GetInstance(void) const
Get the variant data.
const TVariant_prop & GetVariant_prop(void) const
Get the Variant_prop member data.
TAction GetAction(void) const
Get the Action member data.
void ResetGene_location(void)
Reset Gene_location data member.
list< CRef< CVariation_ref > > TVariations
TObservation GetObservation(void) const
Get the Observation member data.
const TSet & GetSet(void) const
Get the variant data.
TType GetType(void) const
Get the Type member data.
bool IsSetSeq(void) const
Check if a value has been assigned to Seq data member.
bool IsSetAction(void) const
Check if a value has been assigned to Action data member.
const TLoc & GetLoc(void) const
Get the variant data.
TEffect GetEffect(void) const
Get the Effect member data.
void SetObservation(TObservation value)
Assign a value to Observation data member.
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
const TDelta & GetDelta(void) const
Get the Delta member data.
void SetVersion(TVersion value)
Assign a value to Version data member.
void SetMultiplier_fuzz(TMultiplier_fuzz &value)
Assign a value to Multiplier_fuzz data member.
void SetData(TData &value)
Assign a value to Data data member.
const TData & GetData(void) const
Get the Data member data.
const TSeq & GetSeq(void) const
Get the Seq member data.
bool IsInstance(void) const
Check if variant Instance is selected.
const TLiteral & GetLiteral(void) const
Get the variant data.
void SetVariant_prop(TVariant_prop &value)
Assign a value to Variant_prop data member.
void ResetAction(void)
Reset Action data member.
bool IsSet(void) const
Check if variant Set is selected.
void SetType(TType value)
Assign a value to Type data member.
void SetSeq(TSeq &value)
Assign a value to Seq data member.
TMultiplier GetMultiplier(void) const
Get the Multiplier member data.
bool IsSetMultiplier_fuzz(void) const
Check if a value has been assigned to Multiplier_fuzz data member.
list< CRef< CDelta_item > > TDelta
bool IsLiteral(void) const
Check if variant Literal is selected.
void SetGene_location(TGene_location value)
Assign a value to Gene_location data member.
bool IsSetMultiplier(void) const
Multiplier allows representing a tandem, e.g. ATATAT as AT*3.
void ResetMultiplier(void)
Reset Multiplier data member.
bool IsSetObservation(void) const
Check if a value has been assigned to Observation data member.
bool IsSetEffect(void) const
Check if a value has been assigned to Effect data member.
TDelta & SetDelta(void)
Assign a value to Delta data member.
void SetEffect(TEffect value)
Assign a value to Effect data member.
bool IsThis(void) const
Check if variant This is selected.
TGene_location GetGene_location(void) const
Get the Gene_location member data.
bool IsSetGene_location(void) const
Check if a value has been assigned to Gene_location data member.
bool IsLoc(void) const
Check if variant Loc is selected.
@ eType_snv
delta=[morph of length 1] NOTE: this is snV not snP; the latter requires frequency-based validation t...
@ eType_mnp
delta=[morph of length >1]
@ eType_delins
delta=[del, ins]
@ eType_prot_nonsense
delta=[del]; variation-location is the tail of the protein being truncated
@ eType_prot_silent
delta=[morph of length 1, same AA as at variation-location]
@ eType_prot_missense
delta=[morph of length 1]
@ eEffect_stop_gain
reference codon is not stop codon, but the snp variant allele changes the codon to a terminating codo...
@ eEffect_missense
one allele in the set changes protein peptide (0x4)
@ eEffect_nonsense
one allele in the set changes to STOP codon (TER). (0x2)
@ eEffect_stop_loss
reverse of STOP-GAIN: reference codon is a stop codon, but a snp variant allele changes the codon to ...
@ eEffect_synonymous
one allele in the set does not change the encoded amino acid (0x1)
@ eEffect_frameshift
one allele in the set changes all downstream amino acids (0x8)
@ eGene_location_in_start_codon
the variant is observed in a start codon (0x100)
@ eGene_location_acceptor
In acceptor splice-site (0x20)
@ eGene_location_near_gene_5
Within 2kb of the 5' end of a gene feature.
@ eGene_location_near_gene_3
Within 0.5kb of the 3' end of a gene feature.
@ eGene_location_utr_3
In 3' UTR (0x80)
@ eGene_location_in_gene
Sequence intervals covered by a gene ID but not having an aligned transcript (0x01)
@ eGene_location_utr_5
In 5' UTR (0x40)
@ eGene_location_intron
In Intron (0x08)
@ eGene_location_intergenic
variant located between genes (0x400)
@ eGene_location_donor
In donor splice-site (0x10)
@ eGene_location_in_stop_codon
the variant is observed in a stop codon (0x200)
@ eGene_location_conserved_noncoding
variant is located in a conserved non-coding region (0x800)
@ eAction_offset
go downstream by distance specified by multiplier (upstream if < 0), in genomic context.
@ eAction_morph
replace len(seq) positions starting with location.start with seq
@ eAction_del_at
excise sequence at location if multiplier is specified, delete len(location)*multiplier positions dow...
@ eAction_ins_before
insert seq before the location.start
@ eData_set_type_package
set represents a package of observations at a given location, generally containing asserted + referen...
@ eObservation_variant
inst represent the observed variant at a given position
@ eObservation_asserted
inst represents the asserted base at a position
@ eObservation_reference
inst represents the reference base at the position
int i
int len
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
bool IsFirstSubsetOfSecond(const CSeq_loc &aa, const CSeq_loc &bb)
#define abs(a)
Definition: ncbi_heapmgr.c:130
unsigned int a
Definition: ncbi_localip.c:102
Defines command line argument related classes.
T max(T x_, T y_)
T min(T x_, T y_)
Int4 delta(size_t dimension_, const Int4 *score_)
static const char * suffix[]
Definition: pcregrep.c:408
static const char * prefix[]
Definition: pcregrep.c:405
true_type verbose
Definition: processing.cpp:890
SAnnotSelector – structure controlling annotation searches (feature types, range overlap, strand handling).
Calculate upstream (first) and downstream (second) flanks for loc.
Modified on Fri Apr 26 16:22:57 2024 by modify_doxy.py rev. 669887