NCBI C++ ToolKit
hgvs_writer2.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: hgvs_writer2.cpp 94999 2021-09-27 15:03:40Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Description:
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 
35 
38 
44 
48 
56 
59 #include <objects/seq/Seq_data.hpp>
61 #include <objects/seq/Num_ref.hpp>
65 
66 #include <serial/iterator.hpp>
67 #include <objmgr/util/sequence.hpp>
68 #include <objmgr/seq_vector.hpp>
70 
73 
75 
76 
77 namespace variation {
78 
79 
81  const CVariantPlacement& p,
82  const CVariation_inst& inst)
83 {
85  placement->Assign(p);
86 
88  && inst.GetDelta().size() > 0
89  && inst.GetDelta().front()->IsSetAction()
90  && inst.GetDelta().front()->GetAction() == CDelta_item::eAction_ins_before
91  && p.GetLoc().IsPnt()
92  && !(p.IsSetStart_offset() && p.IsSetStop_offset()))
93  {
94  //insertion: convert the loc to dinucleotide representation as necessary.
96  CVariationUtil::SFlankLocs flanks = util.CreateFlankLocs(p.GetLoc(), 1);
97  CRef<CSeq_loc> dinucleotide_loc =
99  p.GetLoc(),
101  NULL);
102  placement->SetLoc(*dinucleotide_loc);
104  && !p.IsSetStart_offset()
105  && !p.IsSetStop_offset())
106  {
107  CRef<CSeq_loc> loc =
110  NULL);
111 
112  TSeqPos unit_length = x_GetInstLength(inst, p, false);
113 
114  if(loc->GetStrand() == eNa_strand_minus) {
115  loc->SetInt().SetFrom(loc->GetInt().GetTo() - (unit_length - 1) );
116  } else {
117  loc->SetInt().SetTo(loc->GetInt().GetFrom() + unit_length - 1 );
118  }
119  placement->SetLoc(*loc);
120  //todo: do we need to do anything special if it is an offset-loc?
121  }
122 
123  return placement;
124 }
125 
127 {
// NOTE(review): the signature and the loop header of this function are not
// visible in this view; it is presumably x_FindAssertedSequence, which is
// called with a CVariation and assigned to a CConstRef<CSeq_literal> elsewhere
// in this file - confirm.
//
// Returns the literal sequence carried by the first delta item of the first
// sub-variation `v2` that (a) is an instance with an observation set,
// (b) has the asserted or reference observation bit, (c) has no placements
// of its own, and (d) has a first delta item whose seq is a literal.
// Returns a null CConstRef when no sub-variation qualifies.
128  CConstRef<CSeq_literal> asserted_seq;
129 
131  return asserted_seq;
132  }
133 
135  const CVariation& v2 = **it;
// Only instances that carry an observation flag can be asserted/reference alleles.
136  if(!v2.GetData().IsInstance() || !v2.GetData().GetInstance().IsSetObservation()) {
137  continue;
138  }
139 
140  //Note that we are only interested in an asserted observation if it has the same
141  //placement as its sibling (i.e. placement attached at the parent level, rather than at v2). This situation may
142  //arise when we compute a nucleotide precursor variation from a protein variation, and truncate
143  //common suffix and prefix. The variant allele may be truncated differently than the asserted allele,
144  //and they each will get different placements.
145  if(v2.GetData().GetInstance().GetObservation() & (CVariation_inst::eObservation_asserted | CVariation_inst::eObservation_reference)
146  && !v2.IsSetPlacements()
147  && v2.GetData().GetInstance().GetDelta().size() > 0 //VAR-528
148  && v2.GetData().GetInstance().GetDelta().front()->IsSetSeq()
149  && v2.GetData().GetInstance().GetDelta().front()->GetSeq().IsLiteral())
150  {
// Keep a reference to the literal owned by v2 (no copy is made) and stop at the first match.
151  asserted_seq.Reset(&v2.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral());
152  break;
153  }
154  }
155 
156  return asserted_seq;
157 }
158 
159 namespace{
// Predicate used by s_IsInvDup() and s_ParentIsInvDup().
// Requires var's data to be an instance; the right-hand operand of the `&&`
// is on a line that is not visible in this view.
// NOTE(review): presumably that operand checks for the inversion inst type - confirm.
160  bool s_IsInv(const CVariation& var)
161  {
162  return var.GetData().IsInstance() &&
164  }
165 
166  // Determines if the variation's an inverted duplication
// Returns true only when var is a set of exactly two sub-variations whose
// first member is an instance of type eType_ins and whose last member
// satisfies s_IsInv(). Any other shape yields false.
167  bool s_IsInvDup(const CVariation& var)
168  {
// Anything that is not a set cannot be an ins+inv pair.
169  if (!var.GetData().IsSet()) {
170  return false;
171  }
172 
173  const CVariation::TData::TSet& vset = var.GetData().GetSet();
174 
// NOTE(review): the first half of this condition is on a line that is not
// visible in this view; only the "exactly two members" part can be seen.
176  vset.GetVariations().size() != 2u) { // i.e., ins and inv
177  return false;
178  }
179 
180  // Check that the first one is of type insertion
181  const auto& first_var_data = vset.GetVariations().front()->GetData();
182  if (!first_var_data.IsInstance() ||
183  first_var_data.GetInstance().GetType() != CVariation_inst::eType_ins) {
184  return false;
185  }
186 
187  // Check that the second one is of type inversion
188  return s_IsInv(*vset.GetVariations().back());
189  }
190 
191  // Determines if the parent of the passed variation is an inverted duplication
192  bool s_ParentIsInvDup(const CVariation& var)
193  {
194  return var.GetParent() &&
195  s_IsInv(var) &&
196  s_IsInvDup(*var.GetParent());
197  }
198 }
199 
201 {
202  //create a copy so we can call Index on it, and attach
203  //seq to placements, as necessary, as it will potentially be used to construct
204  //asserted-sequence part of HGVS
205 
207  v->Assign(variation);
208  v->Index();
209 
210 #if 0
211  //sometimes we don't need it, and creating it is slow
213  for(CTypeIterator<CVariantPlacement> it(Begin(*v)); it; ++it) {
214  CVariantPlacement& p = *it;
215  if(!p.IsSetSeq()) {
216  util.AttachSeq(p);
217  }
218  }
219 #endif
220 
221  return x_AsHgvsExpression(*v, seq_id, CConstRef<CSeq_literal>(NULL));
222 }
223 
225  const CVariation& variation,
226  CConstRef<CSeq_id> seq_id,
227  CConstRef<CSeq_literal> asserted_seq)
228 {
229  //Find the placement to use (It is possible not to have one at the top level, if subvariations have their own)
230  CRef<CVariantPlacement> placement;
231 
233  if(placements) {
234  ITERATE(CVariation::TPlacements, it, *placements) {
235  const CVariantPlacement& p = **it;
236  if(!seq_id || (p.GetLoc().GetId() && sequence::IsSameBioseq(*p.GetLoc().GetId(), *seq_id, m_scope))) {
237  placement.Reset(new CVariantPlacement);
238  placement->Assign(p);
239  break;
240  }
241  }
242  if(!placement) {
243  NCBI_THROW(CException, eUnknown, "Variations.placements is set, but could not find requested one");
244  }
245  }
246 
247  //Hgvs can't represent opposite orientation; flip as necessary and call recursively
248  if(placement && placement->GetLoc().GetStrand() == eNa_strand_minus) {
249  CRef<CVariation> flipped_variation(new CVariation);
250  flipped_variation->Assign(variation);
251 
253  util.FlipStrand(*flipped_variation);
254 
255  CRef<CSeq_literal> flipped_asserted_seq;
256 
257  if(asserted_seq) {
258  flipped_asserted_seq.Reset(new CSeq_literal);
259  flipped_asserted_seq->Assign(*asserted_seq);
260  if(asserted_seq->IsSetSeq_data()) {
262  asserted_seq->GetSeq_data(),
263  &flipped_asserted_seq->SetSeq_data(),
264  0, asserted_seq->GetLength());
265  }
266  }
267  return x_AsHgvsExpression(*flipped_variation, seq_id, flipped_asserted_seq);
268  }
269 
270  //Asserted sequence does not participate in HGVS expression as independent instance subexpression.
271  //Instead, we'll find it here (e.g. "A") and pass down to create variant subexpressions, e.g. [A>C]+[A>G]
272  if(variation.GetData().IsSet()
273  && variation.GetData().GetSet().GetType() == CVariation::TData::TSet::eData_set_type_package)
274  {
275  asserted_seq = x_FindAssertedSequence(variation);
276  }
277 
278  string hgvs_data_str = "";
279  size_t subvariation_count(0);
280  if(variation.GetData().IsSet()) {
281  const CVariation::TData::TSet& vset = variation.GetData().GetSet();
282  string delim_type =
292  : "(;)";
293 
294  string delim = "";
296  const CVariation& v2 = **it;
297 
298  //asserted or reference instances don't participate in HGVS expressions as individual subvariation expressions
299  //Exception: it is the only member of the set: (JIRA: VAR-626)
300  if(v2.GetData().IsInstance()
301  && vset.GetVariations().size() > 1
302  && v2.GetData().GetInstance().IsSetObservation()
303  && !(v2.GetData().GetInstance().GetObservation() & CVariation_inst::eObservation_variant))
304  {
305  continue;
306  }
307 
308  string subvariation_expr = x_AsHgvsExpression(v2, seq_id, asserted_seq);
309 
310  hgvs_data_str += delim + subvariation_expr;
311  delim = delim_type;
312  ++subvariation_count;
313  }
314  } else if(variation.GetData().IsInstance()) {
315  if(placement) {
316  placement = x_AdjustPlacementForHgvs(*placement, variation.GetData().GetInstance());
317  }
318 
319  hgvs_data_str = x_AsHgvsInstExpression(variation, placement, asserted_seq);
320  } else if(variation.GetData().IsUnknown()) {
321  hgvs_data_str = "?";
322  } else if(variation.GetData().IsNote()) {
323  hgvs_data_str = ":" + variation.GetData().GetNote();
324  } else {
325  hgvs_data_str = ":OTHER";
326  }
327 
328  if(variation.IsSetFrameshift()) {
329  if(hgvs_data_str == "Xaa" || hgvs_data_str == "delinsXaa") {
330  //short-form of frameshift, e.g. p.Ser20fs is internally
331  //represented in "long form", ie. p.Ser20Xaafs.
332  //When writing we'll drop Xaa to convert back to short-form
333  hgvs_data_str = "";
334  }
335 
336  hgvs_data_str += "fs";
337  if(variation.GetFrameshift().IsSetX_length()) {
338  hgvs_data_str += "*" + NStr::NumericToString(variation.GetFrameshift().GetX_length());
339  }
340  }
341 
342  if(variation.IsSetMethod()
343  && find(variation.GetMethod().GetMethod().begin(),
344  variation.GetMethod().GetMethod().end(),
345  CVariationMethod::eMethod_E_computational) != variation.GetMethod().GetMethod().end())
346  {
347  hgvs_data_str = "(" + hgvs_data_str + ")";
348  }
349 
350 
351  bool is_bracketed = false; //will compute whether need to put this subexpression in brackets
352  bool location_within_brackets = true;
353  //will compute whether the location prefix should be factored from brackets, e.g. NM_004004.2:c.35[dupG;A>G]
354  //or within the brackets, e.g. [NM_004004.2:c.35delG]+[NM_006783.1:c.689_690insT]
355 
356  if(variation.GetParent()) {
357  //If a variation is an element of an alleles|genotype set,
358  //it describes an allele and must be bracketed.
359  CVariation::TData::TSet::TType type = variation.GetParent()->GetData().GetSet().GetType();
360  is_bracketed =
363 
364 
365  if(variation.GetData().IsInstance()
366  && variation.GetData().GetInstance().GetType() == CVariation_inst::eType_microsatellite)
367  {
368  //Except for microsatellites, as they are bracketed at the inst-level (see the SSR grammar for details)
369  is_bracketed = false;
370  }
371  } else if(subvariation_count > 1) {
372  //Root non-singleton variation: it describes a single allele (i.e. also needs to be bracketed)
373  //UNLESS it is a set that itself describes individual alleles.
374  CVariation::TData::TSet::TType type = variation.GetData().GetSet().GetType();
375  is_bracketed =
379 
380  location_within_brackets = false;
381  }
382 
383  string hgvs_loc_str = "";
384  if(placement && variation.IsSetPlacements()) {
385  //prefix the placement only if it is defined at this level (otherwise will be handled at the parent level)
386  hgvs_loc_str = AsHgvsExpression(*placement);
387  }
388 
389  if(is_bracketed) {
390  if(location_within_brackets) {
391  hgvs_data_str = "[" + hgvs_loc_str + hgvs_data_str + "]";
392  } else {
393  hgvs_data_str = hgvs_loc_str + "[" + hgvs_data_str + "]";
394  }
395  } else {
396  hgvs_data_str = hgvs_loc_str + hgvs_data_str;
397  }
398 
399  return hgvs_data_str;
400 }
401 
402 
/// Convert a protein string written in the one-letter ncbieaa alphabet into
/// the concatenated three-letter residue names used in HGVS protein
/// expressions (e.g. "MX*" -> "MetXaaTer").
///
/// @param prot_str
///   Residues as one-letter codes. Recognized codes are
///   "-ABCDEFGHIKLMNPQRSTVWXYZU*O"; matching is case-sensitive.
/// @return
///   Three characters per input residue. If ANY residue is not a recognized
///   code, no conversion is attempted and prot_str is returned unchanged.
///   An empty input yields an empty string.
std::string Ncbieaa2HgvsAA(const std::string& prot_str)
{
    // One-letter codes. The index of a code in this table, times three, is
    // the offset of its three-letter name in kHgvs3Letter.
    // The tables are built once instead of being re-measured for every residue.
    static const std::string kNcbieaa("-ABCDEFGHIKLMNPQRSTVWXYZU*O");

    //Note: with Hgvs-flavor "Xaa"
    static const std::string kHgvs3Letter(
        "---AlaAsxCysAspGluPheGlyHisIleLysLeuMetAsnProGlnArgSerThrValTrpXaaTyrGlxSecTerPyl");

    std::string out;
    out.reserve(prot_str.size() * 3);

    for (const char aa : prot_str) {
        const std::string::size_type pos = kNcbieaa.find(aa);
        if (pos == std::string::npos) {
            //Can't convert: hand the input back untouched rather than
            //emitting a half-converted string.
            return prot_str;
        }
        out.append(kHgvs3Letter, pos * 3, 3);
    }

    return out;
}
426 
// Renders a seq-literal as the text used inside an HGVS expression.
//
// literal   - the literal to render.
// translate - for nucleotide seq-data, selects the translation branch
//             (the lines performing it are not visible in this view);
//             otherwise the IUPAC nucleotide string is used as is.
// is_mito   - selects genetic code id 2 instead of 1 for that branch.
//
// Result:
//  * nucleotide seq-data: IUPAC-na string, or the output of the translation branch;
//  * protein seq-data: converted to ncbieaa and passed through Ncbieaa2HgvsAA();
//  * seq-data in any other encoding: empty string;
//  * no seq-data: the literal's length (with its fuzz, if set) formatted by
//    s_IntWithFuzzToStr(); throws CException(eUnknown, "Not supported") when
//    translate is requested and the length is non-zero.
427 string CHgvsParser::x_SeqLiteralToStr(const CSeq_literal& literal, bool translate, bool is_mito)
428 {
429  string out("");
430 
431  if(literal.IsSetSeq_data()) {
// Work on a private copy: Convert() below rewrites the object in place.
432  CRef<CSeq_data> sd(new CSeq_data);
433  sd->Assign(literal.GetSeq_data());
434 
435  if( sd->IsIupacna()
436  || sd->IsNcbi2na()
437  || sd->IsNcbi4na()
438  || sd->IsNcbi8na()
439  || sd->IsNcbipna())
440  {
441  CSeqportUtil::Convert(*sd, sd, CSeq_data::e_Iupacna, 0, literal.GetLength() );
442  const string& nuc_str = sd->GetIupacna().Get();
443 
444  if(translate) {
// NOTE(review): the declaration of `code` and the call that consumes
// nuc_str/out/&code are on lines not visible in this view.
446  code.SetId(is_mito ? 2 : 1);
448  nuc_str,
449  out,
451  &code);
453  } else {
454  out = nuc_str;
455  }
456  } else if(sd->IsIupacaa()
457  || sd->IsNcbi8aa()
458  || sd->IsNcbieaa()
459  || sd->IsNcbipaa()
460  || sd->IsNcbistdaa())
461  {
462  string prot_str;
463  CSeqportUtil::Convert(*sd, sd, CSeq_data::e_Ncbieaa, 0, literal.GetLength() );
464  prot_str = sd->GetNcbieaa().Get();
465  out = Ncbieaa2HgvsAA(prot_str);
466  }
467 
468  } else {
// No residues available: only the length can be reported.
469  if(translate && literal.GetLength() > 0) {
471  NCBI_THROW(CException, eUnknown, "Not supported");
472  }
473  out = s_IntWithFuzzToStr(literal.GetLength(),
474  NULL,
475  false,
476  literal.IsSetFuzz() ? &literal.GetFuzz() : NULL);
477  }
478  return out;
479 }
480 
481 
// Returns the sequence data covered by `v` from v.begin() to v.end() as a string.
// NOTE(review): the declaration of `v` is on a line not visible in this view;
// presumably it is a sequence vector built over `loc` - confirm.
482 string CHgvsParser::x_LocToSeqStr(const CSeq_loc& loc)
483 {
485  string seq_str;
486  v.GetSeqData(v.begin(), v.end(), seq_str);
487  return seq_str;
488 }
489 
490 
492 {
// NOTE(review): the signature is not visible in this view; the name
// s_GetHgvsPos and the (abs_pos, atg_pos) argument order are taken from the
// calls in s_IntWithFuzzToStr - confirm.
//
// Without a reference position (atg_pos is null) the value is returned
// unchanged. With one, the result is the 1-based distance from *atg_pos,
// with zero skipped: abs_pos == *atg_pos gives 1, and the position right
// before it gives -1.
493  if(!atg_pos) {
494  return abs_pos;
495  } else {
496  TSignedSeqPos pos = (TSignedSeqPos)abs_pos + 1 - *atg_pos; //hgvs absolute coordinates are 1-based.
// There is no position 0 in this numbering; shift non-positive values down by one.
497  if(pos <= 0) {
498  pos--;
499  }
500  return pos;
501  }
502 }
503 
504 
506  long pos,
507  const TSeqPos* hgvs_ref_pos,
508  bool with_sign,
509  const CInt_fuzz* fuzz)
510 {
// NOTE(review): the first line of the signature is not visible in this view;
// the name s_IntWithFuzzToStr is taken from the calls elsewhere in this file.
//
// Formats an integer (a position, length, multiplier or offset) plus optional
// fuzz as HGVS text.
//   pos          - raw value; mapped through s_GetHgvsPos(pos, hgvs_ref_pos).
//   hgvs_ref_pos - reference position handed to s_GetHgvsPos; may be NULL.
//   with_sign    - when true the magnitude is printed and an explicit "+" or
//                  "-" is prefixed.
//   fuzz         - may be NULL. Range fuzz prints "(min_max)" (the value of
//                  pos itself is not printed); gt/tr limits print "(N_?)";
//                  lt/tl limits print "(?_N)"; any other fuzz prints "(N)".
//                  A zero value prints "?" when the fuzz is gt/tr, or when
//                  with_sign is true and the fuzz is lt/tl.
511 /*
512  * with_sign indicates whether the sign is mandatory and must be factored out
513  * (as offset part of an intronic expression)
514  * In this case we'll prefix the sign in the end, and will adjust for sign
515  * of values inside the expressions by multiplying by k
516  */
517 
518  const bool fuzz_gt =
519  fuzz
520  && fuzz->IsLim()
521  && ( fuzz->GetLim() == CInt_fuzz::eLim_gt
522  || fuzz->GetLim() == CInt_fuzz::eLim_tr);
523 
524  const bool fuzz_lt =
525  fuzz
526  && fuzz->IsLim()
527  && ( fuzz->GetLim() == CInt_fuzz::eLim_lt
528  || fuzz->GetLim() == CInt_fuzz::eLim_tl);
529 
530  const long hgvs_pos = s_GetHgvsPos(pos, hgvs_ref_pos);
// For a zero value the sign is borrowed from the direction of the fuzz, if any.
531  const int sign = hgvs_pos > 0 ? 1
532  : hgvs_pos < 0 ? -1
533  : fuzz_gt ? 1
534  : fuzz_lt ? -1
535  : 0;
536 
// k turns negative values into magnitudes when the sign is printed separately.
537  const int k = (with_sign && sign == -1) ? -1 : 1;
538 
539  string val = "";
540  if(fuzz && fuzz->IsRange()) {
541  const string from = NStr::LongToString(
542  k * s_GetHgvsPos(fuzz->GetRange().GetMin(),
543  hgvs_ref_pos));
544  const string to = NStr::LongToString(
545  k * s_GetHgvsPos(fuzz->GetRange().GetMax(),
546  hgvs_ref_pos));
547  val = "(" + from + "_" + to + ")";
548  } else {
549  val = NStr::LongToString(k*hgvs_pos);
550 
// The alternatives below are tried top to bottom; the first match wins.
551  val = !fuzz ? val // no-fuzz
552  : !hgvs_pos
553  && with_sign
554  && (fuzz_gt || fuzz_lt) ? "?" // fuzz-only offset, e.g. 10+? instead of 10+(0_?)
555  : !hgvs_pos && fuzz_gt ? "?" // positive fuzz, e.g. ins? instead of ins(0_?)
556  : !fuzz->IsLim() ? "(" + val + ")"
557  : fuzz_gt ? "(" + val + "_?)"
558  : fuzz_lt ? "(?_" + val + ")"
559  : "(" + val + ")";
560  }
561 
// A sign of 0 (zero value, no directional fuzz) is printed as "+".
562  const string sign_str = (!with_sign ? ""
563  : sign >= 0 ? "+"
564  : "-");
565 
566  return sign_str + val;
567 }
568 
570 {
571  string moltype = "";
572 
574  moltype = "g.";
575  } else if(vp.GetMol() == CVariantPlacement::eMol_cdna) {
576  moltype = "c.";
577  } else if(vp.GetMol() == CVariantPlacement::eMol_rna) {
578  moltype = "n.";
579  } else if(vp.GetMol() == CVariantPlacement::eMol_protein) {
580  moltype = "p.";
581  } else if(vp.GetMol() == CVariantPlacement::eMol_mitochondrion) {
582  moltype = "m.";
583  } else {
584  moltype = "u.";
585  }
586 
587  string idstr;
588  {{
589  const CSeq_id& id = sequence::GetId(vp.GetLoc(), NULL);
590  idstr = scope && id.IsGi() ? sequence::GetAccessionForGi(id.GetGi(), *scope)
591  : id.GetSeqIdString(true);
592 
593  if(NStr::StartsWith(idstr, "LRG:")) {
594  idstr = idstr.substr(4);
595  }
596  }}
597 
598  return (vp.GetLoc().GetStrand() == eNa_strand_minus ? "o" : "") //in HGVS minus-strand is prefixed with "o"
599  +idstr + ":" + moltype;
600 }
601 
603  TSeqPos anchor_pos,
604  const CInt_fuzz* anchor_fuzz,
605  TSeqPos anchor_ref_pos,
606  TSeqPos effective_seq_length,
607  const long* offset_pos,
608  const CInt_fuzz* offset_fuzz)
609 {
// NOTE(review): the first line of the signature is not visible in this view;
// the name s_OffsetPointToString is taken from the calls elsewhere in this file.
//
// Builds the text for one coordinate that may carry an offset.
//   anchor_pos / anchor_fuzz - the anchor position and its optional fuzz.
//   anchor_ref_pos           - origin passed to s_IntWithFuzzToStr for the anchor.
//   effective_seq_length     - used to detect an anchor at the end of the sequence.
//   offset_pos / offset_fuzz - optional offset (NULL when absent) and its fuzz.
// When an offset is present and the anchor is at position 0 or at/after
// effective_seq_length - 1, anchor and offset are added and reported as one
// unsigned-form position (offset_fuzz is not used on that path). Otherwise the
// anchor string is followed by the explicitly signed offset string, if any.
// NOTE(review): if TSeqPos is unsigned and effective_seq_length is 0, the
// subtraction below wraps around and only anchor_pos == 0 takes the first
// branch - confirm this is intended.
610  if(offset_pos && (anchor_pos == 0 || anchor_pos >= effective_seq_length - 1)) {
611  //JIRA:VAR-343
612  //If we have an offset-point anchored to either start of end of the transcript (less polyA),
613  //then the anchor+offset must be resolved, i.e. reported as absolute position relative the origin of the coordinate system.
614  //That is, intronic positions are reported relative to closest exon boundary, while near-gene positions are reported
615  //relative to the coordinate system origin, which could be start of the sequence, cds-start, or cds-stop, depending on the context.
616  //
617  // Note: not sure whether anchor_fuzz and/or offset_fuzz should be used, and whether it needs to be modified
618  long resolved_pos = anchor_pos + *offset_pos;
619  return s_IntWithFuzzToStr(resolved_pos, &anchor_ref_pos, false, anchor_fuzz);
620  } else {
621  string anchor_str = s_IntWithFuzzToStr(anchor_pos, &anchor_ref_pos, false, anchor_fuzz);
622  string offset_str = !offset_pos ? "" : s_IntWithFuzzToStr(*offset_pos, NULL, true, offset_fuzz);
623  return anchor_str+offset_str;
624  }
625 }
626 
628 {
630 }
631 
633 {
634  //For protein placement we'll need seq-data (e.g. p.123Glu)
636 
638 
639  if(orig_vp.GetMol() == CVariantPlacement::eMol_protein && !orig_vp.IsSetSeq()) {
640  vp_ref.Reset(new CVariantPlacement);
641  vp_ref->Assign(orig_vp);
642  util.AttachSeq(*vp_ref);
643  }
644  const CVariantPlacement& vp = vp_ref ? *vp_ref : orig_vp;
645 
646  CBioseq_Handle bsh;
647  {{
648  const CSeq_id& id = sequence::GetId(vp.GetLoc(), NULL);
649  if(id.IsGeneral() && id.GetGeneral().GetDb() == "LRG") {
651  if(lrg_resolver->CanCreate(id.GetGeneral().GetTag().GetStr())) {
652  CSeq_id_Handle idh = lrg_resolver->Get(id.GetGeneral().GetTag().GetStr());
653  bsh = m_scope->GetBioseqHandle(idh);
654  }
655  } else {
656  bsh = m_scope->GetBioseqHandle(id);
657  }
658  }}
659 
660  //we'll need to detect when an anchor in anchor+offset case occurs at last position of
661  //the last exon; we'll need to know the effective length.
662  size_t effective_seq_length = util.GetEffectiveTranscriptLength(bsh);
663 
664  //For c.-based coordinates, the first pos as start of CDS.
665  //If the position falls it 3'-UTR, the origin is CDS-stop.
666  TSeqPos first_pos = 0;
667  TSeqPos cds_last_pos = 0;
669  for(CFeat_CI ci(bsh, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) {
670  const CMappedFeat& mf = *ci;
671  if(mf.GetData().IsCdregion()) {
673  cds_last_pos = sequence::GetStop(mf.GetLocation(), NULL, eExtreme_Biological);
674  break;
675  }
676  }
677  }
678 
679  string loc_str = "";
680  if(vp.GetLoc().IsEmpty() || vp.GetLoc().IsNull()) {
681  loc_str = "?";
682 
683  //Note: it is possible that the location is not known, but the sequence is known, e.g. if
684  //protein variation was derived from a variation on a partial CDS.
685 
686  if(vp.GetMol() == CVariantPlacement::eMol_protein && vp.IsSetSeq() && vp.GetSeq().IsSetSeq_data()) {
687  //prepend first AA of asserted sequence
688  string aa = vp.GetSeq().GetSeq_data().GetNcbieaa().Get().substr(0,1);
689  loc_str = Ncbieaa2HgvsAA(aa) + loc_str;
690  }
691 
692  } else if(vp.GetLoc().IsWhole()) {
693  ; //E.g. "NG_12345.6:g.=" represents no-change ("=") on the whole "NG_12345.6:g."
694  } else if(vp.GetLoc().IsPnt() && CVariationUtil::s_GetLength(vp, NULL) == 1) {
695  //Note, if this is a point, but we have stop-offset, we need to treat it as interval
696  //This happens when we have an offset-based placement with start==stop, which gets
697  //collapsed to a single point after remapping instead of remaining a single-base interval.
698 
699  const CSeq_point& pnt = vp.GetLoc().GetPnt();
700 
701  long start_offset = 0;
702  if(vp.IsSetStart_offset()) {
703  start_offset = vp.GetStart_offset();
704  }
705 
706  bool is_cdsstop_relative = cds_last_pos
707  && ( pnt.GetPoint() > cds_last_pos
708  || (pnt.GetPoint() == cds_last_pos && vp.IsSetStart_offset()));
709  //VAR-1076
710 
711  loc_str = s_OffsetPointToString(
712  vp.GetLoc().GetPnt().GetPoint(),
713  vp.GetLoc().GetPnt().IsSetFuzz() ? &vp.GetLoc().GetPnt().GetFuzz() : NULL,
714  is_cdsstop_relative ? cds_last_pos + 1 : first_pos,
715  effective_seq_length,
716  (vp.IsSetStart_offset() ? &start_offset : NULL),
718 
719  if(is_cdsstop_relative) {
720  loc_str = "*" + loc_str;
721  }
722 
724  //prepend first AA of asserted sequence
725  string aa = vp.GetSeq().GetSeq_data().GetNcbieaa().Get().substr(0,1);
726  loc_str = Ncbieaa2HgvsAA(aa) + loc_str;
727  }
728  } else {
729  CConstRef<CSeq_loc> int_loc;
730  if(vp.GetLoc().IsInt()) {
731  int_loc.Reset(&vp.GetLoc());
732  } else {
734  }
735 
736  bool is_biostart_cdsstop_relative = cds_last_pos && int_loc->GetStart(eExtreme_Biological) > cds_last_pos;
737  const CInt_fuzz* biostart_fuzz = sequence::GetStrand(vp.GetLoc(), NULL) == eNa_strand_minus ?
738  (int_loc->GetInt().IsSetFuzz_to() ? &int_loc->GetInt().GetFuzz_to() : NULL)
739  : (int_loc->GetInt().IsSetFuzz_from() ? &int_loc->GetInt().GetFuzz_from() : NULL);
740  long biostart_offset = 0;
741  if(vp.IsSetStart_offset()) {
742  biostart_offset = vp.GetStart_offset();
743  }
744  string biostart_str = s_OffsetPointToString(
745  int_loc->GetStart(eExtreme_Biological),
746  biostart_fuzz,
747  is_biostart_cdsstop_relative ? cds_last_pos + 1 : first_pos,
748  effective_seq_length,
749  vp.IsSetStart_offset() ? &biostart_offset : NULL,
752  string aa = vp.GetSeq().GetSeq_data().GetNcbieaa().Get().substr(0,1);
753  biostart_str = Ncbieaa2HgvsAA(aa) + biostart_str;
754  }
755  if(is_biostart_cdsstop_relative) {
756  biostart_str = "*" + biostart_str;
757  }
758 
759 
760  bool is_biostop_cdsstop_relative = cds_last_pos && int_loc->GetStop(eExtreme_Biological) > cds_last_pos;
761  const CInt_fuzz* biostop_fuzz = sequence::GetStrand(vp.GetLoc(), NULL) == eNa_strand_minus ?
762  (int_loc->GetInt().IsSetFuzz_from() ? &int_loc->GetInt().GetFuzz_from() : NULL)
763  : (int_loc->GetInt().IsSetFuzz_to() ? &int_loc->GetInt().GetFuzz_to() : NULL);
764  long biostop_offset = 0;
765  if(vp.IsSetStop_offset()) {
766  biostop_offset = vp.GetStop_offset();
767  }
768  string biostop_str = s_OffsetPointToString(
769  int_loc->GetStop(eExtreme_Biological),
770  biostop_fuzz,
771  is_biostop_cdsstop_relative ? cds_last_pos + 1 : first_pos,
772  effective_seq_length,
773  vp.IsSetStop_offset() ? &biostop_offset : NULL,
776  //prepend last aa of the asserted sequence
777  const string& prot_str = vp.GetSeq().GetSeq_data().GetNcbieaa().Get();
778  biostop_str = Ncbieaa2HgvsAA(prot_str.substr(prot_str.size() - 1,1)) + biostop_str;
779  }
780  if(is_biostop_cdsstop_relative) {
781  biostop_str = "*" + biostop_str;
782  }
783 
785  swap(biostart_str, biostop_str);
786  }
787 
788  loc_str = biostart_str + "_" + biostop_str;
789  }
790  return loc_str;
791 }
792 
793 
// Adds up the lengths of delta items `d` of an inst and returns the total.
//
// inst                   - the variation inst (the loop header that walks it
//                          is on a line not visible in this view).
// p                      - placement; not referenced on any visible line.
//                          NOTE(review): presumably used by the hidden
//                          IsThis() branch - confirm.
// account_for_multiplier - when true, an item's length is multiplied by its
//                          multiplier if one is set; otherwise the factor is 1.
//
// Per-item length: a literal contributes its literal length, a loc contributes
// sequence::GetLength() resolved through m_scope. Any seq kind other than
// literal/this/loc throws CException(eUnknown, "Unhandled code").
794 TSeqPos CHgvsParser::x_GetInstLength(const CVariation_inst& inst, const CVariantPlacement& p, bool account_for_multiplier)
795 {
796  TSeqPos len(0);
797 
799  const CDelta_item& d = **it;
800  int multiplier = d.IsSetMultiplier() && account_for_multiplier ? d.GetMultiplier() : 1;
801  TSeqPos d_len(0);
802  if(d.GetSeq().IsLiteral()) {
803  d_len = d.GetSeq().GetLiteral().GetLength();
804  } else if(d.GetSeq().IsThis()) {
// NOTE(review): the statement for this branch is not visible in this view.
806  } else if(d.GetSeq().IsLoc()) {
807  d_len = sequence::GetLength(d.GetSeq().GetLoc(), m_scope);
808  } else {
809  NCBI_THROW(CException, eUnknown, "Unhandled code");
810  }
811  len += d_len * multiplier;
812  }
813  return len;
814 }
815 
817 {
// NOTE(review): the signature is not visible in this view; the name
// IsMitochondrion and the bioseq-handle argument `bsh` are taken from the
// call elsewhere in this file - confirm.
// True when a bio-source is found for bsh and its genome is
// eGenome_mitochondrion; false when no bio-source is found.
818  const CBioSource* bs = sequence::GetBioSource(bsh);
819  return bs && bs->GetGenome() == CBioSource::eGenome_mitochondrion;
820 }
821 
822 //VAR-1556
824 {
// NOTE(review): the signature is not visible in this view; the name
// ContainsAnyFuzz is taken from the call elsewhere in this file, where it is
// passed a placement - confirm.
// Returns false for a null placement, true when the placement has a start- or
// stop-offset fuzz, and otherwise the outcome of a check that walks the
// placement's location starting at Begin(p->GetLoc()).
// NOTE(review): the first line of that last check is not visible; presumably
// it looks for any CInt_fuzz inside the location - confirm.
825  return !p ? false
826  : p->IsSetStart_offset_fuzz() ? true
827  : p->IsSetStop_offset_fuzz() ? true
829  Begin(p->GetLoc())) ? true
830  : false;
831 }
832 
833 
835  const CVariation& variation,
837  CConstRef<CSeq_literal> explicit_asserted_seq)
838 {
839 
840  CBioseq_Handle bsh;
841  if(placement) {
842  bsh = m_scope->GetBioseqHandle(sequence::GetId(placement->GetLoc(), NULL));
843  }
844  bool is_mito = bsh && IsMitochondrion(bsh);
845 
846  const CVariation_inst& inst = variation.GetData().GetInstance();
847  bool is_prot_inst =
853 
854  if(is_prot_inst && placement && placement->GetMol() != CVariantPlacement::eMol_protein) {
855  NCBI_THROW(CException, eUnknown, "Can't make protein HGVS expression for nucleotide placement");
856  }
857  bool is_prot = is_prot_inst || (placement && placement->GetMol() == CVariantPlacement::eMol_protein);
858 
859 
860  CConstRef<CSeq_literal> asserted_seq(NULL);
861  {{
862  //Priority for using asserted-sequence:
863  //use from placement (instantiate if necessary); otherwise use explicit packaged asserted-observation
864  //seq-literal passed from above. Will only use it if have seq-data (SNP-5605) and don't have fuzz (VAR-638)
865  if( placement
866  && placement->IsSetSeq()
867  && placement->GetSeq().IsSetSeq_data()
868  && !CTypeConstIterator<CInt_fuzz>(Begin(placement->GetLoc())))
869  {
870  asserted_seq.Reset(&placement->GetSeq());
871  } else {
872 #if 0
873  //don't automatically fetch asserted sequence, because want to allow explicit assertions, e.g.
874  //NC_000001.9:g.(2472747_?)_(?_2489105)inv16359, but don't compute 16359 automatically because
875  //this may not be applicable to fuzzy locs
876 
877  if(placement) {
878  //have placement but no sequence, see if we can fetch it
880  p2->Assign(*placement);
882  if(util.AttachSeq(*p2)) {
883  asserted_seq.Reset(&p2->GetSeq());
884  }
885  }
886 #endif
887 
888  if(!asserted_seq
889  && explicit_asserted_seq
890  && !(placement && placement->GetMol() == CVariantPlacement::eMol_protein))
891  {
892  /*
893  * Getting seq from placement might or might not have worked (e.g. can't get for intronic case).
894  * If asserted sequence is not filled out, see if we have apriori asserted sequence, except
895  * cannot use explicit asserted seq to construct prot inst, as it could be partially-specified: e.g.
896  * "NP_079142.2:p.C11_G21delinsGlnSerLys - the asserted seq is C..G, so we cannot construct
897  * del??ins representation that asserts the sequence being deleted within a delins.
898  */
899  asserted_seq = explicit_asserted_seq.GetPointer();
900  }
901  }
902  }}
903 
904  static const size_t s_max_literal_length = 16;
905 
906  string asserted_seq_str =
907  !asserted_seq ? ""
908  : asserted_seq->GetLength() < s_max_literal_length ? x_SeqLiteralToStr(*asserted_seq, is_prot, is_mito)
909  : NStr::NumericToString(asserted_seq->GetLength());
910 
911  string inst_str = "";
912  bool append_delta = false;
915  {
916  //Prepend the asserted sequence, but only if its length is under threshold.
917  //If it is too long, it can't be used, as it will be represented by a number, and
918  //the preceding context also ends with a number (location): e.g
919  // NC_000001:g.100000A= - correct
920  // NC_000001:g.100000_100123= - correct
921  // NC_000001:g.100000_100123124= - wrong, can't use literal's length "124"
922  inst_str = ( asserted_seq
923  && asserted_seq->GetLength() < s_max_literal_length
924  && asserted_seq->IsSetSeq_data()
925  && !is_prot ? asserted_seq_str : "")
926  + "=";
927  } else if(!asserted_seq_str.empty() && s_ParentIsInvDup(variation)) {
928  inst_str = "inv";
929  } else if(inst.GetType() == CVariation_inst::eType_inv) {
930  inst_str = "inv" + asserted_seq_str;
931  } else if(inst.GetType() == CVariation_inst::eType_snv) {
932  inst_str = (asserted_seq ? asserted_seq_str : "N" )+ ">";
933  append_delta = true;
934  } else if(inst.GetType() == CVariation_inst::eType_mnp
937  {
939  placement && placement->GetLoc().IsPnt() &&
940  placement->GetLoc().GetPnt().GetPoint() == 0)
941  {
942  inst_str = "extMet-";
943  } else if(inst.GetType() == CVariation_inst::eType_prot_other
944  && placement && placement->GetLoc().IsPnt()
945  && bsh
946  && placement->GetLoc().GetPnt().GetPoint() == bsh.GetInst_Length() - 1)
947  {
948  inst_str = "ext*";
949  } else if(inst.GetType() == CVariation_inst::eType_prot_other) {
950  inst_str = "delins";
951  } else {
952  inst_str = "del" + asserted_seq_str + "ins";
953  }
954 
955  append_delta = true;
956  } else if(inst.GetType() == CVariation_inst::eType_del) {
957  if(placement && placement->GetLoc().IsWhole()) {
958  inst_str = "0"; //whole-product deletion
959  } else if(is_prot) {
960  inst_str = "del"; //do not generate asserted part for protein expressions: SNP-4623
961  } else if(ContainsAnyFuzz(placement)) { //VAR-1556
962  inst_str = "del";
963  } else {
964  inst_str = "del" + asserted_seq_str;
965  }
966  } else if(inst.GetType() == CVariation_inst::eType_ins) {
967  //If the insertion is this*2 then this is a dup
968  bool is_dup = false;
969  if(inst.GetDelta().size() == 1) {
970  const CDelta_item& delta = **inst.GetDelta().begin();
971  if(delta.GetSeq().IsThis() && delta.IsSetMultiplier() && delta.GetMultiplier() == 2) {
972  is_dup = true;
973  }
974  }
975 
976  if(is_dup) {
977  // According to the HGVS standard
978  // http://varnomen.hgvs.org/recommendations/DNA/variant/duplication/
979  // the 'dup' HGVS expressions are not to include
980  // the duplicated nucleotides.
981  // Format: “prefix”“position(s)_duplicated”“dup”, e.g. g.123_345dup
982  inst_str = "dup";
983  append_delta = false;
984  } else {
985  inst_str = "ins";
986  append_delta = true;
987  }
988  } else if(inst.GetType() == CVariation_inst::eType_microsatellite) {
989  inst_str = "";
990  append_delta = true;
991  } else if(inst.GetType() == CVariation_inst::eType_transposon) {
992  inst_str = "con";
993  append_delta = true;
997  {
998  append_delta = true;
999  } else {
1000  inst_str = "?";
1001  }
1002 
1003 
1004  if(append_delta) {
1006 
1007  const CDelta_item& delta = **it;
1008 
1009  if(variation.GetData().GetInstance().GetType() == CVariation_inst::eType_microsatellite
1010  && variation.GetParent()
1011  && !variation.IsSetPlacements()
1012  && variation.GetParent()->GetData().GetSet().GetVariations().front().GetPointer() != &variation)
1013  {
1014  /*
1015  * Don't use literal subsequent subvariations in a multi-allele microsatellite expression,
1016  * e.g. NM_000815.2:c.100_101TC[5]+[3], as opposed to NM_000815.2:c.100_101TC[5]+TC[3] -
1017  * in the second subexpression we simply want [3] instead of TC[3]
1018  */
1019  ;
1020  } else if(delta.GetSeq().IsThis()) {
1021  ;
1022  } else if(delta.GetSeq().IsLiteral()) {
1024  literal->Assign(delta.GetSeq().GetLiteral());
1025  if(NStr::StartsWith(inst_str, "extMet") || NStr::StartsWith(inst_str, "extX")) {
1026  literal->SetLength()--;
1027  //length of extension is one less than the length of the sequence that replaces first or last AA
1028  }
1029 
1030  string variant_str = x_SeqLiteralToStr(*literal, is_prot, is_mito);
1031  if( inst_str == variant_str + ">"
1032  || inst_str == "del" + variant_str + "ins")
1033  {
1034  // instead of "G>G" report "G=", but not "1=",
1035  // since "1" will coalesce into prefixed position.
1036  //instead of "delACTinsACT" report "ACT="
1037  inst_str = (!variant_str.empty() && isdigit(variant_str.at(0)) ? "" : variant_str) + "=";
1038  } else {
1039  inst_str += variant_str;
1040  }
1041  } else if(delta.GetSeq().IsLoc()) {
1042  string delta_loc_str;
1043  //the repeat-unit in microsattelite is always literal sequence:
1044  //NG_011572.1:g.5658NG_011572.1:g.5658_5660(15_24) - incorrect
1045  //NG_011572.1:g.5658CAG(15_24) - correct
1047  delta_loc_str = x_LocToSeqStr(delta.GetSeq().GetLoc());
1048  } else {
1050  p_tmp->SetLoc().Assign(delta.GetSeq().GetLoc());
1052  p_tmp->SetMol(util.GetMolType(sequence::GetId(p_tmp->GetLoc(), NULL)));
1053 
1054  if(p_tmp->GetLoc().GetId()
1055  && placement->GetLoc().GetId()
1056  && p_tmp->GetLoc().GetId()->Equals(*placement->GetLoc().GetId()))
1057  {
1058  //if delta has same seq-id as placement, omit the seq-id header,
1059  //e.g. NM_000815.2:c.100delTins5_10
1060  //instead of NM_000815.2:c.100delTinsNM_000815.2:c.5_10
1061  delta_loc_str = x_PlacementCoordsToStr(*p_tmp);
1062  } else {
1063  //with header NM_000815.3:c.100delTinsNM_000815.2:c.5_10
1064  delta_loc_str = AsHgvsExpression(*p_tmp);
1065  }
1066  }
1067 
1068  inst_str += delta_loc_str;
1069 
1070  } else {
1071  NCBI_THROW(CException, eUnknown, "Unhandled delta class");
1072  }
1073 
1074  //add multiplier, but make sure we're dealing with SSR.
1075  if(delta.IsSetMultiplier()) {
1077  string multiplier_str = s_IntWithFuzzToStr(
1078  delta.GetMultiplier(),
1079  NULL,
1080  false,
1081  delta.IsSetMultiplier_fuzz() ? &delta.GetMultiplier_fuzz() : NULL);
1082 
1083  if(!NStr::StartsWith(multiplier_str, "(")) {
1084  multiplier_str = "[" + multiplier_str + "]";
1085  //In HGVS-land the fuzzy multiplier value (in parentheses) existis as is, but an exact value
1086  //is enclosed in brackets like an allele-set.
1087  }
1088 
1089  inst_str += multiplier_str;
1090  } else if(!NStr::StartsWith(inst_str, "dup")) {
1091  //multiplier is expected for dup or ssr representation only
1092  NCBI_THROW(CException, eUnknown, "Multiplier value is set in unexpected context (only STR supported)");
1093  }
1094  }
1095  }
1096  }
1097 
1098  return inst_str;
1099 }
1100 
1101 };
1102 
1104 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
CBioseq_Handle –.
CConstRef –.
Definition: ncbiobj.hpp:1266
CFeat_CI –.
Definition: feat_ci.hpp:64
CMappedFeat –.
Definition: mapped_feat.hpp:59
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
Resolve LRG seq-ids, e.g. LRG_123, LRG_123t1, LRG_123p1.
static TSeqPos Convert(const CSeq_data &in_seq, CSeq_data *out_seq, CSeq_data::E_Choice to_code, TSeqPos uBeginIdx=0, TSeqPos uLength=0, bool bAmbig=false, Uint4 seed=17734276)
static TSeqPos ReverseComplement(CSeq_data *in_seq, TSeqPos uBeginIdx=0, TSeqPos uLength=0)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
CVariantPlacement –.
Set of related Variations.
Definition: Variation_.hpp:127
CVariation_inst –.
const CVariation * GetParent() const
Definition: Variation.hpp:92
static string s_OffsetPointToString(TSeqPos anchor_pos, const CInt_fuzz *anchor_fuzz, TSeqPos anchor_ref_pos, TSeqPos effective_seq_length, const long *offset_pos, const CInt_fuzz *offset_fuzz)
Construct an HGVS coordinate, which may be an intronic offset-point, e.g. "5+(10_11)".
static TSignedSeqPos s_GetHgvsPos(TSeqPos abs_pos, const TSeqPos *atg_pos)
if no atg_pos, assume that not dealing with coordinate systems (simply return abs-pos) otherwise,...
static string s_SeqIdToHgvsStr(const CVariantPlacement &vp, CScope *scope=NULL)
Construct an hgvs "header" consisting of seq-id and mol-type, e.g. "NG_016831.1:g....
CRef< CVariantPlacement > x_AdjustPlacementForHgvs(const CVariantPlacement &p, const CVariation_inst &inst)
In some cases the placement needs to be adjusted depending on inst, e.g.
string x_AsHgvsInstExpression(const CVariation &inst_variation, CConstRef< CVariantPlacement > p, CConstRef< CSeq_literal > asserted_seq)
Create "inst" part of HGVS expression.
string AsHgvsExpression(const CVariation &variation, CConstRef< CSeq_id > seq_id=CConstRef< CSeq_id >(NULL))
static string s_IntWithFuzzToStr(long pos, const TSeqPos *ref_pos, bool with_sign, const CInt_fuzz *fuzz)
this function may be used to create hgvs-coordinates (if ref_pos is not null), or to create a fuzzy h...
CConstRef< CSeq_literal > x_FindAssertedSequence(const CVariation &v)
If the variation is a package-set, find the subvariation with observation-type "asserted" and return ...
CRef< CScope > m_scope
string x_LocToSeqStr(const CSeq_loc &loc)
Get literal seq at location.
TSeqPos x_GetInstLength(const CVariation_inst &inst, const CVariantPlacement &p, bool account_for_multiplier)
Compute length of the delta.
string x_AsHgvsExpression(const CVariation &variation, CConstRef< CSeq_id > id, CConstRef< CSeq_literal > asserted_seq)
Construct HGVS expression for a variation: use first VariantPlacement, or, if id is specified,...
string x_SeqLiteralToStr(const CSeq_literal &literal, bool translate, bool is_mito)
translate=true will translate nucleotide literal to prot as appropriate.
string x_PlacementCoordsToStr(const CVariantPlacement &vp)
static const CVariation::TPlacements * s_GetPlacements(const CVariation &v)
static TSeqPos s_GetLength(const CVariantPlacement &p, CScope *scope)
std::ofstream out("events_result.xml")
main entry point for tests
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const CVect2< U > & v2
Definition: globals.hpp:440
@ eUnknown
Definition: app_popup.hpp:72
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ fSortAndMerge_All
Definition: Seq_loc.hpp:334
@ fMerge_SingleRange
Definition: Seq_loc.hpp:332
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
TSeqPos GetStop(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the stop of the location.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
TSeqPos GetStart(const CSeq_loc &loc, CScope *scope, ESeqLocExtremes ext=eExtreme_Positional)
If only one CBioseq is represented by CSeq_loc, returns the position at the start of the location.
CRef< CSeq_loc > Seq_loc_Merge(const CSeq_loc &loc, CSeq_loc::TOpFlags flags, CScope *scope)
Merge ranges in the seq-loc.
CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Add two seq-locs.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
const CBioSource * GetBioSource(const CBioseq &bioseq)
Retrieve the BioSource object for a given bioseq handle.
Definition: sequence.cpp:104
string GetAccessionForGi(TGi gi, CScope &scope, EAccessionVersion use_version=eWithAccessionVersion, EGetIdType flags=0)
Retrieve the accession for a given GI.
Definition: sequence.cpp:686
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
@ fIs5PrimePartial
= 0x4 Translate first codon even if not start codon (because sequence is 5' partial)
Definition: sequence.hpp:984
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
const CSeqFeatData & GetData(void) const
TInst_Length GetInst_Length(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
const CSeq_loc & GetLocation(void) const
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
const_iterator begin(void) const
Definition: seq_vector.hpp:298
const_iterator end(void) const
Definition: seq_vector.hpp:305
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NcbiCerr
Definition: ncbistre.hpp:544
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
static string LongToString(long value, TNumToStringFlags flags=0, int base=10)
Convert Int to string.
Definition: ncbistr.hpp:5140
#define NPOS
Definition: ncbistr.hpp:133
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsLim(void) const
Check if variant Lim is selected.
Definition: Int_fuzz_.hpp:636
TLim GetLim(void) const
Get the variant data.
Definition: Int_fuzz_.hpp:642
TMin GetMin(void) const
Get the Min member data.
Definition: Int_fuzz_.hpp:519
bool IsRange(void) const
Check if variant Range is selected.
Definition: Int_fuzz_.hpp:603
TMax GetMax(void) const
Get the Max member data.
Definition: Int_fuzz_.hpp:472
const TRange & GetRange(void) const
Get the variant data.
Definition: Int_fuzz_.cpp:159
@ eLim_gt
greater than
Definition: Int_fuzz_.hpp:211
@ eLim_lt
less than
Definition: Int_fuzz_.hpp:212
@ eLim_tl
space to left of position
Definition: Int_fuzz_.hpp:214
@ eLim_tr
space to right of position
Definition: Int_fuzz_.hpp:213
bool IsCdregion(void) const
Check if variant Cdregion is selected.
const TFuzz_from & GetFuzz_from(void) const
Get the Fuzz_from member data.
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
const TPnt & GetPnt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:238
TPoint GetPoint(void) const
Get the Point member data.
Definition: Seq_point_.hpp:303
const TFuzz_to & GetFuzz_to(void) const
Get the Fuzz_to member data.
TFrom GetFrom(void) const
Get the From member data.
bool IsSetFuzz(void) const
Check if a value has been assigned to Fuzz data member.
Definition: Seq_point_.hpp:408
const TFuzz & GetFuzz(void) const
Get the Fuzz member data.
Definition: Seq_point_.hpp:420
bool IsSetFuzz_to(void) const
Check if a value has been assigned to Fuzz_to data member.
TTo GetTo(void) const
Get the To member data.
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
bool IsSetFuzz_from(void) const
Check if a value has been assigned to Fuzz_from data member.
bool IsPnt(void) const
Check if variant Pnt is selected.
Definition: Seq_loc_.hpp:540
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
bool IsNcbipna(void) const
Check if variant Ncbipna is selected.
Definition: Seq_data_.hpp:604
bool IsNcbieaa(void) const
Check if variant Ncbieaa is selected.
Definition: Seq_data_.hpp:644
bool IsNcbi8aa(void) const
Check if variant Ncbi8aa is selected.
Definition: Seq_data_.hpp:624
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
bool IsIupacaa(void) const
Check if variant Iupacaa is selected.
Definition: Seq_data_.hpp:524
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
bool IsNcbistdaa(void) const
Check if variant Ncbistdaa is selected.
Definition: Seq_data_.hpp:684
bool IsNcbi4na(void) const
Check if variant Ncbi4na is selected.
Definition: Seq_data_.hpp:564
bool IsNcbi8na(void) const
Check if variant Ncbi8na is selected.
Definition: Seq_data_.hpp:584
TLength GetLength(void) const
Get the Length member data.
const TNcbieaa & GetNcbieaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:650
bool IsNcbipaa(void) const
Check if variant Ncbipaa is selected.
Definition: Seq_data_.hpp:664
bool IsSetSeq_data(void) const
May have the data. Check if a value has been assigned to Seq_data data member.
bool IsNcbi2na(void) const
Check if variant Ncbi2na is selected.
Definition: Seq_data_.hpp:544
bool IsIupacna(void) const
Check if variant Iupacna is selected.
Definition: Seq_data_.hpp:504
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
TMol GetMol(void) const
Get the Mol member data.
bool IsSetStop_offset_fuzz(void) const
Check if a value has been assigned to Stop_offset_fuzz data member.
list< CRef< CVariantPlacement > > TPlacements
list< CRef< CVariation > > TVariations
Definition: Variation_.hpp:158
const TStop_offset_fuzz & GetStop_offset_fuzz(void) const
Get the Stop_offset_fuzz member data.
bool IsInstance(void) const
Check if variant Instance is selected.
const TSet & GetSet(void) const
Get the variant data.
Definition: Variation_.cpp:220
bool IsSetSeq(void) const
for situations in which a raw location isn't sufficient Check if a value has been assigned to Seq dat...
const TInstance & GetInstance(void) const
Get the variant data.
Definition: Variation_.cpp:198
bool IsSetStart_offset_fuzz(void) const
Check if a value has been assigned to Start_offset_fuzz data member.
const TVariations & GetVariations(void) const
Get the Variations member data.
TStop_offset GetStop_offset(void) const
Get the Stop_offset member data.
const TSeq & GetSeq(void) const
Get the Seq member data.
const TData & GetData(void) const
Get the Data member data.
void SetMol(TMol value)
Assign a value to Mol data member.
bool IsSetStop_offset(void) const
Check if a value has been assigned to Stop_offset data member.
const TStart_offset_fuzz & GetStart_offset_fuzz(void) const
Get the Start_offset_fuzz member data.
bool IsSetStart_offset(void) const
location refinements, describing offsets into introns from product coordinates.
const TLoc & GetLoc(void) const
Get the Loc member data.
bool IsSet(void) const
Check if variant Set is selected.
void SetLoc(TLoc &value)
Assign a value to Loc data member.
TType GetType(void) const
Get the Type member data.
TStart_offset GetStart_offset(void) const
Get the Start_offset member data.
@ eData_set_type_genotype
changes on different alleles in the same genotype, e.g. g.[476C>T]+[476C>T]
Definition: Variation_.hpp:143
@ eData_set_type_chimeric
e.g. c.[1C>T//2G>T]
Definition: Variation_.hpp:149
@ eData_set_type_alleles
set represents a set of observed alleles
Definition: Variation_.hpp:147
@ eData_set_type_package
set represents a package of observations at a given location, generally containing asserted + referen...
Definition: Variation_.hpp:148
@ eData_set_type_compound
complex change at the same location on the same molecule
Definition: Variation_.hpp:140
@ eData_set_type_individual
same organism; allele relationship unknown, e.g. g.[476C>T(+)183G>C]
Definition: Variation_.hpp:145
@ eData_set_type_products
different products arising from the same variation in a precursor, e.g. r.[13g>a, 13_88del]
Definition: Variation_.hpp:141
@ eData_set_type_haplotype
changes on the same allele, e.g. r.[13g>a;15u>c]
Definition: Variation_.hpp:142
@ eData_set_type_mosaic
different genotypes in the same individual
Definition: Variation_.hpp:144
@ eMol_cdna
"c." coordinates in HGVS
@ eMol_mitochondrion
"mt." coordinates in HGVS
@ eMol_rna
"n." coordinates in HGVS
@ eMol_protein
"p." coordinates in HGVS
@ eMol_genomic
"g." coordinates in HGVS
TType GetType(void) const
Get the Type member data.
const TLoc & GetLoc(void) const
Get the variant data.
const TDelta & GetDelta(void) const
Get the Delta member data.
const TSeq & GetSeq(void) const
Get the Seq member data.
const TLiteral & GetLiteral(void) const
Get the variant data.
TMultiplier GetMultiplier(void) const
Get the Multiplier member data.
list< CRef< CDelta_item > > TDelta
bool IsLiteral(void) const
Check if variant Literal is selected.
bool IsSetMultiplier(void) const
Multiplier allows representing a tandem, e.g.
bool IsThis(void) const
Check if variant This is selected.
bool IsLoc(void) const
Check if variant Loc is selected.
@ eType_snv
delta=[morph of length 1] NOTE: this is snV not snP; the latter requires frequency-based validation t...
@ eType_inv
delta=[del, ins.seq= RevComp(variation-location)]
@ eType_mnp
delta=[morph of length >1]
@ eType_microsatellite
delta=[del, ins.seq= repeat-unit with fuzzy multiplier] variation-location is the microsat expansion ...
@ eType_delins
delta=[del, ins]
@ eType_prot_nonsense
delta=[del]; variation-location is the tail of the protein being truncated
@ eType_transposon
delta=[del, ins.seq= known donor or 'this'] variation-location is equiv of transposon locs.
@ eType_prot_silent
delta=[morph of length 1, same AA as at variation-location]
@ eType_prot_missense
delta=[morph of length 1]
@ eType_prot_neutral
delta=[morph of length 1]
@ eAction_ins_before
insert seq before the location.start
@ eObservation_variant
inst represent the observed variant at a given position
@ eObservation_asserted
inst represents the asserted base at a position
@ eObservation_reference
inst represents the reference base at the position
int i
int len
string Ncbieaa2HgvsAA(const string &prot_str)
bool ContainsAnyFuzz(CConstRef< CVariantPlacement > p)
bool IsMitochondrion(CBioseq_Handle bsh)
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
Int4 delta(size_t dimension_, const Int4 *score_)
SAnnotSelector –.
Definition: inftrees.h:24
Definition: type.c:6
Calculate upstream (first) and downstream(second) flanks for loc.
Modified on Sat Dec 02 09:19:33 2023 by modify_doxy.py rev. 669887