NCBI C++ ToolKit
hgvs_parser.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: hgvs_parser.hpp 102847 2024-07-30 14:24:30Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Description:
27 *
28 * Translate HGVS expression to Variation-ref seq-feats.
29 * HGVS nomenclature rules: http://www.hgvs.org/mutnomen/
30 *
31 * ===========================================================================
32 */
33 
34 #ifndef HGVSPARSER_HPP_
35 #define HGVSPARSER_HPP_
36 
37 #include <corelib/ncbiobj.hpp>
38 #include <corelib/ncbistd.hpp>
39 
40 #include <boost/version.hpp>
41 #if BOOST_VERSION >= 103800
42 #include <boost/spirit/include/classic.hpp>
43 #include <boost/spirit/include/classic_core.hpp>
44 #include <boost/spirit/include/classic_ast.hpp>
45 #include <boost/spirit/include/classic_parse_tree.hpp>
46 #include <boost/spirit/include/classic_tree_to_xml.hpp>
47 
48 using namespace BOOST_SPIRIT_CLASSIC_NS;
49 #else
50 //older boost
51 #include <boost/spirit.hpp>
52 #include <boost/spirit/core.hpp>
53 #include <boost/spirit/tree/ast.hpp>
54 #include <boost/spirit/tree/parse_tree.hpp>
55 #include <boost/spirit/tree/tree_to_xml.hpp>
56 using namespace boost::spirit;
57 #endif
58 
64 #include <objects/seq/Seq_data.hpp>
65 
69 
70 #include <objmgr/scope.hpp>
71 #include <objmgr/feat_ci.hpp>
72 #include <objmgr/seq_vector.hpp>
73 
74 
77 
78 #define HGVS_THROW(err_code, message) NCBI_THROW(CHgvsParser::CHgvsParserException, err_code, message)
79 
80 namespace variation_ref {
81 
82 class CHgvsParser : public CObject
83 {
84 public:
86  : m_scope(&scope)
87  {}
88 
89  enum EOpFlags
90  {
91 fOpFlags_RelaxedAA = 1 << 0, ///< try assuming all-uppercase three-letter AA representation
92  fOpFlags_Default = fOpFlags_RelaxedAA
93  };
94  typedef int TOpFlags;
95 
96 
97  CRef<CSeq_feat> AsVariationFeat(const string& hgvs_expression, TOpFlags = fOpFlags_Default);
98  string AsHgvsExpression(const CSeq_feat& feat);
99 
101  {
102  return *m_scope;
103  }
104 
106  {
107  public:
108  enum EErrCode {
109  eLogic, ///<Problem with the code
110  eGrammatic, ///<Expression is not a valid language
111  eSemantic, ///<Expression is invalid in some way
112  eContext, ///<Some problem with context
113  eAlignment, ///<Some problem with getting alignment
114  ePrecondition, ///<Precondition is not met
116  };
117 
118  virtual const char* GetErrCodeString(void) const
119  {
120  switch(GetErrCode()) {
121  case eLogic: return "eLogic";
122  case eGrammatic: return "eGrammatic";
123  case eSemantic: return "eSemantic";
124  case eContext: return "eContext";
125  case eAlignment: return "eAlignment";
126  case ePrecondition: return "ePrecondition";
127  case eOther: return "eOther";
128  default: return CException::GetErrCodeString();
129  }
130  }
132  };
133 
134 
135 protected:
136 
137  struct SFuzzyInt
138  {
140  {
141  Reset();
142  }
143 
144  void Assign(const SFuzzyInt& other) {
145  value = other.value;
146  if(!other.fuzz) {
147  fuzz.Reset();
148  } else {
149  if(!fuzz) {
150  fuzz.Reset(new CInt_fuzz);
151  }
152  fuzz->Assign(*other.fuzz);
153  }
154  }
155 
156  void Reset()
157  {
158  value = 0;
159  fuzz.Reset();
160  }
161 
162  long value;
163  CRef<CInt_fuzz> fuzz; //can be null;
164  };
165 
167  {
169  {
170  Reset();
171  }
172 
173  bool IsOffset() const {
174  return offset.value || offset.fuzz;
175  }
176 
177  void Reset()
178  {
179  pnt.Reset();
180  offset.Reset();
181  }
182 
183  void Assign(const SOffsetPoint& other)
184  {
185  offset.Assign(other.offset);
186  if(!other.pnt) {
187  pnt.Reset();
188  } else {
189  if(!pnt) {
190  pnt.Reset(new CSeq_point);
191  }
192  pnt->Assign(*other.pnt);
193  }
194  }
195 
199  };
200 
201  struct SOffsetLoc
202  {
204  {
205  Reset();
206  }
207 
208  void Reset()
209  {
210  loc.Reset();
211  start_offset.Reset();
212  stop_offset.Reset();
213  }
214 
215  void Assign(const SOffsetLoc& other)
216  {
217  start_offset.Assign(other.start_offset);
218  stop_offset.Assign(other.stop_offset);
219  if(!other.loc) {
220  loc.Reset();
221  } else {
222  if(!loc) {
223  loc.Reset(new CSeq_loc);
224  }
225  loc->Assign(*other.loc);
226  }
227  }
228 
229  bool IsOffset() const
230  {
231  return start_offset.value || start_offset.value || stop_offset.fuzz || stop_offset.fuzz;
232  }
233 
234  TSeqPos GetLength() const;
235 
240  };
241 
242  /*!
243  * CContext encapsulates sequence or location context for an hgvs sub-expression.
244  * E.g. given an expression id:c.5_10delinsAT, when creating a variation-ref
245  * for delinsAT the context will refer to sequence "id" and location "5_10"
246  */
247  class CContext
248  {
249  public:
251  : m_scope(scope)
252  {
253  Clear();
254  }
255 
256  CContext(const CContext& other)
257  {
258  *this = other; //shallow copy will suffice.
259  }
260 
261  enum EMolType {
267  eMol_mt
268  };
269 
270  void Clear()
271  {
272  m_bsh.Reset();
273  m_mol_type = eMol_not_set;
274  m_cds.Reset();
275  m_seq_id.Reset();
276  m_loc.Reset();
277  }
278 
280  {
281  return m_bsh.GetBioseqLength();
282  }
283 
284  /*!
285  * Clear the context and reset it for the given seq-id.
286  * If the sequence is cdna and we're working with "c." coordinates,
287  * also find the CDS, as the coordinates are (start|stop)codon-relative.
288  */
289  void SetId(const CSeq_id& id, EMolType mol_type);
290 
291  void Validate(const CSeq_literal& literal) const
292  {
293  if(!m_loc.IsOffset()) {
294  //Can only validate normal locs, as with offset loc the asserted
295  //allele does not correspond to the base loc.
296  Validate(literal, GetLoc());
297  } else {
298  //LOG_POST("Ignoring validation of literal due to offset location");
299  }
300  }
301 
302  void Validate(const CSeq_literal& literal, const CSeq_loc& loc) const;
303 
304  void SetLoc(const SOffsetLoc& loc)
305  {
306  m_loc.Assign(loc);
307  }
308 
309  bool IsSetLoc() const
310  {
311  return !m_loc.loc.IsNull();
312  }
313 
314  CScope& GetScope() const
315  {
316  return *m_scope;
317  }
318 
319  const CSeq_loc& GetLoc() const;
320 
321  const SOffsetLoc& GetOffsetLoc() const;
322 
323  const CSeq_id& GetId() const;
324 
325  const CSeq_feat& GetCDS() const;
326 
327  EMolType GetMolType(bool check=true) const;
328 
329 
330  private:
337  };
338 
339  struct SGrammar: public grammar<SGrammar>
340  {
341  /*!
342  * Deviations from the recommendations:
343  *
344  * Flattened compound lists are not supported (a list must have same delimiter type)
345  * [[[a,b];c;f](+)[d,e]] - OK
346  * [a,b;c;f(+)d,e] - NOT OK
347  *
348  * No mixing of different expression types within a list.
349  * Note: this example is not mentioned in specifications, but found it in Mutalyzer docs.
350  * AB026906:c.[1del;6_7insoAL449423.14(CDKN2A):c.[1_10;5del]]
351  * 1_10 specifies a location
352  * 5del specifies a variation instance
353  * [1_10;5del] is not a valid expression
354  *
355  * Only seq-id header containing actual seq-id (gi or acc.ver) are supported:
356  * Y : NM_004006.1:c.3G>T - uses a GenBank file as indicator
357  N : GJB2:c.76A>C - uses a HGNC-approved gene symbol as indicator
358  Y : DMD{NM_004006.1}:c.3G>T - uses both a HGNC-approved gene symbol and a GenBank file as indicator
359  N : chrX:g.32,218,983_32,984,039del - uses a chromosome indicator (here X)
360  N : rs2306220:A>G - using a dbSNP-identifier as indicator
361  N : DXS1219:g.CA[18] (or AFM297yd1:g.CA[18]) - uses marker DXS1219 / AFM297yd1 as indicator
362  Note: cases below are not described in recommendations, but found in Mutalyzer docs.
363  N : AL449423.14(CDKN2A_v003):g.10del - not in symbol{seq_id} format
364  N : CDKN2A_v003{AL449423.14}:c.1_*3352del cDNA coordinates on a genomic sequence.
365  *
366  */
367 
368  enum E_NodeIds {
369  eID_NONE = 0,
410  eID_prot_ext
411  };
412 
413 
414  typedef std::map<parser_id, std::string> TRuleNames;
417  {
418  TRuleNames& m_ = s_rule_names.Get();
419  if(s_rule_names->size() == 0) {
420  m_[eID_NONE] = "NONE";
421  m_[eID_root] = "root";
422  m_[eID_list1a] = "list1a";
423  m_[eID_list2a] = "list2a";
424  m_[eID_list3a] = "list3a";
425  m_[eID_list1b] = "list1b";
426  m_[eID_list2b] = "list2b";
427  m_[eID_list3b] = "list3b";
428  m_[eID_expr1] = "expr1";
429  m_[eID_expr2] = "expr2";
430  m_[eID_expr3] = "expr3";
431  m_[eID_translocation] = "translocation";
432  m_[eID_header] = "header";
433  m_[eID_location] = "location";
434  m_[eID_mol] = "mol";
435  m_[eID_seq_id] = "seq_id";
436  m_[eID_mut_list] = "mut_list";
437  m_[eID_mut_ref] = "mut_ref";
438  m_[eID_nuc_range] = "nuc_range";
439  m_[eID_prot_range] = "prot_range";
440  m_[eID_mut_inst] = "mut_inst";
441  m_[eID_int_fuzz] = "int_fuzz";
442  m_[eID_abs_pos] = "abs_pos";
443  m_[eID_general_pos] = "general_pos";
444  m_[eID_fuzzy_pos] = "fuzzy_pos";
445  m_[eID_pos_spec] = "pos_spec";
446  m_[eID_raw_seq] = "raw_seq";
447  m_[eID_aminoacid] = "aminoacid";
448  m_[eID_nuc_subst] = "nuc_subst";
449  m_[eID_deletion] = "deletion";
450  m_[eID_insertion] = "insertion";
451  m_[eID_delins] = "delins";
452  m_[eID_duplication] = "duplication";
453  m_[eID_nuc_inv] = "nuc_inv";
454  m_[eID_ssr] = "ssr";
455  m_[eID_conversion] = "conversion";
456  m_[eID_seq_loc] = "seq_loc";
457  m_[eID_seq_ref] = "seq_ref";
458  m_[eID_prot_pos] = "prot_pos";
459  m_[eID_prot_fs] = "prot_fs";
460  m_[eID_prot_missense] = "prot_missense";
461  m_[eID_prot_ext] = "prot_ext";
462  }
463  return s_rule_names.Get();
464  }
465 
466  static const string& s_GetRuleName(parser_id id);
467 
468  template <typename ScannerT>
469  struct definition
470  {
471  rule<ScannerT, parser_context<>, parser_tag<eID_root> > root;
472  rule<ScannerT, parser_context<>, parser_tag<eID_list1a> > list1a;
473  rule<ScannerT, parser_context<>, parser_tag<eID_list2a> > list2a;
474  rule<ScannerT, parser_context<>, parser_tag<eID_list3a> > list3a;
475  rule<ScannerT, parser_context<>, parser_tag<eID_list1b> > list1b;
476  rule<ScannerT, parser_context<>, parser_tag<eID_list2b> > list2b;
477  rule<ScannerT, parser_context<>, parser_tag<eID_list3b> > list3b;
478  rule<ScannerT, parser_context<>, parser_tag<eID_expr1> > expr1;
479  rule<ScannerT, parser_context<>, parser_tag<eID_expr2> > expr2;
480  rule<ScannerT, parser_context<>, parser_tag<eID_expr3> > expr3;
481  rule<ScannerT, parser_context<>, parser_tag<eID_translocation> > translocation;
482  rule<ScannerT, parser_context<>, parser_tag<eID_header> > header;
483  rule<ScannerT, parser_context<>, parser_tag<eID_seq_id> > seq_id;
484  rule<ScannerT, parser_context<>, parser_tag<eID_mol> > mol;
485  rule<ScannerT, parser_context<>, parser_tag<eID_mut_list > > mut_list;
486  rule<ScannerT, parser_context<>, parser_tag<eID_mut_ref> > mut_ref;
487  rule<ScannerT, parser_context<>, parser_tag<eID_mut_inst> > mut_inst;
488  rule<ScannerT, parser_context<>, parser_tag<eID_int_fuzz> > int_fuzz;
489  rule<ScannerT, parser_context<>, parser_tag<eID_abs_pos> > abs_pos;
490  rule<ScannerT, parser_context<>, parser_tag<eID_general_pos> > general_pos;
491  rule<ScannerT, parser_context<>, parser_tag<eID_fuzzy_pos> > fuzzy_pos;
492  rule<ScannerT, parser_context<>, parser_tag<eID_pos_spec> > pos_spec;
493  rule<ScannerT, parser_context<>, parser_tag<eID_location> > location;
494  rule<ScannerT, parser_context<>, parser_tag<eID_nuc_range> > nuc_range;
495  rule<ScannerT, parser_context<>, parser_tag<eID_prot_range> > prot_range;
496  rule<ScannerT, parser_context<>, parser_tag<eID_raw_seq> > raw_seq;
497  rule<ScannerT, parser_context<>, parser_tag<eID_aminoacid> > aminoacid;
498  rule<ScannerT, parser_context<>, parser_tag<eID_nuc_subst> > nuc_subst;
499  rule<ScannerT, parser_context<>, parser_tag<eID_deletion> > deletion;
500  rule<ScannerT, parser_context<>, parser_tag<eID_insertion> > insertion;
501  rule<ScannerT, parser_context<>, parser_tag<eID_delins> > delins;
502  rule<ScannerT, parser_context<>, parser_tag<eID_duplication> > duplication;
503  rule<ScannerT, parser_context<>, parser_tag<eID_nuc_inv> > nuc_inv;
504  rule<ScannerT, parser_context<>, parser_tag<eID_ssr> > ssr;
505  rule<ScannerT, parser_context<>, parser_tag<eID_conversion> > conversion;
506  rule<ScannerT, parser_context<>, parser_tag<eID_seq_loc> > seq_loc;
507  rule<ScannerT, parser_context<>, parser_tag<eID_seq_ref> > seq_ref;
508  rule<ScannerT, parser_context<>, parser_tag<eID_prot_pos> > prot_pos;
509  rule<ScannerT, parser_context<>, parser_tag<eID_prot_missense> > prot_missense;
510  rule<ScannerT, parser_context<>, parser_tag<eID_prot_ext> > prot_ext;
511  rule<ScannerT, parser_context<>, parser_tag<eID_prot_fs> > prot_fs;
512 
514  {
515  aminoacid = str_p("Ala")
516  | str_p("Asx")
517  | str_p("Cys")
518  | str_p("Asp")
519  | str_p("Glu")
520  | str_p("Phe")
521  | str_p("Gly")
522  | str_p("His")
523  | str_p("Ile")
524  | str_p("Lys")
525  | str_p("Leu")
526  | str_p("Met")
527  | str_p("Asn")
528  | str_p("Pro")
529  | str_p("Gln")
530  | str_p("Arg")
531  | str_p("Ser")
532  | str_p("Thr")
533  | str_p("Val")
534  | str_p("Trp")
535  | str_p("Tyr")
536  | str_p("Glx")
537  | chset<>("XARNDCEQGHILKMFPSTWYV") //"X" as in p.X110GlnextX17
538  ;
539 
540  raw_seq = leaf_node_d[+aminoacid | +chset<>("ACGTN") | +chset<>("acgun")];
541  /*
542  * Note: there's no separation between protein, DNA and RNA sequences, as
543  * at the parse time it is not known without context which sequence it is - i.e.
544  * "AC" could be a dna sequence, or AlaCys. Hence, raw_seq will match any sequence
545  * and at the parse-tree transformation stage we'll create proper seq-type as
546  * the mol will be known then.
547  *
548  * Note: +aminoacid precedes +chset<>("ACGTN") so that we don't prematurely match T in Trp etc.
549  */
550 
551 
552  /*
553  * Positions and Locations
554  */
555 
556  int_fuzz = ch_p('(') >> (ch_p('?')|int_p) >> ch_p('_') >> (ch_p('?')|int_p) >> ch_p(')')
557 | ch_p('(') >> int_p >> ch_p(')') //note: not ch_p('?') here as this makes grammar ambiguous
558  | (ch_p('?')|int_p);
559 
560  abs_pos = !ch_p('*') >> int_fuzz;
561  //Note: '*' means the location is CDS-stop-relative; cdna context required.
562  //Otherwise it is CDS-start-relative iff in cdna-context.
563 
564  general_pos = (str_p("IVS") >> int_p | abs_pos) >> sign_p >> int_fuzz
565  | abs_pos;
566  //Warning: offset-pos must be followed by a sign because otherwise
567  //it is ambiguous whether c.123(1_2) denotes a SSR or a fuzzy intronic location
568 
569 
570  fuzzy_pos = discard_node_d[ch_p('(')]
571  >> general_pos
572  >> discard_node_d[ch_p('_')]
573  >> general_pos
574  >> discard_node_d[ch_p(')')];
575 
576  pos_spec = general_pos //intronic offset-pos
577  | fuzzy_pos //(generalpos_generalpos)
578  | !ch_p('o') >> header >> pos_spec; //far-loc, as in id:c.88+101_id2:c.355-1045del
579 
580 
581  prot_pos = raw_seq >> pos_spec; //raw_seq must be of length one (single aminoacid)
582 
583  prot_range = prot_pos >> discard_node_d[ch_p('_')] >> prot_pos;
584 
585  nuc_range = pos_spec >> discard_node_d[ch_p('_')] >> pos_spec;
586 
587  location = nuc_range | pos_spec | prot_range | prot_pos;
588  //Note that this describes "local" coordinates within a sequence context, not a seq-loc
589 
590  /*
591  * Seq-ids and seq-locs
592  */
593  seq_id = leaf_node_d[alpha_p >> +(alnum_p | chset<>("._-|"))];
594 
595  mol = str_p("mt") | chset<>("gcrpm"); //note: for 'mt.' also supporting 'm.'
596 
597  header = seq_id
598  >> !(discard_node_d[ch_p('{')]
599  >> seq_id
600  >> discard_node_d[ch_p('}')])
601  >> discard_node_d[ch_p(':')]
602  >> mol
603  >> discard_node_d[ch_p('.')];
604  /*
605 * At the parsing stage we'll not require that the mutations for
606  * different mol-types don't mix, i.e. protein mutation type for a
607  * mol-type "c." - this will be deferred to the semantic check that will
608  * validate the parsed expression. The reason is that the g/c/r/p specs mostly
609  * overlap, so we can avoid exploding the grammar.
610  */
611 
612  seq_loc = !ch_p('o') >> header >> location;
613 
614  seq_ref = seq_loc //far-loc
615  | (nuc_range|prot_range) //local-loc of range-type, e.g. c.17_18ins5_16 http://www.hgvs.org/mutnomen/FAQ.html
616  //This is to exclude point-locs (see below)
617  | raw_seq //literal sequence
618  | int_fuzz; //unknown sequence of some length
619  // WARNING: this is ambiguous WRT local-loc!
620  // e.g. p.Glu5Valins2fsX3 - 2 in ins2 indicates sequence of length two, NOT the sequence at position 2.
621  // Hence, the local-locs above must be specified via range-types only.
622 
623  nuc_subst = raw_seq >> ch_p('>') >> raw_seq; //semantic check: must be of length 1
624 
625  deletion = str_p("del") >> !(raw_seq | int_p);
626 
627  duplication = str_p("dup") >> !seq_ref;
628 
629  insertion = str_p("ins") >> seq_ref;
630 
631  conversion = str_p("con") >> seq_loc;
632 
633  delins = str_p("del") >> !raw_seq >> str_p("ins") >> seq_ref;
634 
635  nuc_inv = str_p("inv") >> !int_p;
636 
637  ssr = !raw_seq >> ( int_fuzz - (ch_p('?')|int_p) //don't want to interpret 'id:5_6?' as ssr
638  | list_p(discard_node_d[ch_p('[')]
639  >> int_p
640  >> discard_node_d[ch_p(']')],
641  discard_node_d[ch_p('+')]));
642  /*
643  * Note: It is not correct to match [5] and [3] in NM_000815.2:c.101TC[5]+[3]
644  * individually as list3a within c.101 location context, because
645  * the location-spec for an ssr may point to the first repeat-unit base only,
646  * in which case we need to calculate the actual range-loc based on the sequence literal,
647  * and when processing "[3]" later the "TC" literal will not be in context,
648  * and it will appear as if the context is NM_000815.2:c.101 rather than c.101_102.
649  *
650  * Hence, the ssr rule will have to consume the list of multipliers [5]+[3]
651  * and generate compound variation-ref "manually"
652  */
653 
654 
655  prot_fs = str_p("fs") >> !(ch_p('X') >> int_p);
656 
657  prot_ext = (str_p("extMet") | str_p("extX")) >> int_p;
658 
659  prot_missense = aminoacid;
660 
661  //ISCN expression followed by a seq-loc
662  translocation = str_p("t(")
663  >> leaf_node_d[*(print_p - ch_p('(') - ch_p(')'))]
664  >> str_p(")(")
665  >> leaf_node_d[*(print_p - ch_p('(') - ch_p(')'))]
666  >> str_p(")")
667  >> seq_loc;
668 
669  mut_inst = ch_p('?') //can exist within +mut_inst, e.g. p.Met1?extMet-5
670  | ch_p('=')
671  | delins //delins must precede del
672  | deletion
673  | insertion
674  | duplication
675  | nuc_subst
676  | nuc_inv
677  | ssr
678  | conversion
679  | prot_fs
680  | prot_missense
681  | prot_ext //this may occur inside location context, e.g. p.Met1ValextMet-12
682  ;
683  //Note: '?' and '=' can exist both within a location context
684  //(i.e. expr3) or within a sequence context (i.e. expr2)
685  //additionally, prot_ext may exist as expr2 (outside of location context) as well.
686 
687 
688  root = list_p(expr1, ch_p('+'));
689  //At the root level the '+'-delimited expressions are not required to be bracketed, i.e.
690  //NM_004004.2:c.[35delG]+NM_006783.1:c.[689_690insT] instead of
691  //[NM_004004.2:c.35delG]+[NM_006783.1:c.689_690insT]
692 
693 
694  expr1 = ch_p('(') >> expr1 >> ch_p(')')
695  | list1a
696  | header >> expr2
697  | translocation;
698  list1a = list_p(discard_node_d[ch_p('[')] >> list1b >> discard_node_d[ch_p(']')], ch_p('+'));
699  list1b = list_p(expr1, chset<>(",;") | str_p("(+)"));
700 
701 
702  expr2 = ch_p('(') >> expr2 >> ch_p(')')
703  | list2a
704 | str_p("0?") //note: precedes location>>expr3 such that not matched as unknown variation at pos=0 here
705  | ch_p('0')
706  | location >> expr3
707  | prot_ext //can also exist within location context (mut_inst)
708  | ch_p('?') //note: follows location>>expr3 such that variation at unknown pos is not partially-matched as unknown variation
709  | ch_p('=');
710  list2a = list_p(discard_node_d[ch_p('[')] >> list2b >> discard_node_d[ch_p(']')], ch_p('+'));
711  list2b = list_p(expr2, chset<>(",;") | str_p("(+)"));
712 
713 
714  expr3 = ch_p('(') >> expr3 >> ch_p(')')
715  | list3a
716  | +mut_inst;
717  /*
718  * Note: Multiple mut_insts that are not delimited, i.e.
719  * abc instead of [a;b;c] are legit, e.g.
720  *
721  * a) p.X110GlnextX17
722  * b) NM_012345.3:c.123+45_123+51dupinsAB012345.3:g.393_1295
723  */
724  list3a = list_p(discard_node_d[ch_p('[')] >> list3b >> discard_node_d[ch_p(']')], ch_p('+'));
725  list3b = list_p(expr3, chset<>(",;") | str_p("(+)"));
726 
727  //BOOST_SPIRIT_DEBUG_RULE(expr1);
728  }
729 
730  rule<ScannerT, parser_context<>, parser_tag<eID_root> > const& start() const
731  {
732  return root;
733  }
734  };
735 
736  static bool s_is_list(parser_id id)
737  {
738  return id == SGrammar::eID_list1a
739  || id == SGrammar::eID_list2a
740  || id == SGrammar::eID_list3a
741  || id == SGrammar::eID_list1b
742  || id == SGrammar::eID_list2b
743  || id == SGrammar::eID_list3b
744  || id == SGrammar::eID_root;
745  }
746  };
748 
749 
750 private:
751  typedef tree_match<char const*> TParseTreeMatch;
752  typedef TParseTreeMatch::const_tree_iterator TIterator;
755 
756 
757  static SFuzzyInt x_int_fuzz (TIterator const& i, const CContext& context);
758 
759  static CRef<CSeq_point> x_abs_pos (TIterator const& i, const CContext& context);
760  static SOffsetPoint x_general_pos (TIterator const& i, const CContext& context);
761  static SOffsetPoint x_fuzzy_pos (TIterator const& i, const CContext& context);
762  static SOffsetPoint x_pos_spec (TIterator const& i, const CContext& context);
763  static SOffsetPoint x_prot_pos (TIterator const& i, const CContext& context);
764 
765  static SOffsetLoc x_range (TIterator const& i, const CContext& context);
766  static SOffsetLoc x_location (TIterator const& i, const CContext& context);
767 
768  static CRef<CSeq_loc> x_seq_loc (TIterator const& i, const CContext& context);
769  static CRef<CSeq_literal> x_raw_seq (TIterator const& i, const CContext& context);
770  static TDelta x_seq_ref (TIterator const& i, const CContext& context);
771  static CRef<CVariation_ref> x_identity (const CContext& context);
772  static CRef<CVariation_ref> x_delins (TIterator const& i, const CContext& context);
773  static CRef<CVariation_ref> x_deletion (TIterator const& i, const CContext& context);
774  static CRef<CVariation_ref> x_insertion (TIterator const& i, const CContext& context);
775  static CRef<CVariation_ref> x_duplication (TIterator const& i, const CContext& context);
776  static CRef<CVariation_ref> x_nuc_subst (TIterator const& i, const CContext& context);
777  static CRef<CVariation_ref> x_nuc_inv (TIterator const& i, const CContext& context);
778  static CRef<CVariation_ref> x_ssr (TIterator const& i, const CContext& context);
779  static CRef<CVariation_ref> x_conversion (TIterator const& i, const CContext& context);
780  static CRef<CVariation_ref> x_prot_ext (TIterator const& i, const CContext& context);
781  static CRef<CVariation_ref> x_prot_missense (TIterator const& i, const CContext& context);
782  static CRef<CVariation_ref> x_translocation (TIterator const& i, const CContext& context);
783  static CRef<CVariation_ref> x_mut_inst (TIterator const& i, const CContext& context);
784  static CRef<CVariation_ref> x_expr1 (TIterator const& i, const CContext& context);
785  static CRef<CVariation_ref> x_expr2 (TIterator const& i, const CContext& context);
786  static CRef<CVariation_ref> x_expr3 (TIterator const& i, const CContext& context);
787  static CRef<CVariation_ref> x_prot_fs (TIterator const& i, const CContext& context);
788  static CRef<CVariation_ref> x_list (TIterator const& i, const CContext& context);
789  static CContext x_header (TIterator const& i, const CContext& context);
790  static CRef<CSeq_feat> x_root (TIterator const& i, const CContext& context);
791 
792  static CRef<CVariation_ref> x_unwrap_iff_singleton(CVariation_ref& v);
793 
794 
795  ///Convert HGVS amino-acid code to ncbieaa
796  static string s_hgvsaa2ncbieaa(const string& hgvsaa);
797 
798  ///Convert non-HGVS compliant all-uppercase AAs to UpLow, e.g. ILECYS ->IleCys
799  static string s_hgvsUCaa2hgvsUL(const string& hgvsaa);
800 
801 private:
802  //functions to create hgvs expression from a variation-ref
803 
804 /// variation must have seq-loc specified
805  string x_AsHgvsExpression(const CVariation_ref& variation,
806  const CSeq_loc& parent_loc, //if variation has seq-loc set, it will be used instead.
807  bool is_top_level);
808 
809 
810  //Calculate the length of the inst, multipliers taken into account.
811  TSeqPos x_GetInstLength(const CVariation_inst& inst, const CSeq_loc& this_loc);
812 
813  string x_GetInstData(const CVariation_inst& inst, const CSeq_loc& this_loc);
814 
815 
816  /// Only subset of insts can be expressed as HGVS, this will throw otherwise.
817  /// loc is required to capture sequence at loc for snps, e.g. 'A' in 'A>G'
818  ///
819  /// loc is modified to conform to HGVS flavor (on plus strand, modify
820 /// starts/stops for ins or microsatellite locations to conform to HGVS
821 /// convention where it differs from Variation-ref standard)
822  string x_InstToString(const CVariation_inst& inst, CSeq_loc& loc);
823 
824  string x_LocToSeqStr(const CSeq_loc& loc);
825 
826  /*
827  * Convert a loc to hgvs representation.
828  *
829  *
830  * if alignment is provided (spliced-seg), then it is assumed that loc is the
831  * genomic loc but the output should be represented in cdna context.
832  */
833  string x_SeqLocToStr(const CSeq_loc& loc, bool with_header);
834 
835  // force-map: remap via alignment
836  string x_SeqPntToStr(const CSeq_point& pnt, TSeqPos first_pos);
837 
838  /// Convert seq-id to HGVS seq-id header, e.g. "NM_123456.7:c." or "NG_123456.7:p"
839  string x_SeqIdToHgvsHeader(const CSeq_id& id);
840 
841  string x_IntWithFuzzToStr(int value, const CInt_fuzz* fuzz = NULL);
842 
843  //string Delta_itemToStr(const CDelta_item& delta, bool flip_strand);
844 
845  string x_SeqLiteralToStr(const CSeq_literal& literal, bool flip_strand);
846 
847 
848 
850 
851 private:
853 };
854 
855 };
856 
858 
859 
860 #endif /* HGVSPARSER_HPP_ */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
CObject –.
Definition: ncbiobj.hpp:180
T & Get(void)
Create the variable if not created yet, return the reference.
CScope –.
Definition: scope.hpp:92
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CVariation_inst –.
Set of related Variations.
void Validate(const CSeq_literal &literal) const
void SetLoc(const SOffsetLoc &loc)
NCBI_EXCEPTION_DEFAULT(CHgvsParserException, CException)
@ eGrammatic
Expression is not a valid language.
@ eAlignment
Some problem with getting alignment.
@ eSemantic
Expression is invalid in some way.
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
tree_match< char const * > TParseTreeMatch
CVariation_inst::TDelta::value_type TDelta
static CSafeStatic< SGrammar > s_grammar
CVariation_ref::TData::TSet TVariationSet
static CRef< CVariation_ref > s_ProtToCdna(const CVariation_ref &vr, CScope &scope)
TParseTreeMatch::const_tree_iterator TIterator
CHgvsParser(CScope &scope)
Definition: hgvs_parser.hpp:85
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define check(s)
Definition: describecol2.c:21
int offset
Definition: replacements.h:160
static const char location[]
Definition: config.c:97
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NULL
Definition: ncbistd.hpp:225
EErrCode
Error types that an application can generate.
Definition: ncbiexpt.hpp:884
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
int i
USING_SCOPE(objects)
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
int GetLoc(const string &acc, const string &pat, CSeq_loc &loc, CScope &scope)
void Assign(const SFuzzyInt &other)
rule< ScannerT, parser_context<>, parser_tag< eID_insertion > > insertion
rule< ScannerT, parser_context<>, parser_tag< eID_seq_loc > > seq_loc
rule< ScannerT, parser_context<>, parser_tag< eID_seq_ref > > seq_ref
rule< ScannerT, parser_context<>, parser_tag< eID_prot_fs > > prot_fs
rule< ScannerT, parser_context<>, parser_tag< eID_prot_pos > > prot_pos
rule< ScannerT, parser_context<>, parser_tag< eID_prot_range > > prot_range
rule< ScannerT, parser_context<>, parser_tag< eID_list3b > > list3b
rule< ScannerT, parser_context<>, parser_tag< eID_expr1 > > expr1
rule< ScannerT, parser_context<>, parser_tag< eID_nuc_subst > > nuc_subst
rule< ScannerT, parser_context<>, parser_tag< eID_location > > location
rule< ScannerT, parser_context<>, parser_tag< eID_list3a > > list3a
rule< ScannerT, parser_context<>, parser_tag< eID_delins > > delins
rule< ScannerT, parser_context<>, parser_tag< eID_abs_pos > > abs_pos
rule< ScannerT, parser_context<>, parser_tag< eID_prot_ext > > prot_ext
rule< ScannerT, parser_context<>, parser_tag< eID_nuc_inv > > nuc_inv
rule< ScannerT, parser_context<>, parser_tag< eID_ssr > > ssr
rule< ScannerT, parser_context<>, parser_tag< eID_general_pos > > general_pos
rule< ScannerT, parser_context<>, parser_tag< eID_pos_spec > > pos_spec
rule< ScannerT, parser_context<>, parser_tag< eID_header > > header
rule< ScannerT, parser_context<>, parser_tag< eID_mut_ref > > mut_ref
rule< ScannerT, parser_context<>, parser_tag< eID_list2b > > list2b
rule< ScannerT, parser_context<>, parser_tag< eID_root > > const & start() const
rule< ScannerT, parser_context<>, parser_tag< eID_list1b > > list1b
rule< ScannerT, parser_context<>, parser_tag< eID_int_fuzz > > int_fuzz
rule< ScannerT, parser_context<>, parser_tag< eID_expr2 > > expr2
rule< ScannerT, parser_context<>, parser_tag< eID_translocation > > translocation
rule< ScannerT, parser_context<>, parser_tag< eID_fuzzy_pos > > fuzzy_pos
rule< ScannerT, parser_context<>, parser_tag< eID_list1a > > list1a
rule< ScannerT, parser_context<>, parser_tag< eID_prot_missense > > prot_missense
rule< ScannerT, parser_context<>, parser_tag< eID_duplication > > duplication
rule< ScannerT, parser_context<>, parser_tag< eID_mut_list > > mut_list
rule< ScannerT, parser_context<>, parser_tag< eID_deletion > > deletion
rule< ScannerT, parser_context<>, parser_tag< eID_aminoacid > > aminoacid
rule< ScannerT, parser_context<>, parser_tag< eID_seq_id > > seq_id
rule< ScannerT, parser_context<>, parser_tag< eID_list2a > > list2a
rule< ScannerT, parser_context<>, parser_tag< eID_root > > root
rule< ScannerT, parser_context<>, parser_tag< eID_mut_inst > > mut_inst
rule< ScannerT, parser_context<>, parser_tag< eID_expr3 > > expr3
rule< ScannerT, parser_context<>, parser_tag< eID_nuc_range > > nuc_range
rule< ScannerT, parser_context<>, parser_tag< eID_conversion > > conversion
rule< ScannerT, parser_context<>, parser_tag< eID_mol > > mol
rule< ScannerT, parser_context<>, parser_tag< eID_raw_seq > > raw_seq
static CSafeStatic< TRuleNames > s_rule_names
std::map< parser_id, std::string > TRuleNames
static TRuleNames & s_GetRuleNames()
static bool s_is_list(parser_id id)
void Assign(const SOffsetLoc &other)
void Assign(const SOffsetPoint &other)
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Wed Sep 04 15:02:08 2024 by modify_doxy.py rev. 669887