NCBI C++ ToolKit
hgvs_parser2.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: hgvs_parser2.hpp 87983 2019-10-29 03:34:08Z villamar $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Description:
27 *
28 * Translate HGVS expression to Variation-ref seq-feats.
29 * HGVS nomenclature rules: http://www.hgvs.org/mutnomen/
30 *
31 * ===========================================================================
32 */
33 
34 #ifndef HGVSPARSER_HPP_
35 #define HGVSPARSER_HPP_
36 
37 #include <corelib/ncbiobj.hpp>
38 #include <corelib/ncbistd.hpp>
39 
40 #include <boost/version.hpp>
41 #if BOOST_VERSION >= 103800
42 #include <boost/spirit/include/classic.hpp>
43 #include <boost/spirit/include/classic_core.hpp>
44 #include <boost/spirit/include/classic_ast.hpp>
45 #include <boost/spirit/include/classic_parse_tree.hpp>
46 #include <boost/spirit/include/classic_tree_to_xml.hpp>
47 
48 using namespace BOOST_SPIRIT_CLASSIC_NS;
49 #else
50 //older boost
51 #include <boost/spirit.hpp>
52 #include <boost/spirit/core.hpp>
53 #include <boost/spirit/tree/ast.hpp>
54 #include <boost/spirit/tree/parse_tree.hpp>
55 #include <boost/spirit/tree/tree_to_xml.hpp>
56 using namespace boost::spirit;
57 #endif
58 
64 #include <objects/seq/Seq_data.hpp>
65 
70 
71 #include <objmgr/scope.hpp>
72 #include <objmgr/feat_ci.hpp>
73 #include <objmgr/seq_vector.hpp>
74 
76 
78 
79 namespace variation {
80 
82 
83 #define HGVS_THROW(err_code, message) NCBI_THROW(CHgvsParser::CHgvsParserException, err_code, message)
84 
85 
86 class CHgvsParser : public CObject
87 {
88 public:
89  CHgvsParser(CScope& scope, int tax_id = 9606)
90  : m_scope(&scope)
91  {
92  m_seq_id_resolvers.push_back(CRef<CSeq_id_Resolver>(new CSeq_id_Resolver__LRG(scope)));
93  m_seq_id_resolvers.push_back(CRef<CSeq_id_Resolver>(new CSeq_id_Resolver__CCDS(scope)));
94  //m_seq_id_resolvers.push_back(CRef<CSeq_id_Resolver>(new CSeq_id_Resolver__GeneSymbol(scope, tax_id)));
95  (void)tax_id; // FIXME remove this and solve the warnings about unused parameter
96  m_seq_id_resolvers.push_back(CRef<CSeq_id_Resolver>(new CSeq_id_Resolver(scope)));
97  }
98 
99 
100  enum EOpFlags
101  {
102  fOpFlags_Default = 0
103  };
104  typedef int TOpFlags;
105 
106  CRef<CVariation> AsVariation(const string& hgvs_expression, TOpFlags = fOpFlags_Default);
107 
108 
109  string AsHgvsExpression(const CVariation& variation, CConstRef<CSeq_id> seq_id = CConstRef<CSeq_id>(NULL));
110  string AsHgvsExpression(const CVariantPlacement& p);
111 
112  //attach placement-specific HGVS expressions
113  void AttachHgvs(CVariation& v);
114 
115 
117  {
118  return *m_scope;
119  }
120 
121  /// In order of decreasing priority. Last resolver is default catch-all, so
122  /// when adding a custom one don't push_back.
124  {
125  return m_seq_id_resolvers;
126  }
127 
128 
130  {
131  public:
132  enum EErrCode {
133  eLogic, ///<Problem with the code
134  eGrammatic, ///<Expression is not a valid language
135  eSemantic, ///<Expression is invalid in some way
136  eContext, ///<Some problem with context
137  eAlignment, ///<Some problem with getting alignment
138  ePrecondition, ///<Precondition is not met
140  };
141 
142  virtual const char* GetErrCodeString(void) const override
143  {
144  switch(GetErrCode()) {
145  case eLogic: return "eLogic";
146  case eGrammatic: return "eGrammatic";
147  case eSemantic: return "eSemantic";
148  case eContext: return "eContext";
149  case eAlignment: return "eAlignment";
150  case ePrecondition: return "ePrecondition";
151  case eOther: return "eOther";
152  default: return CException::GetErrCodeString();
153  }
154  }
156  };
157 
158 
159 protected:
160 
161  //integer with associated fuzz
162  struct SFuzzyInt
163  {
165  {
166  Reset();
167  }
168 
169  void Assign(const SFuzzyInt& other) {
170  value = other.value;
171  if(!other.fuzz) {
172  fuzz.Reset();
173  } else {
174  if(!fuzz) {
175  fuzz.Reset(new CInt_fuzz);
176  }
177  fuzz->Assign(*other.fuzz);
178  }
179  }
180 
181  void Reset()
182  {
183  value = 0;
184  fuzz.Reset();
185  }
186 
187  void SetPureFuzz()
188  {
189  if(!fuzz) {
190  fuzz.Reset(new CInt_fuzz);
191  }
192  fuzz->SetLim(CInt_fuzz::eLim_other);
193  value = 0;
194  }
195 
196  bool IsPureFuzz() const
197  {
198  return value == 0
199  && fuzz
200  && fuzz->IsLim()
201  && fuzz->GetLim() == CInt_fuzz::eLim_other;
202  }
203 
204  long value;
205  CRef<CInt_fuzz> fuzz; //can be null;
206  // can be null;
207  // value CInt_fuzz::eLim_other indicates
208  // that there's no value, just fuzz, e.g. "?" or "(?)"
209  // CInt_fuzz::eLim_unk corresponds to fuzzy values, e.g. "(5)"
210  };
211 
212  //an hgvs-offset-point (used for pointing into introns from cDNA coordinates, as in NM_1234.5:c.100+10A>G)
214  {
216  {
217  Reset();
218  }
219 
220  bool IsOffset() const {
221  return offset.value || offset.fuzz;
222  }
223 
224  void Reset()
225  {
226  pnt.Reset();
227  offset.Reset();
228  }
229 
230  void Assign(const SOffsetPoint& other)
231  {
232  offset.Assign(other.offset);
233  if(!other.pnt) {
234  pnt.Reset();
235  } else {
236  if(!pnt) {
237  pnt.Reset(new CSeq_point);
238  }
239  pnt->Assign(*other.pnt);
240  }
241  }
242 
246  };
247 
248 
249  /*!
250  * CContext encapsulates parsed sequence or location context for an hgvs sub-expression.
251  * E.g. given an expression NM_12345.6:c.5_10delinsAT, when creating a variation-ref
252  * for delinsAT the context will refer to sequence "NM_12345.6" and location "5_10"
253  */
254  class CContext
255  {
256  public:
257  CContext(CRef<CScope> scope, CSeq_id_Resolver::TResolvers& id_resolvers, const string& hgvs)
258  : m_scope(scope)
259  , m_seq_id_resolvers(id_resolvers)
260  , m_hgvs(hgvs)
261  {
262  Clear();
263  }
264 
265  CContext(const CContext& other);
266 
267  void Clear()
268  {
269  m_bsh.Reset();
270  m_cds.Reset();
271  m_placement.Reset();
272  }
273 
275  {
276  return m_bsh;
277  }
278 
279  /*!
280  * Clear the context and reset it for the given seq-id.
281  * If the sequence is cdna and we're working with "c." coordinates,
282  * also find the CDS, as the coordinates are (start|stop)codon-relative.
283  */
284  void SetId(const CSeq_id& id, CVariantPlacement::TMol mol);
285 
286  const CSeq_id& GetId() const;
287 
289  {
290  if(!m_placement) {
291  m_placement.Reset(new CVariantPlacement);
292  m_placement->SetLoc().SetNull();
293  m_placement->SetMol(CVariantPlacement::eMol_unknown);
294  }
295  return *m_placement;
296  }
297 
299  {
300  return *m_placement;
301  }
302 
303  bool IsSetPlacement() const
304  {
305  return !m_placement.IsNull();
306  }
307 
308  CScope& GetScope() const
309  {
310  return *m_scope;
311  }
312 
313  const CSeq_feat& GetCDS() const;
314 
315  CSeq_id_Handle ResolevSeqId(const string& s) const
316  {
317  return CSeq_id_Resolver::s_Get(m_seq_id_resolvers, s);
318  }
319 
320  const string& GetHgvs() const
321  {
322  return m_hgvs;
323  }
324  private:
330  const string& m_hgvs;
331  };
332 
333  struct SGrammar: public grammar<SGrammar>
334  {
335  /*!
336  * Deviations from the recommendations:
337  *
338  * Flattened compound lists are not supported (a list must have same delimiter type)
339  * [[[a,b];c;f](+)[d,e]] - OK
340  * [a,b;c;f(+)d,e] - NOT OK
341  *
342  * No mixing of different expression types within a list.
343  * Note: this example is not mentioned in specifications, but found it in Mutalyzer docs.
344  * AB026906:c.[1del;6_7insoAL449423.14(CDKN2A):c.[1_10;5del]]
345  * 1_10 specifies a location
346  * 5del specifies a variation instance
347  * [1_10;5del] is not a valid expression
348  *
349  * Only seq-id header containing actual seq-id (gi or acc.ver) are supported:
350  * Y : NM_004006.1:c.3G>T - uses a GenBank file as indicator
351  N : GJB2:c.76A>C - uses a HGNC-approved gene symbol as indicator
352  Y : DMD{NM_004006.1}:c.3G>T - uses both a HGNC-approved gene symbol and a GenBank file as indicator
353  N : chrX:g.32,218,983_32,984,039del - uses a chromosome indicator (here X)
354  N : rs2306220:A>G - using a dbSNP-identifier as indicator
355  N : DXS1219:g.CA[18] (or AFM297yd1:g.CA[18]) - uses marker DXS1219 / AFM297yd1 as indicator
356  Note: cases below are not described in recommendations, but found in Mutalyzer docs.
357  N : AL449423.14(CDKN2A_v003):g.10del - not in symbol{seq_id} format
358  N : CDKN2A_v003{AL449423.14}:c.1_*3352del cDNA coordinates on a genomic sequence.
359 
360  * In addition to accessions, CCDS and LRG ids are supported.
361  *
362  * If a seq-id is nucleotide, but the HGVS mol-type is "p.", the seq-id must be uniquely mappable
363  * to a protein seq-id (e.g. NM->NP), e.g. "CCDS2.2:c." and "CCDS2.2:p." will be resolved to
364  * related NM/NP respectively.
365  */
366 
367  enum E_NodeIds {
368  eID_NONE = 0,
415 
416  eNodeIds_SIZE
417  }; //note: any changes here must be accompanied to corresponding changes in s_rule_names in the cpp
418 
419  static const char* s_rule_names[SGrammar::eNodeIds_SIZE];
420  static const string s_GetRuleName(parser_id id);
421 
422  template <typename ScannerT>
423  struct definition
424  {
425  rule<ScannerT, parser_context<>, parser_tag<eID_root> > root;
426  rule<ScannerT, parser_context<>, parser_tag<eID_list_delimiter> > list_delimiter;
427  rule<ScannerT, parser_context<>, parser_tag<eID_list1a> > list1a;
428  rule<ScannerT, parser_context<>, parser_tag<eID_list2a> > list2a;
429  rule<ScannerT, parser_context<>, parser_tag<eID_list3a> > list3a;
430  rule<ScannerT, parser_context<>, parser_tag<eID_list1b> > list1b;
431  rule<ScannerT, parser_context<>, parser_tag<eID_list2b> > list2b;
432  rule<ScannerT, parser_context<>, parser_tag<eID_list3b> > list3b;
433  rule<ScannerT, parser_context<>, parser_tag<eID_expr1> > expr1;
434  rule<ScannerT, parser_context<>, parser_tag<eID_expr2> > expr2;
435  rule<ScannerT, parser_context<>, parser_tag<eID_expr3> > expr3;
436  rule<ScannerT, parser_context<>, parser_tag<eID_translocation> > translocation;
437  rule<ScannerT, parser_context<>, parser_tag<eID_header> > header;
438  rule<ScannerT, parser_context<>, parser_tag<eID_seq_id> > seq_id;
439  rule<ScannerT, parser_context<>, parser_tag<eID_mol> > mol;
440  rule<ScannerT, parser_context<>, parser_tag<eID_mut_list > > mut_list;
441  rule<ScannerT, parser_context<>, parser_tag<eID_mut_ref> > mut_ref;
442  rule<ScannerT, parser_context<>, parser_tag<eID_mut_inst> > mut_inst;
443  rule<ScannerT, parser_context<>, parser_tag<eID_int_fuzz> > int_fuzz;
444  rule<ScannerT, parser_context<>, parser_tag<eID_abs_pos> > abs_pos;
445  rule<ScannerT, parser_context<>, parser_tag<eID_general_pos> > general_pos;
446  rule<ScannerT, parser_context<>, parser_tag<eID_fuzzy_pos> > fuzzy_pos;
447  rule<ScannerT, parser_context<>, parser_tag<eID_pos_spec> > pos_spec;
448  rule<ScannerT, parser_context<>, parser_tag<eID_location> > location;
449  rule<ScannerT, parser_context<>, parser_tag<eID_nuc_range> > nuc_range;
450  rule<ScannerT, parser_context<>, parser_tag<eID_prot_range> > prot_range;
451  rule<ScannerT, parser_context<>, parser_tag<eID_raw_seq> > raw_seq;
452  rule<ScannerT, parser_context<>, parser_tag<eID_raw_seq_or_len> > raw_seq_or_len;
453  rule<ScannerT, parser_context<>, parser_tag<eID_aminoacid1> > aminoacid1;
454  rule<ScannerT, parser_context<>, parser_tag<eID_aminoacid2> > aminoacid2;
455  rule<ScannerT, parser_context<>, parser_tag<eID_aminoacid3> > aminoacid3;
456  rule<ScannerT, parser_context<>, parser_tag<eID_nuc_subst> > nuc_subst;
457  rule<ScannerT, parser_context<>, parser_tag<eID_deletion> > deletion;
458  rule<ScannerT, parser_context<>, parser_tag<eID_insertion> > insertion;
459  rule<ScannerT, parser_context<>, parser_tag<eID_delins> > delins;
460  rule<ScannerT, parser_context<>, parser_tag<eID_duplication> > duplication;
461  rule<ScannerT, parser_context<>, parser_tag<eID_nuc_inv> > nuc_inv;
462  rule<ScannerT, parser_context<>, parser_tag<eID_ssr> > ssr;
463  rule<ScannerT, parser_context<>, parser_tag<eID_conversion> > conversion;
464  rule<ScannerT, parser_context<>, parser_tag<eID_seq_loc> > seq_loc;
465  rule<ScannerT, parser_context<>, parser_tag<eID_seq_ref> > seq_ref;
466  rule<ScannerT, parser_context<>, parser_tag<eID_prot_pos> > prot_pos;
467  rule<ScannerT, parser_context<>, parser_tag<eID_prot_missense> > prot_missense;
468  rule<ScannerT, parser_context<>, parser_tag<eID_prot_ext> > prot_ext;
469  rule<ScannerT, parser_context<>, parser_tag<eID_prot_fs> > prot_fs;
470  rule<ScannerT, parser_context<>, parser_tag<eID_no_change> > no_change;
471 
473  {
474  //note: "!X" operator in the parser spec means X node is optional
475  // "A >> B" means match A followed by B
476  // "A | B" means try to match A; if failed, try B
477  // "+A" means match A consecutively one or more times
478  // "A - B" means match on A except when it also matches B
479  // discard_node_d[...] is a directive to discard irrelevant parts of expression from the parse-tree
480  // leaf_node_d[...] is a directive to treat pattern in ... as leaf parse-tree node.
481 
482 
483  //>>> for i in range(27): sys.stdout.write(" | str_p(\"" + ncbi.CSeqportUtil.GetIupacaa3(i) + "\")\n")
484  aminoacid1 = str_p("Ala")
485  | str_p("Asx")
486  | str_p("Cys")
487  | str_p("Asp")
488  | str_p("Glu")
489  | str_p("Phe")
490  | str_p("Gly")
491  | str_p("His")
492  | str_p("Ile")
493  | str_p("Lys")
494  | str_p("Leu")
495  | str_p("Met")
496  | str_p("Asn")
497  | str_p("Pro")
498  | str_p("Gln")
499  | str_p("Arg")
500  | str_p("Ser")
501  | str_p("Thr")
502  | str_p("Val")
503  | str_p("Trp")
504  | str_p("Xaa") //HGVS flavor
505  | str_p("Xxx") //IUPAC flavor
506  | str_p("Tyr")
507  | str_p("Glx")
508  | str_p("Sec")
509  | str_p("Ter")
510  | str_p("Pyl")
511  | chset<>("*X"); //To support legacy HGVS spec, "X" will be interpreted as Ter, not as unknown-AA
512 
513  aminoacid2 = str_p("ALA")
514  | str_p("ASX")
515  | str_p("CYS")
516  | str_p("ASP")
517  | str_p("GLU")
518  | str_p("PHE")
519  | str_p("GLY")
520  | str_p("HIS")
521  | str_p("ILE")
522  | str_p("LYS")
523  | str_p("LEU")
524  | str_p("MET")
525  | str_p("ASN")
526  | str_p("PRO")
527  | str_p("GLN")
528  | str_p("ARG")
529  | str_p("SER")
530  | str_p("THR")
531  | str_p("VAL")
532  | str_p("TRP")
533  | str_p("XXX") //IUPAC flavor
534  | str_p("XAA") //HGVS flavor
535  | str_p("TYR")
536  | str_p("GLX")
537  | str_p("SEC")
538  | str_p("TER")
539  | str_p("PYL")
540  | chset<>("*"); //no 'X' because it is part of other tokens, e.g GLX, ASX
541 
542  aminoacid3 = chset<>("ABCDEFGHIKLMNPQRSTVWXYZU*O");
543  //Note: in HGVS X=stop as in p.X110GlnextX17, whereas in IUPAC X=any
544 
545  raw_seq = leaf_node_d[
546  +aminoacid1
547  | +aminoacid2
548  | +aminoacid3
549  | +chset<>("TGKCYSBAWRDMHVN") //dna IUPAC with ambiguity codes
550  | +chset<>("tugkcysbawrdmhvn")]; //rna IUPAC with ambiguity codes.
551  //note that we also include 't', because we'll support lowercased DNA seq-literal whenever possible
552 
553  /*
554  * Note: there's no distinction between protein, DNA and RNA sequences, as
555  * at the parse time it is not known without context which sequence it is - i.e.
556  * "AC" could be a dna sequence, or AlaCys. Hence, raw_seq will match any sequence
557  * and at the parse-tree transformation stage we'll create proper seq-type as
558  * the mol will be known then.
559  *
560  * Note: +aminoacid precedes +chset<>("ACGTN") so that we don't prematurely match T in Trp etc.
561  */
562 
563 
564  /*
565  * Positions and Locations
566  */
567  int_fuzz = ch_p('(') >> (ch_p('?')|int_p)
568  >> ch_p('_')
569  >> (ch_p('?')|int_p)
570  >> ch_p(')')
571  | ch_p('(') >> int_p >> ch_p(')') //note: not ch_p('?') here as this makes grammar ambiguous
572  | (ch_p('?')|int_p);
573 
574  abs_pos = !ch_p('*') >> int_fuzz;
575  //Note: '*' means the location is CDS-stop-relative; cdna context required.
576  //Otherwise it is CDS-start-relative iff in cdna-context.
577 
578  general_pos = (str_p("IVS") >> int_p | abs_pos) >> sign_p >> int_fuzz
579  | abs_pos;
580  //Note: offset-pos must be followed by a sign because otherwise
581  //it is ambiguous whether c.123(1_2) denotes a SSR or a fuzzy intronic location
582 
583 
584  fuzzy_pos = discard_node_d[ch_p('(')]
585  >> general_pos
586  >> discard_node_d[ch_p('_')]
587  >> general_pos
588  >> discard_node_d[ch_p(')')];
589 
590  pos_spec = general_pos //intronic offset-pos
591  | fuzzy_pos //(generalpos_generalpos)
592  | !ch_p('o') >> header >> pos_spec; //far-loc, as in id:c.88+101_id2:c.355-1045del
593 
594 
595  prot_pos = raw_seq >> pos_spec; //raw_seq must be of length one (single aminoacid)
596 
597  prot_range = prot_pos >> discard_node_d[ch_p('_')] >> prot_pos;
598 
599  nuc_range = pos_spec >> discard_node_d[ch_p('_')] >> pos_spec;
600 
601  location = nuc_range | pos_spec | prot_range | prot_pos;
602  //Note that this describes "local" coordinates within a sequence context, not a seq-loc
603 
604  /*
605  * Seq-ids and seq-locs
606  */
607  seq_id = leaf_node_d[alnum_p >> *(alnum_p | chset<>("._-|"))];
608 
609  mol = str_p("mt") | chset<>("gcnrpm"); //note: for 'mt.' also supporting 'm.'
610  //Genomic, Coding (CDS-relative), Noncoding, Rna-based, Protein, Mitochondrion
611 
612  header = seq_id
613  >> !(discard_node_d[ch_p('{')]
614  >> seq_id
615  >> discard_node_d[ch_p('}')])
616  >> discard_node_d[ch_p(':')]
617  >> mol
618  >> discard_node_d[ch_p('.')];
619  /*
620  * At the parsing stage we'll not require that the mutations for
621  * different mol-types don't mix, i.e. protein mutation type for a
622  * mol-type "c." - this will be deferred to the semantic check that will
623  * validate the parsed expression. The reason is that the g/c/r/p specs mostly
624  * overlap, so we can avoid exploding the grammar.
625  */
626 
627  seq_loc = !ch_p('o') >> header >> location;
628 
629  raw_seq_or_len = raw_seq | int_fuzz;
630 
631  seq_ref = seq_loc //far-loc
632  | (nuc_range|prot_range) //local-loc of range-type, e.g. c.17_18ins5_16 http://www.hgvs.org/mutnomen/FAQ.html
633  //This is to exclude point-locs (see below)
634  | raw_seq_or_len; //literal seq, or unknown sequence of some length
635  // WARNING: this is ambiguous WRT local-loc!
636  // e.g. p.Glu5Valins2fsX3 - 2 in ins2 indicates sequence of length two, NOT the sequence at position 2.
637  // Hence, the local-locs above must be specified via range-types only.
638 
639  no_change = !raw_seq >> ch_p('=');
640 
641  nuc_subst = (!raw_seq) >> ch_p('>') >> raw_seq;
642  //According to the spec, substitution is exactly one base to the left and to the right of ">" (otherwise a delins)
643  //In reality, the submitters don't follow the spec and express delins as substitution,
644  //e.g. NM_000051:c.2077_2098>G or NM_000051:c.2077_2098AC>GTA.
645 
646  deletion = str_p("del") >> !raw_seq_or_len;
647 
648  duplication = str_p("dup") >> !seq_ref;
649 
650  insertion = str_p("ins") >> seq_ref;
651 
652  conversion = str_p("con") >> seq_loc;
653 
654  delins = deletion >> insertion;
655 
656  nuc_inv = str_p("inv") >> !seq_ref; //note: only int_p used to be allowed in older spec
657 
658  ssr = !raw_seq >> (
659  (int_fuzz - (ch_p('?')|int_p)) //don't want to interpret 'id:5_6?' as ssr
660  | list_p(discard_node_d[ch_p('[')]
661  >> int_p
662  >> discard_node_d[ch_p(']')],
663  discard_node_d[ch_p('+')]));
664  /*
665  * Note: It is not correct to match [5] and [3] in NM_000815.2:c.101TC[5]+[3]
666  * individually as list3a within c.101 location context, because
667  * the location-spec for an ssr may point to the first repeat-unit base only,
668  * in which case we need to calculate the actual range-loc based on the sequence literal,
669  * and when processing "[3]" later the "TC" literal will not be in context,
670  * and it will appear as if the context is NM_000815.2:c.101 rather than c.101_102.
671  *
672  * Hence, the ssr rule will have to consume the list of multipliers [5]+[3]
673  * and generate compound variation-ref "manually"
674  */
675 
676 
677  //note: frameshift is optionally followed by new translation length,
678  //e.g. fs, fsX, fsX10, fs*, fs*10 ("X" in HGVS-1.0; "*" in HGVS-2.0)
679  //MSS350: current spec allows 'Ter' as well. Apparently, fsX is no longer
680  //valid, but leaving it for backward-compatibility.
681  prot_fs = str_p("fs") >> !((str_p("Ter") | chset<>("*X")) >> !int_p);
682 
683  //note: stop-loss is extX in HGVS-1.0 and ext* in HGVS 2.0
684  //Note: added extTer to be consistent with above.
685  prot_ext = ( str_p("extMet")
686  | str_p("extX")
687  | str_p("ext*")
688  | str_p("extTer")) >> int_p;
689 
690  prot_missense = raw_seq;
691 
692  //ISCN expression followed by a seq-loc. The ISCN is factored to a single leaf node
693  translocation = leaf_node_d[ch_p('t') >>
694  +(
695  ch_p('(')
696  >> *(print_p - ch_p('(') - ch_p(')'))
697  >> ch_p(')')
698  )
699  ] >> seq_loc;
700 
701  mut_inst = ch_p('?') //can exist within +mut_inst, e.g. p.Met1?extMet-5
702  | no_change
703  | delins //delins must precede del
704  | deletion
705  | insertion
706  | duplication
707  | nuc_subst
708  | nuc_inv
709  | ssr
710  | conversion
711  | prot_fs
712  | prot_missense
713  | prot_ext //this may occur inside location context, e.g. p.Met1ValextMet-12
714  | leaf_node_d[ch_p(':') >> +(alnum_p)] //catch-all
715  ;
716 
717  //Note: '?' and no_change can exist both within a location context
718  //(i.e. expr3) or within a sequence context (i.e. expr2)
719  //additionally, prot_ext may exist as expr2 (outside of location context) as well.
720 
721 
722  root = list_p(expr1, chset<>("+;"));
723  //At the root level the '+'-delimited expressions are not required to be bracketed, i.e.
724  //NM_004004.2:c.[35delG]+NM_006783.1:c.[689_690insT] instead of
725  //[NM_004004.2:c.35delG]+[NM_006783.1:c.689_690insT]
726 
727 
728  list_delimiter = leaf_node_d[
729  str_p("//") //chimeric
730  | chset<>(",;/") //products, same-allele, mosaic respectively
731  | str_p("(;)") //uncertain allele relationship; HGVS 2.0
732  | str_p("(+)") //uncertain allele relationship; HGVS 1.0
733  ];
734 
735 
736  //Note: the list#a version of a list (see s_is_list_a(..))
737  //represents "top-level" set of alleles
738  //[...]+[...] - HGVS-1.0 representation
739  //[...];[...] - HGVS-2.0 representation of the same thing
740  //where "..." is the list_delimiter -delimited list of allele-specific subexpressions
741 
742  expr1 = ch_p('(') >> expr1 >> ch_p(')')
743  | list1a
744  | header >> expr2
745  | translocation;
746  list1a = list_p(discard_node_d[ch_p('[')] >> list1b >> discard_node_d[ch_p(']')], chset<>(";+"));
747  list1b = list_p(expr1, list_delimiter);
748 
749 
750  expr2 = ch_p('(') >> expr2 >> ch_p(')')
751  | list2a
752  | str_p("0?") //note: HGVS-special; precedes "location>>expr3" rule, so that it is not matched as unknown-variation@pos=0
753  | location >> expr3
754  | prot_ext //note: can also exist within location context (mut_inst)
755  | ch_p('0') //note: HGVS-special; follows "location>>expr3" such that if location contains padding zeros, they are not consumed by this rule.
756  | ch_p('?') //note: follows "location>>expr3" such that variation at unknown pos is not partially-matched as unknown variation
757  | no_change;
758  list2a = list_p(discard_node_d[ch_p('[')] >> list2b >> discard_node_d[ch_p(']')], chset<>(";+"));
759  list2b = list_p(expr2, list_delimiter);
760 
761 
762  expr3 = ch_p('(') >> expr3 >> ch_p(')')
763  | list3a
764  | +mut_inst;
765  /*
766  * Note: Multiple mut_insts that are not delimited, i.e.
767  * abc instead of [a;b;c] are legit, e.g.
768  *
769  * a) p.X110GlnextX17
770  * b) NM_012345.3:c.123+45_123+51dupinsAB012345.3:g.393_1295
771  */
772  list3a = list_p(discard_node_d[ch_p('[')] >> list3b >> discard_node_d[ch_p(']')], chset<>(";+"));
773  list3b = list_p(expr3, list_delimiter);
774 
775  //BOOST_SPIRIT_DEBUG_RULE(expr1);
776  }
777 
778  rule<ScannerT, parser_context<>, parser_tag<eID_root> > const& start() const
779  {
780  return root;
781  }
782  };
783 
784  static bool s_is_list_a(parser_id id)
785  {
786  return id == SGrammar::eID_list1a
787  || id == SGrammar::eID_list2a
788  || id == SGrammar::eID_list3a
789  || id == SGrammar::eID_root;
790  }
791 
792  static bool s_is_list_b(parser_id id)
793  {
794  return id == SGrammar::eID_list1b
795  || id == SGrammar::eID_list2b
796  || id == SGrammar::eID_list3b;
797  }
798 
799  static bool s_is_list(parser_id id)
800  {
801  return s_is_list_a(id) || s_is_list_b(id);
802  }
803  };
805 
806 private:
807  typedef tree_match<char const*> TParseTreeMatch;
808  typedef TParseTreeMatch::const_tree_iterator TIterator;
811 
812 
813  static SFuzzyInt x_int_fuzz (TIterator const& i, const CContext& context);
814 
815  static CRef<CSeq_point> x_abs_pos (TIterator const& i, const CContext& context);
816 
817  static SOffsetPoint x_general_pos (TIterator const& i, const CContext& context);
818  static SOffsetPoint x_fuzzy_pos (TIterator const& i, const CContext& context);
819  static SOffsetPoint x_pos_spec (TIterator const& i, const CContext& context);
820  static SOffsetPoint x_prot_pos (TIterator const& i, const CContext& context);
821 
822  static CRef<CVariantPlacement> x_range (TIterator const& i, const CContext& context);
823  static CRef<CVariantPlacement> x_location (TIterator const& i, const CContext& context);
824 
825  static CRef<CSeq_loc> x_seq_loc (TIterator const& i, const CContext& context);
826  static CRef<CSeq_literal> x_raw_seq (TIterator const& i, const CContext& context);
827  static TDelta x_seq_ref (TIterator const& i, const CContext& context);
828  static CRef<CSeq_literal> x_raw_seq_or_len (TIterator const& i, const CContext& context);
829  static CRef<CVariation> x_delins (TIterator const& i, const CContext& context);
830  static CRef<CVariation> x_deletion (TIterator const& i, const CContext& context);
831  static CRef<CVariation> x_insertion (TIterator const& i, const CContext& context, bool check_loc);
832  static CRef<CVariation> x_duplication (TIterator const& i, const CContext& context);
833  static CRef<CVariation> x_nuc_subst (TIterator const& i, const CContext& context);
834  static CRef<CVariation> x_no_change (TIterator const& i, const CContext& context);
835  static CRef<CVariation> x_nuc_inv (TIterator const& i, const CContext& context);
836  static CRef<CVariation> x_ssr (TIterator const& i, const CContext& context);
837  static CRef<CVariation> x_conversion (TIterator const& i, const CContext& context);
838  static CRef<CVariation> x_prot_ext (TIterator const& i, const CContext& context);
839  static CRef<CVariation> x_prot_missense (TIterator const& i, const CContext& context);
840  static CRef<CVariation> x_translocation (TIterator const& i, const CContext& context);
841  static CRef<CVariation> x_mut_inst (TIterator const& i, const CContext& context);
842  static CRef<CVariation> x_expr1 (TIterator const& i, const CContext& context);
843  static CRef<CVariation> x_expr2 (TIterator const& i, const CContext& context);
844  static CRef<CVariation> x_expr3 (TIterator const& i, const CContext& context);
845  static CRef<CVariation> x_prot_fs (TIterator const& i, const CContext& context);
846  static CRef<CVariation> x_list (TIterator const& i, const CContext& context);
847  static CContext x_header (TIterator const& i, const CContext& context);
848  static CRef<CVariation> x_root (TIterator const& i, const CContext& context);
849  static CRef<CVariation> x_string_content (TIterator const& i, const CContext& context);
850  static CVariation::TData::TSet::EData_set_type x_list_delimiter(TIterator const& i, const CContext& context);
851 
852  static CRef<CVariation> x_unwrap_iff_singleton(CVariation& v);
853 
854 
855  ///Convert HGVS amino-acid code to ncbi code.
856  ///Return true iff success; otherwise false and out = in.
857  static bool s_hgvsaa2ncbieaa(const string& hgvsaa, string& out);
858  static bool s_hgvs_iupacaa2ncbieaa(const string& hgvsaa, string& out);
859  static bool s_hgvsaa2ncbieaa(const string& hgvsaa, bool uplow, string& out);
860 
861  ///Convert non-HGVS compliant all-uppercase AAs to UpLow, e.g. ILECYS ->IleCys
862  static string s_hgvsUCaa2hgvsUL(const string& hgvsaa);
863 
864  static void s_SetStartOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint);
865  static void s_SetStopOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint);
866 
867 private:
868 // Helpers to generate a HGVS expression from a variation
869 
870  /// if no atg_pos, assume that not dealing with coordinate systems (simply return abs-pos)
871  ///otherwise, convert to hgvs coordinates:
872  // adjust by 1, adjust relative to atg_pos, and adjust by -1 if nonpositive.
873  static TSignedSeqPos s_GetHgvsPos(TSeqPos abs_pos, const TSeqPos* atg_pos);
874 
875  /// this function may be used to create hgvs-coordinates (if ref_pos is not null), or
876  /// to create a fuzzy hgvs-number specification (e.g. for multipliers, or fuzzy offset values), where pos is not adjusted
877  /// (Note that pos may be negative, e.g. for offset value)
878  static string s_IntWithFuzzToStr(long pos, const TSeqPos* ref_pos, bool with_sign, const CInt_fuzz* fuzz);
879 
880  /// Construct an HGVS coordinate, which may be an intronic offset-point, e.g. "5+(10_11)"
881  static string s_OffsetPointToString(
882  TSeqPos anchor_pos, //anchor position in absolute seq-loc coordinates
883  const CInt_fuzz* anchor_fuzz, //..of anchor-pos, can be NULL
884  TSeqPos anchor_ref_pos, //first-pos in HGVS coordinates (e.g. 0 for "g.", start_codon or stop_codon+1 for "c."
885  TSeqPos effective_seq_length, //length of sequence. If transcript, excludes polyA
886  const long* offset_pos, //if not specified, this is a "native" coordinate; otherwise "c."|"p." -intronic
887  const CInt_fuzz* offset_fuzz); //..of offset position, can be NULL
888 
889  /// Construct an hgvs "header" consisting of seq-id and mol-type, e.g. "NG_016831.1:g."
890  static string s_SeqIdToHgvsStr(const CVariantPlacement& vp, CScope* scope = NULL);
891 
892  static void sx_AppendMoltypeExceptions(CVariation& v, CScope& scope);
893  /// In some cases the placement needs to be adjusted depending on inst, e.g. if we have a point-relative insertion,
894  /// it needs to be converted to "between-dinucleotide" representation; or, in case of microsatellites, the
895  /// location must point to the first repeat unit rather than whole tandem repeat
896  CRef<CVariantPlacement> x_AdjustPlacementForHgvs(const CVariantPlacement& p, const CVariation_inst& inst);
897 
898  /// If the variation is a package-set, find the subvariation with observation-type "asserted" and return its literal
899  CConstRef<CSeq_literal> x_FindAssertedSequence(const CVariation& v);
900 
901  /// Compute length of the delta
902  TSeqPos x_GetInstLength(const CVariation_inst& inst, const CVariantPlacement& p, bool account_for_multiplier);
903 
904  string x_PlacementCoordsToStr(const CVariantPlacement& vp);
905 
906  /// Create "inst" part of HGVS expression
907  string x_AsHgvsInstExpression(
908  const CVariation& inst_variation,
910  CConstRef<CSeq_literal> asserted_seq);
911 
912  /// Construct HGVS expression for a variation: use first VariantPlacement, or, if id is specified,
913  /// first placement with matching id. If an asserted sequence is given explicitly, it will be used
914  /// in construction of HGVS expression; otherwise one will be created based on placement's literal.
915  string x_AsHgvsExpression(
916  const CVariation& variation,
918  CConstRef<CSeq_literal> asserted_seq);
919 
920  /// Get literal seq at location
921  string x_LocToSeqStr(const CSeq_loc& loc);
922 
923  /// translate=true will translate nucleotide literal to prot as appropriate.
924  /// It is intended for cases where delta literal in a protein variation is
925  /// specified as codons rather than AAs; For HGVS purposes we can't use NAs
926  /// in a protein context.
927  string x_SeqLiteralToStr(const CSeq_literal& literal, bool translate, bool is_mito);
928 
929 private:
932 };
933 
934 }
935 
937 
938 
939 #endif /* HGVSPARSER_HPP_ */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
CObject –.
Definition: ncbiobj.hpp:180
CRef –.
Definition: ncbiobj.hpp:618
CSafeStatic<>::
CScope –.
Definition: scope.hpp:92
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Resolve CCDS-id to an NM.
Resolve LRG seq-ids, e.g. LRG_123, LRG_123t1, LRG_123p1.
A helper class to convert a string to a seq-id.
list< CRef< CSeq_id_Resolver > > TResolvers
static CSeq_id_Handle s_Get(TResolvers &resolvers, const string &s)
Iterate through resolvers and resolve using the first one that can do it. Return an empty handle otherwise.
CVariantPlacement –.
Set of related Variations.
Definition: Variation_.hpp:127
CVariation_inst –.
CContext(CRef< CScope > scope, CSeq_id_Resolver::TResolvers &id_resolvers, const string &hgvs)
CSeq_id_Handle ResolevSeqId(const string &s) const
CVariantPlacement & SetPlacement()
const CVariantPlacement & GetPlacement() const
CSeq_id_Resolver::TResolvers m_seq_id_resolvers
CRef< CVariantPlacement > m_placement
const string & GetHgvs() const
const CBioseq_Handle & GetBioseqHandle() const
NCBI_EXCEPTION_DEFAULT(CHgvsParserException, CException)
virtual const char * GetErrCodeString(void) const override
Get error code interpreted as text.
@ eSemantic
Expression is invalid in some way.
@ eAlignment
Some problem with getting alignment.
@ eGrammatic
Expression is not a valid language.
TParseTreeMatch::const_tree_iterator TIterator
CSeq_id_Resolver::TResolvers m_seq_id_resolvers
CVariation_inst::TDelta::value_type TDelta
static CSafeStatic< SGrammar > s_grammar
CRef< CScope > m_scope
CVariation::TData::TSet TVariationSet
CSeq_id_Resolver::TResolvers & SetSeq_id_Resolvers()
In order of decreasing priority.
CHgvsParser(CScope &scope, int tax_id=9606)
tree_match< char const * > TParseTreeMatch
static string s_hgvsUCaa2hgvsUL(const string &hgvsaa)
Convert non-HGVS compliant all-uppercase AAs to UpLow, e.g. ILECYS ->IleCys.
Include a standard set of the NCBI C++ Toolkit most basic headers.
std::ofstream out("events_result.xml")
main entry point for tests
int offset
Definition: replacements.h:160
static const char location[]
Definition: config.c:97
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NULL
Definition: ncbistd.hpp:225
EErrCode
Error types that an application can generate.
Definition: ncbiexpt.hpp:884
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
@ eLim_other
something else
Definition: Int_fuzz_.hpp:216
virtual void Reset(void)
Reset the whole object.
int i
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
USING_SCOPE(objects)
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
void Assign(const SFuzzyInt &other)
rule< ScannerT, parser_context<>, parser_tag< eID_list2a > > list2a
rule< ScannerT, parser_context<>, parser_tag< eID_aminoacid3 > > aminoacid3
rule< ScannerT, parser_context<>, parser_tag< eID_fuzzy_pos > > fuzzy_pos
rule< ScannerT, parser_context<>, parser_tag< eID_mol > > mol
rule< ScannerT, parser_context<>, parser_tag< eID_list_delimiter > > list_delimiter
rule< ScannerT, parser_context<>, parser_tag< eID_list3a > > list3a
rule< ScannerT, parser_context<>, parser_tag< eID_location > > location
rule< ScannerT, parser_context<>, parser_tag< eID_nuc_subst > > nuc_subst
rule< ScannerT, parser_context<>, parser_tag< eID_mut_inst > > mut_inst
rule< ScannerT, parser_context<>, parser_tag< eID_list1b > > list1b
rule< ScannerT, parser_context<>, parser_tag< eID_int_fuzz > > int_fuzz
rule< ScannerT, parser_context<>, parser_tag< eID_nuc_inv > > nuc_inv
rule< ScannerT, parser_context<>, parser_tag< eID_insertion > > insertion
rule< ScannerT, parser_context<>, parser_tag< eID_conversion > > conversion
rule< ScannerT, parser_context<>, parser_tag< eID_duplication > > duplication
rule< ScannerT, parser_context<>, parser_tag< eID_aminoacid2 > > aminoacid2
rule< ScannerT, parser_context<>, parser_tag< eID_prot_ext > > prot_ext
rule< ScannerT, parser_context<>, parser_tag< eID_list1a > > list1a
rule< ScannerT, parser_context<>, parser_tag< eID_nuc_range > > nuc_range
rule< ScannerT, parser_context<>, parser_tag< eID_delins > > delins
rule< ScannerT, parser_context<>, parser_tag< eID_prot_pos > > prot_pos
rule< ScannerT, parser_context<>, parser_tag< eID_deletion > > deletion
rule< ScannerT, parser_context<>, parser_tag< eID_abs_pos > > abs_pos
rule< ScannerT, parser_context<>, parser_tag< eID_mut_ref > > mut_ref
rule< ScannerT, parser_context<>, parser_tag< eID_translocation > > translocation
rule< ScannerT, parser_context<>, parser_tag< eID_ssr > > ssr
rule< ScannerT, parser_context<>, parser_tag< eID_prot_fs > > prot_fs
rule< ScannerT, parser_context<>, parser_tag< eID_expr1 > > expr1
rule< ScannerT, parser_context<>, parser_tag< eID_general_pos > > general_pos
rule< ScannerT, parser_context<>, parser_tag< eID_raw_seq_or_len > > raw_seq_or_len
rule< ScannerT, parser_context<>, parser_tag< eID_seq_loc > > seq_loc
rule< ScannerT, parser_context<>, parser_tag< eID_mut_list > > mut_list
rule< ScannerT, parser_context<>, parser_tag< eID_expr3 > > expr3
rule< ScannerT, parser_context<>, parser_tag< eID_list3b > > list3b
rule< ScannerT, parser_context<>, parser_tag< eID_prot_range > > prot_range
rule< ScannerT, parser_context<>, parser_tag< eID_no_change > > no_change
rule< ScannerT, parser_context<>, parser_tag< eID_expr2 > > expr2
rule< ScannerT, parser_context<>, parser_tag< eID_header > > header
rule< ScannerT, parser_context<>, parser_tag< eID_seq_id > > seq_id
rule< ScannerT, parser_context<>, parser_tag< eID_prot_missense > > prot_missense
rule< ScannerT, parser_context<>, parser_tag< eID_pos_spec > > pos_spec
rule< ScannerT, parser_context<>, parser_tag< eID_aminoacid1 > > aminoacid1
rule< ScannerT, parser_context<>, parser_tag< eID_raw_seq > > raw_seq
rule< ScannerT, parser_context<>, parser_tag< eID_root > > const & start() const
rule< ScannerT, parser_context<>, parser_tag< eID_seq_ref > > seq_ref
rule< ScannerT, parser_context<>, parser_tag< eID_root > > root
rule< ScannerT, parser_context<>, parser_tag< eID_list2b > > list2b
static bool s_is_list(parser_id id)
static bool s_is_list_a(parser_id id)
static bool s_is_list_b(parser_id id)
void Assign(const SOffsetPoint &other)
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Sun Jul 21 04:20:01 2024 by modify_doxy.py rev. 669887