NCBI C++ ToolKit
snp_utils.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef SNP_UTIL___SNP_UTILS__HPP
2 #define SNP_UTIL___SNP_UTILS__HPP
3 
4 /* $Id: snp_utils.hpp 89856 2020-04-28 14:26:30Z rudnev $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Melvin Quintos, Dmitry Rudnev
30  *
31  * File Description:
32  * Declares Helper functions in NSnp class
33  *
34  */
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbitime.hpp>
38 
40 
41 #include <objmgr/feat_ci.hpp>
45 #include <objects/seq/Seq_data.hpp>
53 
56 
57 /////////////////////////////////////////////////////////////////////////////
58 ///
59 /// NSnp --
60 ///
61 /// Helper functions for SNP features
63 {
64 public:
65  // tag for SNP features, meaning is the same as for CDbtag::eDbtagType_dbSNP, but that one is unfortunately an enum, and
66  // the corresponding string value is not exposed
67  const static string sm_dbTag_dbSNP;
68 
69  // RS ID
71  // clinical significance
72  // values are taken from [human_9606].[dbo].[ClinSigCode] on 05/26/2011
73  // and later coordinated with CPhenotype::EClinical_significance (05/14/2013)
74  // hopefully they will stay stable
76  typedef int TClinSigID;
77 
78  /// Determine if feature is a SNP
79  ///
80  /// @param mapped_feat
81  /// CMappedFeat object representing feature
82  /// @return
83  /// - true if Subtype is variation
84  /// - false otherwise
85  static bool IsSnp(const CMappedFeat &mapped_feat);
86  static bool IsSnp(const CSeq_feat &feat);
87  static bool IsSnp(const CDbtag& tag);
88 
89  /// find a SNP tag in the feature
90  /// returns NULL if no such tag (sm_dbTag_dbSNP)
91  static CConstRef<CDbtag> GetTag(const CSeq_feat& SrcFeat);
92  static CConstRef<CDbtag> GetTag(const CMappedFeat& SrcFeat);
93 
94  /// Get Create Time
95  /// It will fetch the creation time based on the CAnnotDescr of the feature's
96  /// parent annotation object.
97  ///
98  /// @param mapped_feat
99  /// CMappedFeat object representing feature
100  /// @return
101  /// - CTime object representing Creation time. A default constructed CTime
102  /// object will be returned if no Create-time was found.
103  static CTime GetCreateTime(const CMappedFeat &mapped_feat);
104 
105  /// Return rsid of SNP
106  ///
107  /// @param mapped_feat
108  /// CMappedFeat object representing SNP feature
109  /// @return
110  /// - rsid of SNP as set in its Tag data
111  /// - 0 if no rsid found
112  static TRsid GetRsid(const CMappedFeat &mapped_feat);
113 
114  /// Return rsid of SNP
115  ///
116  /// @param feat
117  /// CSeq_feat object representing SNP feature
118  /// @return
119  /// - rsid of SNP as set in its Tag data
120  /// - 0 if no rsid found
121  static TRsid GetRsid(const CSeq_feat &feat);
122 
123  static TRsid GetRsid(const CDbtag& tag);
124 
125  /// Return distance of neighbors in flanking sequence
126  ///
127  /// @param mapped_feat
128  /// CMappedFeat object representing feature
129  /// @return
130  /// - length of neighbors on flanking sequence
131  /// - 0 if no length information found
132  static int GetLength(const CMappedFeat &);
133 
134  /// Return distance of neighbors in flanking sequence
135  ///
136  /// @param mapped_feat
137  /// CSeq_feat object representing feature
138  /// @return
139  /// - length of neighbors on flanking sequence
140  /// - 0 if no length information found
141  static int GetLength(const CSeq_feat &);
142 
143  /// Return bitfield information stored in the feature
144  ///
145  /// @param mapped_feat
146  /// CMappedFeat object representing snp feature
147  /// @return
148  /// - CSnpBitfield.isGood() is false if no bitfield is found
149  static CSnpBitfield GetBitfield(const CMappedFeat &);
150 
151  /// Return bitfield information stored in the feature
152  ///
153  /// @param feat
154  /// CSeq_feat object representing snp feature
155  /// @return
156  /// - bitfield created from octet sequence of QualityCodes
157  /// - CSnpBitfield is empty if no "QualityCodes" are found
158  static CSnpBitfield GetBitfield(const CSeq_feat &feat);
159 
160  /// Check if SNP exists in GenBank database
161  ///
162  /// @param scope
163  /// CScope object representing scope of data
164  /// @param private_snp
165  /// CMappedFeat object representing snp feature
166  /// @param allele
167  /// string object representing allele of SNP (e.g. A or GG or -)
168  /// @return
169  /// - true if SNP was found
170  /// - false otherwise
171  static bool IsSnpKnown( CScope &scope, const CMappedFeat &private_snp, const string &allele=kEmptyStr);
172 
173  /// Check if SNP exists in GenBank database
174  ///
175  /// @param scope
176  /// CScope object representing scope of data
177  /// @param loc
178  /// CSeq_loc representing location of SNP
179  /// @param allele
180  /// string object representing allele of SNP (e.g. A or GG or -)
181  /// @return
182  /// - true if SNP was found
183  /// - false otherwise
184  static bool IsSnpKnown( CScope &scope, const CSeq_loc& loc, const string &allele=kEmptyStr);
185 
186  /// list of alleles belonging to particular SNP
187  /// a deletion is represented by a "-"
188  typedef vector<string> TAlleles;
189 
190  /// Return list of alleles encoded in qual."replace"
191  ///
192  /// @param mapped_feat
193  /// CMappedFeat object representing snp feature
194  /// @param Alleles
195  /// - receives the list of alleles found in the feature (if any)
196  static void GetAlleles(const CMappedFeat &mapped_feat, TAlleles& Alleles);
197 
198  /// Return list of alleles encoded in qual."replace"
199  ///
200  /// @param feat
201  /// CSeq_feat object representing snp feature
202  /// @param isPadding
203  /// if true, add allele padding according to VCF spec
204  /// @param bsh
205  /// must be not NULL if isPadding == true
206  /// @param Alleles
207  /// - receives the list of alleles found in the feature (if any)
208  static void GetAlleles(const CSeq_feat &feat, TAlleles& Alleles, bool isPadding = false, CBioseq_Handle* bsh = NULL);
209 
210  /// controls the case of strings returned from ClinSigAsString()
211  enum ELetterCase {
212  eLetterCase_ForceLower, ///< always use lower case only
213  eLetterCase_Mixed ///< return strings in mixed case
214  };
215 
216  /// get a human-readable text for various clinical significance types
217  ///
218  /// @param var
219  /// the clinical significance will be taken from var.phenotype.clinical-significance
220  /// if it is defined
221  /// @param LetterCase
222  /// controls the letter case of the result
223  /// @return
224  /// string describing the first clinical significance in the first phenotype
225  /// - will be empty if clinical-significance is not present
226  static string ClinSigAsString(const CVariation_ref& var, ELetterCase LetterCase = eLetterCase_Mixed);
227 
228  /// get a human-readable text for various clinical significance types
229  ///
230  /// @param ClinSigID
231  /// clinical significance ID
232  /// @param LetterCase
233  /// controls the letter case of the result
234  /// @return
235  /// string describing the given clinical significance ID
236  static string ClinSigAsString(TClinSigID ClinSigID, ELetterCase LetterCase = eLetterCase_Mixed);
237 
238 ///////////////////////////////////////////////////////////////////////////////
239 // Private Methods
240 ///////////////////////////////////////////////////////////////////////////////
241 private:
242 
243 };
244 
245 #define SNP_VAR_EXT_CLASS "SNPData"
246 #define SNP_VAR_EXT_BITFIELD "Bitfield"
247 
248 /// set of functions for dealing with SNP represented as variation objects
250 {
251 public:
252  /// legacy SNP feature conversion into a variation object
253  ///
254  /// reads a feature that supposedly contains a SNP record (old, up-to-2012, style,
255  /// with SNP data encoded as "qual" (alleles) and "ext.data" (bitfield))
256  /// and sets Variation to content found in the feature
257  /// @param Variation
258  /// conversion result will be put here, old contents are destroyed upon conversion success
259  /// @param SrcFeat
260  /// old format feature
261  /// @return
262  /// false if a given feature is not a correctly formed SNP feature
263  static bool ConvertFeat(CVariation& Variation, const CSeq_feat& SrcFeat);
264 
265  /// @sa
266  /// same as the other ConvertFeat(), but result put into variation-ref, placement is lost
267  static bool ConvertFeat(CVariation_ref& Variation, const CSeq_feat& SrcFeat);
268 
269  /// convert SNP bitfield data to respective fields in CVariantProperties
270  ///
271  /// @param prop
272  /// The result will be put here
273  /// @param bf
274  /// Bitfield that will be decoded
275  /// @note
276  /// Not all of the bitfield bits currently have direct correspondences within CVariantProperties
277  /// so they will be lost during the conversion
278  static void DecodeBitfield(CVariantProperties& prop, const CSnpBitfield& bf);
279 
280  // list of all possible substitutions in eSNPPropName_ResourceLinkURL
281  static const string sResourceLink_RsID;
282 
283  /// enums to control getting a string list representation of various CVariantProperties
284  /// @sa VariantPropAsStrings
287  eSNPPropName_GeneLocation, ///< prop.gene-location
288  eSNPPropName_Effect, ///< prop.effect
289  eSNPPropName_Mapping, ///< prop.mapping
290  eSNPPropName_FreqValidation,///< prop.frequency-based-validation
291  eSNPPropName_QualityCheck, ///< prop.quality-check
292  eSNPPropName_ResourceLink, ///< prop.resource-link
293 
294  /// generate URL templates, with one of sResourceLink_ substrings potentially inside
295  ///
296  /// the user should perform correct substitution of sResourceLink_ substring for the actual value
297  /// an empty string will be inserted into the list when the resource URL is not known
298  /// the order is guaranteed to be exactly the same as for _ResourceLink for the same prop
299  eSNPPropName_ResourceLinkURL///< prop.resource-link
300  };
301 
302  /// get lists of strings corresponding to a given property type
303  ///
304  /// @param ResList
305  /// will be reset to the resulting list of strings
306  /// @param prop
307  /// property based upon values within which the result will be generated
308  /// @param ePropType
309  /// type of property requested
310  /// @sa ESNPPropTypes
311  static void VariantPropAsStrings(list<string>& ResList, const CVariantProperties& prop, ESNPPropTypes ePropType);
312 
313  /// add alleles to a list of strings from deltas in variation data
314  ///
315  /// empty deltas are converted to dashes "-"
316  /// @param Alleles
317  /// list of strings to the end of which the found deltas will be added,
318  /// alleles already present in the list are retained
319  /// @param pVariation
320  /// variation object from which the deltas are read
321  template <class TVariation> static void GetDeltas(list<string>& Alleles, const TVariation* pVariation);
322 
323 private:
324  template <class TPVariation> static bool x_CommonConvertFeat(TPVariation pVariation, const CSeq_feat& SrcFeat);
325 };
326 
327 
328 // inlines
/// Append the alleles found in the deltas of a variation object to Alleles.
///
/// Strings already present in Alleles are kept; new ones are pushed to the end.
/// @param Alleles
///   list that receives one string per literal delta with Iupacna/Iupacaa data
/// @param pVariation
///   variation object to read; a null pointer or an object without data
///   leaves Alleles untouched
/// NOTE(review): this listing lacks the case labels of the switch below
/// (listing lines 351 and 367 are absent) -- consult the real header before editing.
329 template <class TVariation> inline void NSNPVariationHelper::GetDeltas(list<string>& Alleles, const TVariation* pVariation)
330 {
     // nothing to read from a null pointer or from a variation without data
331  if(!pVariation || !pVariation->CanGetData())
332  return;
333  const typename TVariation::TData& Data(pVariation->GetData());
334  // if the data is a set, the deltas are located in the components
335  if(Data.IsSet()) {
336  const typename TVariation::TData::TSet& Set(Data.GetSet());
337  if(Set.CanGetVariations()) {
     // recurse into every member of the set; all results accumulate in the same Alleles list
338  ITERATE(typename TVariation::TData::TSet::TVariations, iVariations, Set.GetVariations()) {
339  GetDeltas(Alleles, iVariations->GetPointer());
340  }
341  }
342  }
     // if the data is an instance, its delta items are examined directly
343  if(Data.IsInstance()) {
344  const typename TVariation::TData::TInstance&
345  VarInst(Data.GetInstance());
346  if(VarInst.CanGetDelta()) {
347  ITERATE(typename TVariation::TData::TInstance::TDelta, iDelta, VarInst.GetDelta()) {
     // delta items without a seq are skipped
348  if((*iDelta)->CanGetSeq()) {
349  const CDelta_item::C_Seq& DeltaSeq((*iDelta)->GetSeq());
350  switch(DeltaSeq.Which()) {
     // NOTE(review): the case label for this branch is missing from the listing;
     // the branch reads DeltaSeq.GetLiteral(), so presumably it is the literal choice -- confirm
352  {
353  if(DeltaSeq.GetLiteral().CanGetSeq_data()) {
354  const CSeq_data& Seq_data(DeltaSeq.GetLiteral().GetSeq_data());
355  // variations normally use Iupacna/Iupacaa
356  string sAllele;
     // an empty IUPAC string is reported as a dash "-"
357  if(Seq_data.IsIupacna())
358  sAllele = Seq_data.GetIupacna().Get().empty() ? "-" : Seq_data.GetIupacna().Get();
359  if(Seq_data.IsIupacaa())
360  sAllele = Seq_data.GetIupacaa().Get().empty() ? "-" : Seq_data.GetIupacaa().Get();
361
     // sAllele is still empty for any other Seq_data encoding, in which case nothing is appended
362  if(!sAllele.empty())
363  Alleles.push_back(sAllele);
364  }
365  break;
366  }
368  default:
369  // no specific processing for other deltas
370  break;
371  }
372  }
373  }
374  }
375  }
376 }
377 
378 
/// Shared implementation of the feature-to-variation conversion.
///
/// Fills the object behind pVariation (data instance, id, variant properties,
/// description) from a SNP feature.
/// @param pVariation
///   pointer-like handle (used only through operator->) to the object to fill
/// @param SrcFeat
///   feature to convert
/// @return
///   false if SrcFeat is not a SNP according to NSnp::IsSnp(), if its rsid is 0,
///   or if its bitfield is not good; true otherwise
/// NOTE(review): this listing lacks the declaration of VarClass (listing line 392)
/// and several case labels (listing lines 409, 422, 426) -- consult the real header before editing.
379 template <class TPVariation> inline bool NSNPVariationHelper::x_CommonConvertFeat(TPVariation pVariation, const CSeq_feat& SrcFeat)
380 {
     // reject anything that is not a SNP feature with a non-zero rsid and a good bitfield;
     // pVariation has not been touched when any of these checks fails
381  if(!NSnp::IsSnp(SrcFeat)) {
382  return false;
383  }
384  NSnp::TRsid Rsid(NSnp::GetRsid(SrcFeat));
385  if (!Rsid) {
386  return false;
387  }
388  CSnpBitfield bf(NSnp::GetBitfield(SrcFeat));
389  if (!bf.isGood()) {
390  return false;
391  }
     // NOTE(review): VarClass, used by the switch below, is declared on a line absent from this
     // listing; presumably it is obtained from bf (e.g. its variation class) -- confirm
393
394  // read the alleles from the feature
395  // the alleles are encoded in qual with qual=="replace"
396  // empty allele strings stand for "-"
397  vector<string> alleles;
398  NSnp::GetAlleles(SrcFeat, alleles);
399
400  // store the alleles depending on the SNP type sourced from the bitfield
401  // have to create a temp CVariation_ref for this, since CVariation lacks useful
402  // helper methods like SetSNV(), SetMNP(), etc.
403  //!! need to remind Mike to implement those or suggest
404  //!! a helper for CVariation_inst (since this is really the data structure those helpers
405  //!! work with)
406  //!! see email exchange with Mike from 07/15/2011
407  CVariation_ref TmpVarRef;
408  switch(VarClass) {
     // NOTE(review): the case label for this branch is missing from the listing
410  TmpVarRef.SetSNV(alleles, CVariation_ref::eSeqType_na);
411  break;
412
413  case CSnpBitfield::eDips:
414  {
415  // unfortunately, there is not enough information in the annotation to
416  // reliably reconstruct what exactly this DIP represents (either ins, del or delins)
417  // so we will lump them together into an indel
     // all alleles are joined into a single "/"-separated string
418  string sAllelesTogether(NStr::Join(alleles, "/"));
419  TmpVarRef.SetDeletionInsertion(sAllelesTogether, CVariation_ref::eSeqType_na);
420  break;
421  }
     // NOTE(review): the case label for this branch is missing from the listing
423  TmpVarRef.SetMNP(alleles, CVariation_ref::eSeqType_na);
424  break;
425
427  default:
428  {
429  //!! catch-all for the types that we cannot support well yet
430  //!! the using code will have to rely on extraneous info (e.g., a saved bitfield)
431  //!! to recover the original type
432  //!! put alleles inside a list without any sophistication, masked as a MNP
433  //!! this is against the formal rules, but we currently use it as a plain container
434  TmpVarRef.SetMNP(alleles, CVariation_ref::eSeqType_na);
     // the instance type is then overwritten with eType_other
435  TmpVarRef.SetData().SetInstance().SetType(CVariation_ref::TData::TInstance::eType_other);
436  }
437  break;
438  }
     // only instance data is carried over from the temporary into the target object
439  if(TmpVarRef.GetData().IsInstance())
440  pVariation->SetData().SetInstance().Assign(TmpVarRef.GetData().GetInstance());
441
     // NOTE(review): the result of NSnp::GetTag() is dereferenced without a null check;
     // presumably a non-zero rsid (checked above) implies that the tag exists -- confirm
442  pVariation->SetId().Assign(*NSnp::GetTag(SrcFeat));
443
444  CVariantProperties& prop(pVariation->SetVariant_prop());
445  DecodeBitfield(prop, bf);
446
447  pVariation->SetDescription("SNP data");
448  return true;
449 }
450 
453 
454 #endif // SNP_UTIL___SNP_UTILS__HPP
455 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
Definition: Dbtag.hpp:53
CMappedFeat –.
Definition: mapped_feat.hpp:59
CScope –.
Definition: scope.hpp:92
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSnpBitfield is a facade for representing any version of the SNP bitfield.
bool isGood() const
EVariationClass GetVariationClass() const
CTime –.
Definition: ncbitime.hpp:296
void SetDeletionInsertion(const string &sequence, ESeqType seq_type)
Make this variant an insertion.
void SetSNV(const CSeq_data &nucleotide, CRef< CDelta_item > offset=null)
void SetMNP(const CSeq_data &nucleotide, TSeqPos length, CRef< CDelta_item > offset=null)
set of functions for dealing with SNP represented as variation objects
Definition: snp_utils.hpp:250
static void GetDeltas(list< string > &Alleles, const TVariation *pVariation)
add alleles to a list of strings from deltas in variation data
Definition: snp_utils.hpp:329
static bool x_CommonConvertFeat(TPVariation pVariation, const CSeq_feat &SrcFeat)
Definition: snp_utils.hpp:379
static const string sResourceLink_RsID
Definition: snp_utils.hpp:281
ESNPPropTypes
enums to control getting a string list representation of various CVariantProperties
Definition: snp_utils.hpp:285
@ eSNPPropName_GeneLocation
prop.gene-location
Definition: snp_utils.hpp:287
@ eSNPPropName_ResourceLink
prop.resource-link
Definition: snp_utils.hpp:292
@ eSNPPropName_QualityCheck
prop.quality-check
Definition: snp_utils.hpp:291
@ eSNPPropName_Mapping
prop.mapping
Definition: snp_utils.hpp:289
@ eSNPPropName_FreqValidation
prop.frequence-based-validation
Definition: snp_utils.hpp:290
@ eSNPPropName_Effect
prop.effect
Definition: snp_utils.hpp:288
static void DecodeBitfield(CVariantProperties &prop, const CSnpBitfield &bf)
convert SNP bitfield data to respective fields in CVariantProperties
Definition: snp_utils.cpp:379
NSnp –.
Definition: snp_utils.hpp:63
int TClinSigID
Definition: snp_utils.hpp:76
static TRsid GetRsid(const CMappedFeat &mapped_feat)
Return rsid of SNP.
Definition: snp_utils.cpp:109
ELetterCase
controls the case of strings returned from ClinSigAsString()
Definition: snp_utils.hpp:211
@ eLetterCase_ForceLower
always use lower case only
Definition: snp_utils.hpp:212
CPhenotype::EClinical_significance EClinSigID
Definition: snp_utils.hpp:75
static CConstRef< CDbtag > GetTag(const CSeq_feat &SrcFeat)
find a SNP tag in the feature returns NULL if no such tag (sm_dbTag_dbSNP)
Definition: snp_utils.cpp:77
vector< string > TAlleles
list of alleles belonging to particular SNP a deletion is represented by a "-"
Definition: snp_utils.hpp:188
static bool IsSnp(const CMappedFeat &mapped_feat)
Determine if feature is a SNP.
Definition: snp_utils.cpp:62
CObject_id::TId8 TRsid
Definition: snp_utils.hpp:70
static void GetAlleles(const CMappedFeat &mapped_feat, TAlleles &Alleles)
Return list of alleles encoded in qual.
Definition: snp_utils.cpp:237
static CSnpBitfield GetBitfield(const CMappedFeat &)
Return bitfield information stored in the feature.
Definition: snp_utils.cpp:220
static const string sm_dbTag_dbSNP
Definition: snp_utils.hpp:67
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
const TPrim & Get(void) const
Definition: serialbase.hpp:347
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
#define NCBI_SNPUTIL_EXPORT
Definition: ncbi_export.h:536
const TIupacaa & GetIupacaa(void) const
Get the variant data.
Definition: Seq_data_.hpp:530
bool IsIupacaa(void) const
Check if variant Iupacaa is selected.
Definition: Seq_data_.hpp:524
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
bool IsIupacna(void) const
Check if variant Iupacna is selected.
Definition: Seq_data_.hpp:504
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
const TInstance & GetInstance(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
void SetData(TData &value)
Assign a value to Data data member.
const TData & GetData(void) const
Get the Data member data.
bool IsInstance(void) const
Check if variant Instance is selected.
const TLiteral & GetLiteral(void) const
Get the variant data.
EClinical_significance
does this variant have known clinical significance?
Definition: Phenotype_.hpp:90
@ e_This
same location as variation-ref itself
AutoArray< int > TDelta
container for mass deltas
Definition: msladder.hpp:62
const char * tag
Defines: CTimeFormat - storage class for time format.
CSeq_id_Mapper TInstance
Modified on Tue May 21 11:01:31 2024 by modify_doxy.py rev. 669887