NCBI C++ ToolKit
variation_util2.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: variation_util2.hpp 87190 2019-08-05 00:34:23Z vakatov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Description:
27 *
28 * Utility functions pretaining to Variation
29 *
30 * ===========================================================================
31 */
32 
33 #ifndef VARIATION_UTIL2_HPP_
34 #define VARIATION_UTIL2_HPP_
35 
36 #include <corelib/ncbiobj.hpp>
37 #include <corelib/ncbistd.hpp>
38 
44 #include <objects/seq/Seq_data.hpp>
45 
49 
53 
54 #include <util/rangemap.hpp>
55 
56 #include <objmgr/scope.hpp>
57 #include <objmgr/feat_ci.hpp>
58 #include <objmgr/seq_vector.hpp>
60 #include <objmgr/util/feature.hpp>
61 
64 
65 namespace variation {
66 
67 class CVariationUtil : public CObject
68 {
69 public:
70  enum EOptions
71  {
72  fOpt_cache_exon_sequence = 1, ///< Use when there will be many calls to calculate protein consequnece per sequence
73  fOpt_default = 0
74  };
75  typedef int TOptions;
76 
78  : m_scope(&scope)
80  , m_cdregion_index(scope, options)
81  {}
82 
83  void ClearCache()
84  {
87  }
88 
91 
92 
93 /// Methods to remap a VariantPlacement
94 
95  /*!
96  * Remap using provided alignment. Ids of at least one row must match.
97  * When remapping between transcript/genomic coordiane systems, this method should be used
98  * instead of the mapper-based, as if the location is intronic, the method
99  * will use the spliced-alignment to create or resolve offset-placement as appropriate.
100 
101  RSNP-2558 - Sometimes we don't need to check the validity of resulting placements, so this costly step can
102  be skipped
103  */
104  CRef<CVariantPlacement> Remap(const CVariantPlacement& p, const CSeq_align& aln, bool check_placements = true);
105 
106  // Remap a placement with a specified mapper. If a cdna->genomic or genomic->cdna mapping
107  // is attempted, this will throw.
109 
110 
111  /// Remap variation from product coordinates onto a nucleotide sequence on which this product is annotated.
112  /// e.g. {NM,NR,NP} -> {NG,NT,NW,AC,NC}. Use annotated seq-align, if available; otherwise use spliced rna or cdregion.
113  /// If starting from NP, remap to the parent mRNA first.
115 
116 
117 /// Methods to convert between nucleotide and protein
118 
119  /// Convert protein-variation (single-AA missense/nonsense) to nuc-variation on the precursor in the parent nuc-prot.
120  typedef int TAA2NAFlags;
122  {
124  fAA2NA_default = 0
125  };
127 
128 
129  /// Evaluate protein effect of a single-inst @ single-placement
130  ///
131  /// Two placements will be added: The protein placement, and an additional placement
132  /// on the original nucleotide level, representing the location and sequence of affected codons.
133  ///
134  /// The inst will be at the nucleotide level (describe variant codons). Rationale: the user may
135  /// want to know the codons, or translate it to AA if necessary.
136  ///
137  /// Only a subset of sequence edits is supported (mismatch, ins, del, indel)
138  /// Nuc-variation's placement must fall within cds-feat location.
139  CRef<CVariation> TranslateNAtoAA(const CVariation_inst& nuc_inst, const CVariantPlacement& p, const CSeq_feat& cds_feat);
140 
141 
142  /// Find the CDSes for the first placement; Compute prot consequence using TranslateNAtoAA for each
143  /// and attach results to nuc_variation.consequnece.
144  /// If seq-id specified, use only the placement with the specified id.
145  /// If ignore_genomic set to true, then consequences will be evaluated for cDNA or MT placements only.
146  /// Note: Alternatively, the API could be "create and return consequence protein variation(s)", rather than attach,
147  /// but that for hierarchical input it would be hard to tell which consequence corresponds to which node.
148  void AttachProteinConsequences(CVariation& nuc_variation, const CSeq_id* = NULL, bool ignore_genomic = false);
149 
150 
151 ///Other utility methods:
152 
153  /// Flipping strand involves flipping all placements, and insts.
154  /// Note: for insertions, the placement(s) must be dinucleotide where
155  /// insertion occurs, such that semantic correctness is maintained.
156  void FlipStrand(CVariation& v) const;
157 
158  /// Flipping a placement involves flipping strand on the seq-loc and swapping the offsets (and signs).
159  /// If there's a seq-literal, it is reverse-complemented.
160  void FlipStrand(CVariantPlacement& vp) const;
161 
162  /// Flip strand on delta-items and reverse the order.
163  void FlipStrand(CVariation_inst& vi) const;
164 
165  /// Reverse-complement seq-literals, flip strand on the seq-locs.
166  void FlipStrand(CDelta_item& di) const;
167 
168  enum { kMaxAttachSeqLen = 100000} ; //VAR-724
169  /// If have offsets (intronic) or too long, return false;
170  /// else set seq field on the placement and return true.
172 
173  //Call AttachSeq on every placement; Try to find asserted-type subvariation.
174  //If the asserted sequence is different from the attached reference, add
175  //corresponding exception object to the placement. Return true iff there were no exceptions added
176  bool AttachSeq(CVariation& v, TSeqPos max_len = kMaxAttachSeqLen);
177 
178 
180 
181  /*!
182  * If placement is not intronic, or alignment is not spliced-seg -> eNotApplicable
183  * Else if variation is intronic but location is not at exon boundary -> eFail
184  * Else -> ePass
185  */
187  {
191  };
193 
194  // Similar to above, except verify exon boundary using exon annotation on refseq transcripts, if exists
196 
197  /// if placement is invalid SeqLocCheck fails, or offsets out of order, attach VariationException and return true
198  /// otherwise return false;
200 
201  /// if variation.data contains a seq-literal with non-ACGT residues, attach VariationException to the first placement
202  /// (if exists) and return true. otherwise return false;
204 
205  /// Calculate upstream (first) and downstream(second) flanks for loc
206  struct SFlankLocs
207  {
210  };
212 
213 
214 /// Methods to compute properties
215 
216  /*!
217  * Fill location-specific variant-properties for a placement.
218  * Attach related GeneID(s) to placement with respect to which the properties are defined.
219  */
220  void SetPlacementProperties(CVariantPlacement& placement);
221 
222  /*
223  * Set placement-specific variant properties for a variation by applying
224  * x_SetVariantProperties(...) on each found placement and collapse those
225  * onto parent VariationProperties
226  */
228 
229 
230  /// Supported SO-terms
231  enum ESOTerm
232  {
233  //location-specific terms
246 
247  //variation-specific terms
253  eSO_stop_lost =1578
254  };
255  typedef vector<ESOTerm> TSOTerms;
256  void AsSOTerms(const CVariantProperties& p, TSOTerms& terms);
257  static string AsString(ESOTerm term);
258 
259 
260  /// Find location properties based on alignment.
261  void FindLocationProperties(const CSeq_align& transcript_aln,
262  const CSeq_loc& query_loc,
263  TSOTerms& terms);
264 
265 
266  static void s_FindLocationProperties(CConstRef<CSeq_loc> rna_loc,
267  CConstRef<CSeq_loc> cds_loc,
268  const CSeq_loc& query_loc,
269  TSOTerms& terms);
270 
271 
272  static TSeqPos s_GetLength(const CVariantPlacement& p, CScope* scope);
273 
274  /// If at any level in variation-set all variations have all same placements, move them to the parent level.
275  static void s_FactorOutPlacements(CVariation& v);
276 
277  /*!
278  * If this variation has placements defined, return it; otherwise, recursively try
279  * parent all the way to the root; return NULL if nothing found.
280  * Precondition: root Variation must have been indexed (links to parents defined as necessary)
281  */
282  static const CVariation::TPlacements* s_GetPlacements(const CVariation& v);
283 
284 
285  /// Find attached consequence variation in v that corresponds to p (has same seq-id).
287 
288 
289  /// Length up to last position of the last exon (i.e. not including polyA)
290  /// If neither is annotated, return normal length
292 
293 
295 private:
296 
297 
299 
300  void ChangeToDelins(CVariation& v);
301 
304 
306 
308 
309  void x_TranslateNAtoAA(CVariation& prot_variation);
310 
312 
313  //return iupacna or ncbieaa literals
315 
317 
318  /*
319  * If we are flipping an insertion, we must verify that the location is a multinucleotide (normally dinucleotide)
320  * (in this case insertion is interpreted as "insert-between", and can be strand-flipped).
321  * It is impossible, however, to correctly flip the strand of insert-before-a-point variation, because
322  * "before" would become "after".
323  */
324  static bool s_IsInstStrandFlippable(const CVariation& v, const CVariation_inst& inst);
325 
326  /// join two seq-literals
328 
329  /// insert seq-literal payload into ref before pos (pos=0 -> prepend; pos=ref.len -> append)
330  static CRef<CSeq_literal> s_SpliceLiterals(const CSeq_literal& payload, const CSeq_literal& ref, TSeqPos pos);
331 
332  static void s_AttachGeneIDdbxref(CVariantPlacement& p, int gene_id);
333 
334  static void s_UntranslateProt(const string& prot_str, vector<string>& codons);
335 
336  static size_t s_CountMatches(const string& a, const string& b);
337 
339  const string& codon_from, //codon on cDNA
340  const string& prot_to, //missense/nonsense AA
341  vector<string>& codons_to); //calculated variation-codon
342 
343  static string s_CollapseAmbiguities(const vector<string>& seqs);
344 
345  /*!
346  * Apply offsets to the variation's location (variation must be inst)
347  * E.g. the original variation could be a transcript location with offsets specifying intronic location.
348  * After original transcript lociation is remapped, the offsets are to be applied to the remapped location
349  * to get absolute offset-free genomic location.
350  */
352 
353  /*!
354  * If start|stop don't fall into an exon, adjust start|stop to the closest exon boundary and add offset to inst.
355  * This is to be applied before remapping a genomic variation to transcript coordinates
356  */
357  static void s_AddIntronicOffsets(CVariantPlacement& p, const CSpliced_seg& ss, CScope* scope);
358 
359  /*!
360  * Extend delins to specified location (adjust location and delta)
361  *
362  * Precondition:
363  * -variation must be a inst-type normalized delins (via x_ChangeToDelins)
364  * -loc must be a superset of variation's location.
365  */
366  void x_AdjustDelinsToInterval(CVariation& v, const CSeq_loc& loc);
367 
368 
369 
370  /*!
371  * Call s_GetPlacements and find first seq-literal in any of them.
372  */
374 
375  /*
376  * Find a sibling variation with with observation=asserted;
377  * if not found, recursively try parent, or return null if root.
378  * Note: Assumes CVariation::Index has been called.
379  */
381 
382 
385 
386 
387  /*
388  * In variation-ref the intronic offsets are encoded as last and/or first inst delta-items.
389  * In Variation the intronic offsets are part of the placement. After creating a variation
390  * from a variation-ref we need to migrate the offsets from inst into placement.
391  */
394 
395 
396 private:
397 
398  /// Cache seq-data in the CDS regions and the cds features by location
400  {
401  public:
402  struct SCdregion
403  {
405  bool operator<(const SCdregion& other) const
406  {
407  return !this->cdregion_feat ? false
408  : !other.cdregion_feat ? true
409  : *this->cdregion_feat < *other.cdregion_feat;
410  }
411  };
412  typedef vector<SCdregion> TCdregions;
413 
414  CCdregionIndex(CScope& scope, TOptions options)
415  : m_scope(&scope)
416  , m_options(options)
417  {}
418 
419  void Get(const CSeq_loc& loc, TCdregions& cdregions);
420 
421  //return NULL if not cached in its entirety
423 
424  void Clear()
425  {
426  m_data.clear();
428  }
429  private:
430 
431  struct SSeqData
432  {
433  mutable CRef<CSeq_loc_Mapper> mapper; //will map original location consisting of merged exon locations on the seq_data coordinate system
434  string seq_data;
435  };
439 
440  void x_Index(const CSeq_id_Handle& idh);
441  void x_CacheSeqData(const CSeq_loc& loc, const CSeq_id_Handle& idh);
442 
447  };
448 
449  /// Given a seq-loc, compute CVariantProperties::TGene_location from annotation.
450  /// Precompute for the whole sequence and keep the cache to avoid objmgr annotation
451  /// lookups on every call.
453  {
454  public:
455  typedef pair<int, CVariantProperties::TGene_location> TGeneIDAndProp;
456  typedef vector<TGeneIDAndProp> TGeneIDAndPropVector;
457 
459  : m_scope(&scope)
460  {}
461 
463  void Clear()
464  {
465  m_loc2prop.clear();
466  }
467 
468  typedef pair<CRef<CSeq_loc>, CRef<CSeq_loc> > TLocsPair; //5' and 3' respectively
469  static TLocsPair s_GetStartAndStopCodonsLocs(const CSeq_loc& cds_loc);
470  static TLocsPair s_GetUTRLocs(const CSeq_loc& cds_loc, const CSeq_loc& parent_loc);
471  static TLocsPair s_GetNeighborhoodLocs(const CSeq_loc& gene_loc, TSeqPos max_pos);
472  static TLocsPair s_GetIntronsAndSpliceSiteLocs(const CSeq_loc& rna_loc);
473  static int s_GetGeneID(const CMappedFeat& mf, feature::CFeatTree& ft);
474  static int s_GetGeneIdForProduct(CBioseq_Handle bsh);
475 
476  private:
477  void x_Index(const CSeq_id_Handle& idh);
478  void x_Add(const CSeq_loc& loc, int gene_id, CVariantProperties::TGene_location prop);
479 
482 
485  };
486 
487 
488 private:
492 };
493 
494 }
495 
497 #endif
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
CMappedFeat –.
Definition: mapped_feat.hpp:59
CObject –.
Definition: ncbiobj.hpp:180
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_loc_Mapper –.
CVariantPlacement –.
CVariation_inst –.
void clear()
Definition: map.hpp:169
Cache seq-data in the CDS regions and the cds features by location.
void Get(const CSeq_loc &loc, TCdregions &cdregions)
map< CSeq_id_Handle, SSeqData > TSeqDataMap
map< CSeq_id_Handle, TRangeMap > TIdRangeMap
CCdregionIndex(CScope &scope, TOptions options)
CRangeMap< TCdregions, TSeqPos > TRangeMap
void x_Index(const CSeq_id_Handle &idh)
CRef< CSeq_literal > GetCachedLiteralAtLoc(const CSeq_loc &loc)
void x_CacheSeqData(const CSeq_loc &loc, const CSeq_id_Handle &idh)
Given a seq-loc, compute CVariantProperties::TGene_location from annotation.
static TLocsPair s_GetIntronsAndSpliceSiteLocs(const CSeq_loc &rna_loc)
pair< CRef< CSeq_loc >, CRef< CSeq_loc > > TLocsPair
void GetLocationProperties(const CSeq_loc &loc, TGeneIDAndPropVector &v)
CRangeMap< TGeneIDAndPropVector, TSeqPos > TRangeMap
pair< int, CVariantProperties::TGene_location > TGeneIDAndProp
static TLocsPair s_GetNeighborhoodLocs(const CSeq_loc &gene_loc, TSeqPos max_pos)
static TLocsPair s_GetUTRLocs(const CSeq_loc &cds_loc, const CSeq_loc &parent_loc)
static TLocsPair s_GetStartAndStopCodonsLocs(const CSeq_loc &cds_loc)
static int s_GetGeneID(const CMappedFeat &mf, feature::CFeatTree &ft)
void x_Add(const CSeq_loc &loc, int gene_id, CVariantProperties::TGene_location prop)
CRef< CVariantPlacement > RemapToAnnotatedTarget(const CVariation &v, const CSeq_id &target)
Remap variation from product coordinates onto a nucleotide sequence on which this product is annotate...
static void s_AddInstOffsetsFromPlacementOffsets(CVariation_inst &vi, const CVariantPlacement &p)
static const CVariation::TPlacements * s_GetPlacements(const CVariation &v)
void FindLocationProperties(const CSeq_align &transcript_aln, const CSeq_loc &query_loc, TSOTerms &terms)
Find location properties based on alignment.
CRef< CSeq_literal > GetLiteralAtLoc(const CSeq_loc &loc)
CRef< CVariation > x_AsVariation(const CVariation_ref &vr)
SFlankLocs CreateFlankLocs(const CSeq_loc &loc, TSeqPos len)
CRef< CVariation > x_CreateUnknownVariation(const CSeq_id &id, CVariantPlacement::TMol mol)
bool CheckAmbiguitiesInLiterals(CVariation &v)
if variation.data contains a seq-literal with non-ACGT residues, attach VariationException to the fir...
static size_t s_CountMatches(const string &a, const string &b)
void x_AdjustDelinsToInterval(CVariation &v, const CSeq_loc &loc)
ETestStatus CheckExonBoundary(const CVariantPlacement &p, const CSeq_align &aln)
static void s_FactorOutPlacements(CVariation &v)
If at any level in variation-set all variations have all same placements, move them to the parent lev...
static CConstRef< CVariation > s_FindConsequenceForPlacement(const CVariation &v, const CVariantPlacement &p)
Find attached consequence variation in v that corresponds to p (has same seq-id).
CRef< CVariation > TranslateNAtoAA(const CVariation_inst &nuc_inst, const CVariantPlacement &p, const CSeq_feat &cds_feat)
Evaluate protein effect of a single-inst @ single-placement.
static string AsString(ESOTerm term)
static CRef< CSeq_literal > s_SpliceLiterals(const CSeq_literal &payload, const CSeq_literal &ref, TSeqPos pos)
insert seq-literal payload into ref before pos (pos=0 -> prepend; pos=ref.len -> append)
void x_SetVariantPropertiesForIntronic(CVariantPlacement &p, int offset, const CSeq_loc &loc, CBioseq_Handle &bsh)
bool AttachSeq(CVariantPlacement &p, TSeqPos max_len=kMaxAttachSeqLen)
If have offsets (intronic) or too long, return false; else set seq field on the placement and return ...
CRef< CVariation > AsVariation(const CSeq_feat &variation_ref)
@ fOpt_cache_exon_sequence
Use when there will be many calls to calculate protein consequnece per sequence.
void x_ChangeToDelins(CVariation &v)
void x_SetVariantProperties(CVariantProperties &p, const CVariation_inst &vi, const CSeq_loc &loc)
CVariationUtil(CScope &scope, TOptions options=fOpt_default)
bool CheckPlacement(CVariantPlacement &p)
if placement is invalid SeqLocCheck fails, or offsets out of order, attach VariationException and ret...
static void s_AddIntronicOffsets(CVariantPlacement &p, const CSpliced_seg &ss, CScope *scope)
CVariantPlacement::TMol GetMolType(const CSeq_id &id)
void x_InferNAfromAA(CVariation &v, TAA2NAFlags flags)
static string s_CollapseAmbiguities(const vector< string > &seqs)
static void s_FindLocationProperties(CConstRef< CSeq_loc > rna_loc, CConstRef< CSeq_loc > cds_loc, const CSeq_loc &query_loc, TSOTerms &terms)
void ChangeToDelins(CVariation &v)
static void s_ResolveIntronicOffsets(CVariantPlacement &p)
static void s_UntranslateProt(const string &prot_str, vector< string > &codons)
void AsSOTerms(const CVariantProperties &p, TSOTerms &terms)
static void s_AttachGeneIDdbxref(CVariantPlacement &p, int gene_id)
static const CConstRef< CSeq_literal > s_FindFirstLiteral(const CVariation &v)
CRef< CSeq_literal > x_GetLiteralAtLoc(const CSeq_loc &loc)
CRef< CVariation > InferNAfromAA(const CVariation &prot_variation, TAA2NAFlags flags=fAA2NA_default)
void FlipStrand(CVariation &v) const
Other utility methods:
void SetPlacementProperties(CVariantPlacement &placement)
Methods to compute properties.
CVariantPropertiesIndex m_variant_properties_index
CRef< CVariation_ref > x_AsVariation_ref(const CVariation &v, const CVariantPlacement &p)
CRef< CVariantPlacement > Remap(const CVariantPlacement &p, const CSeq_align &aln, bool check_placements=true)
Methods to remap a VariantPlacement.
static bool s_IsInstStrandFlippable(const CVariation &v, const CVariation_inst &inst)
static TSeqPos s_GetLength(const CVariantPlacement &p, CScope *scope)
ESOTerm
Supported SO-terms.
TSeqPos GetEffectiveTranscriptLength(const CBioseq_Handle &bsh)
Length up to last position of the last exon (i.e.
void AttachProteinConsequences(CVariation &nuc_variation, const CSeq_id *=NULL, bool ignore_genomic=false)
Find the CDSes for the first placement; Compute prot consequence using TranslateNAtoAA for each and a...
void x_TranslateNAtoAA(CVariation &prot_variation)
CConstRef< CSeq_literal > x_FindOrCreateLiteral(const CVariation &v)
static void s_ConvertInstOffsetsToPlacementOffsets(CVariation &v, CVariantPlacement &p)
static const CConstRef< CSeq_literal > s_FindAssertedLiteral(const CVariation &v)
void AsVariation_feats(const CVariation &v, CSeq_annot::TData::TFtable &feats)
void SetVariantProperties(CVariation &v)
static CRef< CSeq_literal > s_CatLiterals(const CSeq_literal &a, const CSeq_literal &b)
join two seq-literals
int TAA2NAFlags
Methods to convert between nucleotide and protein.
void s_CalcPrecursorVariationCodon(const string &codon_from, const string &prot_to, vector< string > &codons_to)
CRef< CVariantPlacement > x_Remap(const CVariantPlacement &p, CSeq_loc_Mapper &mapper)
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
int offset
Definition: replacements.h:160
const TResidue codons[4][4]
Definition: gnomon_seq.cpp:76
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NULL
Definition: ncbistd.hpp:225
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CVariantPlacement > > TPlacements
int len
unsigned int a
Definition: ncbi_localip.c:102
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Calculate upstream (first) and downstream(second) flanks for loc.
USING_SCOPE(objects)
Modified on Fri Sep 20 14:57:07 2024 by modify_doxy.py rev. 669887