NCBI C++ ToolKit
snp_bins.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: snp_bins.cpp 99445 2023-03-29 13:18:42Z rudnev $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Melvin Quintos, Dmitry Rudnev
27  *
28  * File Description:
29  * Implements the functions in snp_bins.hpp
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <objmgr/annot_ci.hpp>
35 #include <objmgr/feat_ci.hpp>
36 #include <objmgr/graph_ci.hpp>
37 #include <objmgr/align_ci.hpp>
38 #include <objmgr/table_field.hpp>
39 
44 
46 
49 
50 #include <util/checksum.hpp>
51 
52 #include <cmath>
53 
56 
59 
60 
// Body of NSnpBins::SourceAsString(TSource Source): returns the human-readable
// label for a source code.
// NOTE(review): this listing has lost the signature line (61) and line 69,
// which presumably carries the case label for eSource_NHLBI_GRASP -- confirm
// against the repository copy before editing.
62 {
63  switch(Source)
64  {
65  case eSource_dbGAP:
66  return "dbGaP";
67  case eSource_NHGRI_GWAS:
68  return "NHGRI GWAS catalog";
70  return "NHLBI GRASP";
71  }
// any value not handled by the switch falls back to the dbGaP label
72  return "dbGaP";
73 }
74 
// Copy the title and comment descriptors of a Seq-annot into the output strings.
//   handle  - annotation handle whose descriptors are inspected
//   title   - receives the text of a title descriptor, if one is seen
//   comment - receives the text of a comment descriptor, if one is seen
// Both outputs are left untouched when the annotation cannot provide
// descriptors or carries none of the matching kind.
75 void NSnpBins::ReadAnnotDesc(const CSeq_annot_Handle& handle, string& title, string& comment)
76 {
77  if (handle.Seq_annot_CanGetDesc()) {
78  // Extract the Annotations properties
// NOTE(review): line 79 (the loop header that declares `it`) is missing from
// this listing; the statements below run once per descriptor.
80  const CAnnotdesc &desc = **it;
81 
// a descriptor is either a comment or a title here; other kinds are ignored
82  if (desc.IsComment()) {
83  comment = desc.GetComment();
84  }
85  else if (desc.IsTitle()) {
86  title = desc.GetTitle();
87  }
88  }
89  }
90 }
91 
92 
93 // get a selector for a bin given a NA track accession with some selector parameters
//   sTrackAccession - NA track accession; registered on the selector both as a
//                     named-annot accession and as a named annot
//   isAdaptive      - true: adaptive depth; false: exact depth (only when depth >= 0)
//   depth           - resolve depth; a negative value means "do not set a depth"
//   sel             - selector that is configured in place
94 void NSnpBins::GetBinSelector(const string& sTrackAccession,
95  bool isAdaptive,
96  int depth,
97  SAnnotSelector& sel)
98 {
// NOTE(review): lines 99-100 are missing from this listing (presumably further
// selector setup) -- confirm against the repository copy.
101  sel.IncludeNamedAnnotAccession(sTrackAccession);
102  sel.AddNamedAnnots(sTrackAccession);
103  // copied from CSeqUtils::SetResolveDepth()
104  if(isAdaptive) {
105  sel.SetAdaptiveDepth(true);
106  sel.SetExactDepth(false);
107  //!!: watch out
108  // Maybe there is bug inside selector, we have to call SetResolveAll() even
109  // for cases where we only want to resolve up to a given depth.
110  sel.SetResolveAll();
111  if (depth >=0) sel.SetResolveDepth(depth);
112  } else if (depth >= 0){
113  sel.SetResolveDepth(depth);
114  sel.SetExactDepth(true);
115  sel.SetAdaptiveDepth(false);
116  }
// non-adaptive with a negative depth: none of the depth settings are touched
117 }
118 
119 // get an annotation handle that is needed to load a bin from an existing selector and loc and bioseq handle
120 // returns false unless the search finds exactly one Seq-annot (none or several both fail);
// `handle` is assigned only on success
// NOTE(review): line 121 (the first line of the signature, which declares
// `scope`) is missing from this listing.
122  const SAnnotSelector& sel,
123  const CSeq_loc &loc,
124  CSeq_annot_Handle& handle)
125 {
126  CAnnot_CI iter(scope, loc, sel);
// size() is the number of collected Seq-annots; anything but one is rejected
127  if(iter.size() != 1) {
128  return false;
129  }
130  handle = *iter;
131  return true;
132 }
133 
134 
135 
136 
137 // choose a more significant entry of the two offered
138 // returns 1 if entry1 is more significant, 2 otherwise
// both pointers are dereferenced without a null check, so callers must pass valid entries
139 int NSnpBins::ChooseSignificant(const SBinEntry* entry1, const SBinEntry* entry2, TBinType type)
140 {
141  // significance is determined using different metrics depending on the bin type
142  // for eCLIN, the most significant is pathogenic, then probably pathogenic, then everything else
143  // for all other bins, the entry with the larger -log10(pvalue) wins (i.e. the smaller non-zero pvalue);
// a pvalue of 0 scores 0, and on equal scores entry2 is chosen
144  if(type == eCLIN) {
// NOTE(review): lines 145-146 (the eCLIN comparison that feeds this ternary)
// are missing from this listing.
147  ? 1
148  : 2;
149  } else {
150  return ((entry1->pvalue ? -log10(entry1->pvalue) : 0.) > (entry2->pvalue ? -log10(entry2->pvalue) : 0.))
151  ? 1
152  : 2;
153  }
154 }
155 
156 
// Build a single bin from the table rows of `annot` that fall into `range`.
// The bin records the range, the annotation title, the track type read from
// row 0, every entry in the row window, and the most significant entry.
// NOTE(review): line 158 (the rest of the signature, which declares `range`)
// is missing from this listing.
157 CRef<NSnpBins::SBin> NSnpBins::GetBin(const objects::CSeq_annot_Handle& annot,
159 {
160  const CTableFieldHandle<int> col_type("trackType");
161  int pos_start, pos_end;
162  int type;
163  string title, comment;
164  CRef<SBin> res(new SBin);
// translate the sequence range into a [pos_start, pos_end) window of table rows
165  FindPosIndexRange(annot, (int)range.GetFrom(), (int)range.GetTo(), pos_start, pos_end);
166  ReadAnnotDesc(annot, title, comment);
167  if (!col_type.TryGet(annot, 0, type)) {
// NOTE(review): line 168 is missing from this listing; it presumably assigns a
// default bin type, since `type` is otherwise uninitialized on this path.
169  }
170  res->count = 0;
171  res->range = range;
172  res->title = title;
173  res->type = type;
174 
// NOTE(review): GetEntry appears to return an empty CRef when a row has no
// "pos" value; such an entry would still be compared (and dereferenced) in
// ChooseSignificant, appended and counted -- confirm whether that can occur.
175  for(int row = pos_start; row < pos_end; ++row ) {
176  CRef<NSnpBins::SBinEntry> BinEntry(GetEntry(annot, row));
177  if(res->m_SigEntry.Empty()) {
// first entry seen becomes the initial "most significant" one
178  res->m_SigEntry = BinEntry;
179  } else {
180  if(ChooseSignificant(res->m_SigEntry, BinEntry, type) == 2) {
181  res->m_SigEntry = BinEntry;
182  }
183  }
184  res->m_EntryList.push_back(BinEntry);
185  res->count++;
186  }
187  return res;
188 }
189 
// Read one table row of `annot` into a freshly allocated SBinEntry.
//   annot - handle of the annotation holding the table
//   row   - zero-based row index
// The entry is created only when the "pos" column can be read for the row;
// every other column is optional and replaced by a default when absent
// ("" for strings, 0 for snpid and pvalue, -1 for ClinSigID/source/geneId,
// kInvalidSeqPos for pos_end).
190 CRef<NSnpBins::SBinEntry> NSnpBins::GetEntry(const objects::CSeq_annot_Handle& annot,
191  int row)
192 {
193  const CTableFieldHandle<int> col_pos("pos");
194  const CTableFieldHandle<int> col_pos_end("pos_end");
// the p-value column is looked up under two spellings
195  const CTableFieldHandle<double> col_val("pvalue");
196  const CTableFieldHandle<double> col_val_synonym("Pvalue");
197  const CTableFieldHandle<string> col_trait("trait");
198  const CTableFieldHandle<string> col_pmids("pmids");
199  const CTableFieldHandle<string> col_rgenes("reportedGenes");
200  const CTableFieldHandle<string> col_mgenes("mappedGenes");
201  const CTableFieldHandle<int> col_snpid("snpId");
202  const CTableFieldHandle<string> col_sub_type("trackSubType");
203  const CTableFieldHandle<int> col_clinsigid("clinSigID");
204  const CTableFieldHandle<string> col_hgvs("HGVS");
205  const CTableFieldHandle<string> col_dbgaptext("dbgaptext");
206  const CTableFieldHandle<string> col_context("context");
207  const CTableFieldHandle<int> col_source("source");
208  const CTableFieldHandle<string> col_population("population");
// the same "geneId" column is addressed both as int and as string (see the try/catch below)
209  const CTableFieldHandle<int> col_geneId("geneId");
210  const CTableFieldHandle<string> col_geneStringId("geneId");
211  const CTableFieldHandle<string> col_geneName("geneName");
212 
213  string trackSubType;
214  int pos, pos_end;
215  int snpid, ClinSigID;
216  double pvalue;
217  string trait, pmids, rgenes, mgenes;
// NOTE(review): `title` and `comment` are not used anywhere in the visible body
218  string title, comment, population;
219  string sHGVS;
220  int source, geneId;
221  string geneStringId;
222  string dbgaptext, geneName;
223  string context;
224 
// NOTE(review): line 225 (the declaration of `entry`) is missing from this listing.
226  if(col_pos.TryGet(annot, row, pos)) {
227  entry.Reset(new NSnpBins::SBinEntry());
228  entry->pos = (TSeqPos)pos;
229  entry->pos_end = col_pos_end.TryGet(annot, row, pos_end) ? (TSeqPos)pos_end : kInvalidSeqPos;
230  entry->trackSubType = col_sub_type.TryGet(annot, row, trackSubType) ? trackSubType : "";
231  entry->snpid = col_snpid.TryGet(annot, row, snpid) ? (unsigned int)snpid : 0;
232  entry->pvalue = (col_val.TryGet(annot, row, pvalue) || col_val_synonym.TryGet(annot, row, pvalue)) ? pvalue : 0;
233  entry->trait = col_trait.TryGet(annot, row, trait) ? trait : "";
234  entry->pmids = col_pmids.TryGet(annot, row, pmids) ? pmids : "";
235  entry->genes_reported = col_rgenes.TryGet(annot, row, rgenes) ? rgenes : "";
236  entry->genes_mapped = col_mgenes.TryGet(annot, row, mgenes) ? mgenes : "";
237  entry->ClinSigID = col_clinsigid.TryGet(annot, row, ClinSigID) ? ClinSigID : -1;
238  entry->sHGVS = col_hgvs.TryGet(annot, row, sHGVS) ? sHGVS : "";
239  entry->dbgaptext = col_dbgaptext.TryGet(annot, row, dbgaptext) ? dbgaptext : "";
240  entry->context = col_context.TryGet(annot, row, context) ? context : "";
241  entry->source = col_source.TryGet(annot, row, source) ? source : -1;
242  entry->population = col_population.TryGet(annot, row, population) ? population : "";
243  entry->geneName = col_geneName.TryGet(annot, row, geneName) ? geneName : "";
// Try the integer reading of "geneId" first; if that throws, fall back to the
// string reading. NOTE(review): presumably the throw signals a string-typed
// column -- confirm; catch(...) also hides any unrelated failure, and on the
// fallback path entry->geneId is not assigned here.
244  try {
245  entry->geneId = col_geneId.TryGet(annot, row, geneId) ? geneId : -1;
246  } catch(...) {
247  entry->geneStringId = col_geneStringId.TryGet(annot, row, geneStringId) ? geneStringId : "";
248  }
249  }
// still empty when "pos" could not be read for this row
250  return entry;
251 }
252 
253 void NSnpBins::CGeneMap::x_Init(const string& sSrc)
254 {
255  m_GeneMap.clear();
256  list<string> GeneSymIDPairsList;
257 
258  NStr::Split(sSrc, ":", GeneSymIDPairsList, NStr::fSplit_Tokenize);
259 
260  ITERATE(list<string>, iGeneSymIDPairsList, GeneSymIDPairsList) {
261  list<string> GeneSymIDPair;
262  NStr::Split(*iGeneSymIDPairsList, "^", GeneSymIDPair, NStr::fSplit_Tokenize);
263  m_GeneMap[GeneSymIDPair.front()] = GeneSymIDPair.size() == 2 ? GeneSymIDPair.back() : string();
264  }
265 }
266 
// Body of CGeneMap::AsString() const: serializes m_GeneMap as
// "first^second" items joined by ':', in the map's iteration order.
// An entry with an empty second part is still written with its trailing '^'.
// NOTE(review): the signature line (267) is missing from this listing.
268 {
269  string sRes;
270 
271  ITERATE(TGeneMap, iGeneMap, m_GeneMap) {
// ':' is emitted only between items, never before the first one
272  sRes += (sRes.empty() ? "" : ":") + iGeneMap->first + "^" + iGeneMap->second;
273  }
274 
275  return sRes;
276 }
277 
// Find the window of table rows whose positions fall into
// [pos_value_from, pos_value_to], using two iterative binary searches that are
// each followed by a linear slide over rows with equal positions.
//   pos_value_from, pos_value_to - position range to look up
//   pos_index_begin - receives the first row of the window
//   pos_index_end   - receives the row index one past the last row of the window
// The binary searches presume the rows are sorted by position in ascending
// order -- TODO confirm against the table producer.
// NOTE(review): line 278 (the first line of the signature, which declares
// `annot`) is missing from this listing.
279  int pos_value_from, int pos_value_to,
280  int& pos_index_begin, int& pos_index_end)
281 {
282  size_t rows = annot.GetSeq_tableNumRows();
283  const CTableFieldHandle<int> col_pos("pos");
284  const CTableFieldHandle<string> col_sub_type("trackSubType");
285 
286  pos_index_begin = -1;
287  pos_index_end = rows-1;
288 
// the sub-type is sampled from row 0 only and decides which column bounds the start
289  string trackSubType;
290  col_sub_type.TryGet(annot, 0, trackSubType);
291 
// gene markers are matched against their end position ("pos_end"), everything else against "pos"
292  const CTableFieldHandle<int> col_pos_end(NSnpBins::isGeneMarker(trackSubType) ? "pos_end" : "pos");
293 
294 
// NOTE(review): the TryGet() results below are ignored, so a failed read leaves
// pos_value_k at its previous value; with an empty table (rows == 0) the
// do-while loops still probe row 0 once -- confirm both are acceptable.
295  // Find 'pos_value_from'
296  int lower_pos_index_bound = 0;
297  int upper_pos_index_bound = rows-1;
298  int pos_index_k = 0;
299  int pos_value_k = 0;
300  do {
301  pos_index_k = (lower_pos_index_bound + upper_pos_index_bound)/2;
302  col_pos_end.TryGet(annot, pos_index_k, pos_value_k);
303  if(pos_value_k < pos_value_from) {
// remember the last row known to lie before the range
304  pos_index_begin = pos_index_k;
305  lower_pos_index_bound = pos_index_k+1;
306  } else {
307  upper_pos_index_bound = pos_index_k-1;
308  }
309  } while (pos_value_k != pos_value_from && lower_pos_index_bound <= upper_pos_index_bound);
310 
311  // position start to be inclusive (catch boundary condition)
312  pos_index_begin = (pos_value_from == pos_value_k ? pos_index_k : pos_index_begin+1);
313 
314  // slide the start down for cases when there are several entries with the same position
315  int SlidingBegin(pos_index_begin-1);
316  while(SlidingBegin >= 0) {
317  col_pos_end.TryGet(annot, SlidingBegin, pos_value_k);
318  if(pos_value_k < pos_value_from)
319  break;
320  pos_index_begin = SlidingBegin;
321  --SlidingBegin;
322  }
323 
// the end of the window is always searched in the "pos" column
324  // find the 'pos_value_to' value
325  lower_pos_index_bound=0;
326  upper_pos_index_bound=rows-1;
327  pos_value_k = 0;
328  do {
329  pos_index_k = (lower_pos_index_bound+upper_pos_index_bound)/2;
330  col_pos.TryGet(annot, pos_index_k, pos_value_k);
331  if (pos_value_k < pos_value_to) {
332  lower_pos_index_bound = pos_index_k+1;
333  }
334  else {
// remember the lowest row found so far whose position is >= pos_value_to
335  pos_index_end = pos_index_k;
336  upper_pos_index_bound = pos_index_k-1;
337  }
338  } while (pos_value_k != pos_value_to && lower_pos_index_bound <= upper_pos_index_bound);
339 
340  // increase end to include in range up until the latest entry with "pos".
// each row with pos <= pos_value_to moves the end to one past that row
341  size_t SlidingEnd(pos_index_end);
342  while(SlidingEnd < rows) {
343  col_pos.TryGet(annot, SlidingEnd, pos_value_k);
344  if(pos_value_k > pos_value_to)
345  break;
346  pos_index_end = SlidingEnd+1;
347  ++SlidingEnd;
348  }
349 }
350 
351 
352 
353 
355 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Checksum and hash calculation classes.
CAnnot_CI –.
Definition: annot_ci.hpp:59
CAnnotdesc –.
Definition: Annotdesc.hpp:66
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CSeq_annot_Handle –.
bool TryGet(const CFeat_CI &feat_ci, Value &v) const
Definition: table_field.hpp:78
void x_Init(const string &sSrc)
Definition: snp_bins.cpp:253
string AsString() const
recreate the string that was used for creation
Definition: snp_bins.cpp:267
TGeneMap m_GeneMap
Definition: snp_bins.hpp:140
static int ChooseSignificant(const SBinEntry *entry1, const SBinEntry *entry2, TBinType type)
choose a more significant entry of the two offered
Definition: snp_bins.cpp:139
static void GetBinSelector(const string &sTrackAccession, bool isAdaptive, int depth, objects::SAnnotSelector &sel)
get a selector for a bin from a NA track accession
Definition: snp_bins.cpp:94
int TSource
Definition: snp_bins.hpp:68
static bool GetBinHandle(objects::CScope &scope, const objects::SAnnotSelector &sel, const objects::CSeq_loc &loc, objects::CSeq_annot_Handle &annot)
get an annotation handle that is needed to load a singular bin on range
Definition: snp_bins.cpp:121
int TBinType
Definition: snp_bins.hpp:60
static CRef< SBin > GetBin(const objects::CSeq_annot_Handle &annot, TSeqRange range)
get a singular bin corresponding to a position range
Definition: snp_bins.cpp:157
static void ReadAnnotDesc(const objects::CSeq_annot_Handle &handle, string &title, string &comment)
get title and comment out of annot.desc
Definition: snp_bins.cpp:75
@ eSource_NHGRI_GWAS
Definition: snp_bins.hpp:65
@ eSource_NHLBI_GRASP
Definition: snp_bins.hpp:66
@ eSource_dbGAP
Definition: snp_bins.hpp:64
@ eCLIN
Clinical Variations.
Definition: snp_bins.hpp:56
@ eGAP
dbGaP analysis files
Definition: snp_bins.hpp:54
static bool isGeneMarker(const string &trackSubType)
determine whether a string in TrackSubType describes a Gene Marker ("102_1" or "102_3")
Definition: snp_bins.hpp:226
static void FindPosIndexRange(const objects::CSeq_annot_Handle &annot, int pos_value_from, int pos_value_to, int &pos_index_begin, int &pos_index_end)
Perform iterative binary search to find table indexes (rows) 'pos_index_begin' and 'pos_index_end' in...
Definition: snp_bins.cpp:278
static string SourceAsString(TSource Source)
get human-readable text for various source types
Definition: snp_bins.cpp:61
static CRef< SBinEntry > GetEntry(const objects::CSeq_annot_Handle &annot, int row)
get a bin entry corresponding to a row position in the table presumed contained within the handle
Definition: snp_bins.cpp:190
size_type size() const
Definition: map.hpp:148
void clear()
Definition: map.hpp:169
static unsigned char depth[2 *(256+1+29)+1]
static int type
Definition: getdata.c:31
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
string
Definition: cgiapp.hpp:687
size_t GetSeq_tableNumRows(void) const
const CSeq_annot::TDesc & Seq_annot_GetDesc(void) const
bool Seq_annot_CanGetDesc(void) const
SAnnotSelector & SetExactDepth(bool value=true)
SetExactDepth() specifies that annotations will be searched on the segment level specified by SetReso...
SAnnotSelector & SetResolveAll(void)
SetResolveAll() is equivalent to SetResolveMethod(eResolve_All).
SAnnotSelector & SetOverlapTotalRange(void)
Check overlapping only of total ranges.
size_t size(void) const
Get number of collected Seq-annots.
Definition: annot_ci.hpp:194
SAnnotSelector & SetAdaptiveDepth(bool value=true)
SetAdaptiveDepth() requests to restrict subsegment resolution depending on annotations found on lower...
SAnnotSelector & SetResolveDepth(int depth)
SetResolveDepth sets the limit of subsegment resolution in searching annotations.
SAnnotSelector & IncludeNamedAnnotAccession(const string &acc, int zoom_level=0)
SAnnotSelector & SetAnnotType(TAnnotType type)
Set annotation type (feat, align, graph)
SAnnotSelector & AddNamedAnnots(const CAnnotName &name)
Add named annot to set of annots names to look for.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
const Tdata & Get(void) const
Get the member data.
bool IsComment(void) const
Check if variant Comment is selected.
Definition: Annotdesc_.hpp:535
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Annotdesc_.hpp:521
const TComment & GetComment(void) const
Get the variant data.
Definition: Annotdesc_.hpp:541
bool IsTitle(void) const
Check if variant Title is selected.
Definition: Annotdesc_.hpp:515
list< CRef< CAnnotdesc > > Tdata
@ eClinical_significance_probable_pathogenic
Definition: Phenotype_.hpp:95
@ eClinical_significance_pathogenic
Definition: Phenotype_.hpp:96
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
range(_Ty, _Ty) -> range< _Ty >
const CharType(& source)[N]
Definition: pointer.h:1149
T log10(T x_)
USING_SCOPE(objects)
#define row(bind, expected)
Definition: string_bind.c:73
a single bin entry
Definition: snp_bins.hpp:75
TSeqPos pos_end
gene end when trackSubType is (Gene association)
Definition: snp_bins.hpp:91
string geneStringId
when external Gene IDs are used, it can be a string
Definition: snp_bins.hpp:93
unsigned int snpid
Definition: snp_bins.hpp:78
string geneName
gene name when trackSubType is (Gene association)
Definition: snp_bins.hpp:94
string dbgaptext
specially formatted, see document
Definition: snp_bins.hpp:86
string population
population description for GWAS/pha tracks
Definition: snp_bins.hpp:90
string genes_reported
specially formatted, see document
Definition: snp_bins.hpp:82
int ClinSigID
clinical significance ID,
Definition: snp_bins.hpp:84
string trackSubType
used to further differentiate some GWAS/pha tracks (see SV-2201)
Definition: snp_bins.hpp:89
string genes_mapped
specially formatted, see document
Definition: snp_bins.hpp:83
string pmids
comma-delimited list of PubMed IDs
Definition: snp_bins.hpp:80
TSeqPos pos
! arrange member names as in the dumped file
Definition: snp_bins.hpp:77
int geneId
gene ID when trackSubType is (Gene association)
Definition: snp_bins.hpp:92
representation of a bin
Definition: snp_bins.hpp:102
SAnnotSelector –.
Definition: type.c:6
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Tue Jun 18 13:40:18 2024 by modify_doxy.py rev. 669887