NCBI C++ ToolKit
na_utils.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef GUI_OBJUTILS___NA_UTILS__HPP
2 #define GUI_OBJUTILS___NA_UTILS__HPP
3 
4 /* $Id: na_utils.hpp 47296 2022-12-30 03:01:50Z evgeniev $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Liangshou Wu, Dmitry Rudnev
30  *
31  * File Description:
32  * Name annotation utility class.
33  *
34  */
35 
37 #include <objmgr/bioseq_handle.hpp>
38 #include <gui/objutils/utils.hpp>
39 
40 /** @addtogroup GUI_OBJUTILS
41  *
42  * @{
43  */
44 
45 
47 
48 class ICanceled;
49 
51  class CScope;
52  class CSeq_id;
53  class CSeq_align;
54  class CSeq_annot;
55  class CSeq_annot_Handle;
57 
58 
59 
60 ///////////////////////////////////////////////////////////////////////////////
61 /// Utilities for data/track type discovery
63 {
64 public:
65  /// Get alignment type.
66  /// The alignment types include:
67  /// - nucleotide: nucleotide-to-nucleotide alignment
68  /// - protein: protein-to-protein alignment
69  /// - mixed: protein-to-nucleotide alignment
70  static string GetAlignType(const objects::CSeq_align& align,
71  objects::CScope& scope);
72 
73  /// Collect unique list of feature subtypes for all features in a seq-annot.
74  static void GetFeatSubtypes(const objects::CSeq_annot& annot,
75  set<string>& subtypes);
76 
77  /// Does feature belong to gene model track
78  /// Gene, RNA, cdregion, exon, C_region, and VDJ segments
79  static bool IsGeneModelFeature(int type, int subtype);
80 
81  /// Convert feature subtypes to track subtypes.
82  /// If the feature subtype list contains 'gene' feature,
83  /// all gene, RNA, cdregion and exon feature subtypes will
84  /// be combined into one track subtype: gene_model. If
85  /// the feature type is variation_ref, the track type will
86  /// be dbVar. If the feature type is variation, the track
87  /// type will be dbSNP. For anything else, track type
88  /// will be just treated as a feature track subtype.
89  static void FeatSubtypes2TrackSubtypes(set<string>& subtypes,
90  const set<string>& feat_subtypes);
91 
92  /// Collect all column header names for a given seq-table.
93  /// The header name is from either field-name or field-id.
94  static void GetColumnHeader(set<string>& headers,
95  const objects::CSeq_annot& annot);
96 
97  /// Guess the track subtype based on seq-table headers.
98  static string GetSeqTableSubtype(const set<string>& headers);
99 
100  /// Get data track information from a seq-annot.
102  GetTrackInfo(objects::CSeq_annot_Handle annot_handle);
103 
104 
106  GetTrackInfo(const objects::CSeq_annot& annot, objects::CScope& scope);
107 
108  /// Map data type to track type.
109  static void DataType2TrackType(const string& annot_type,
110  const string& subtype,
111  string& track_key,
112  string& subkey);
113 };
114 
116 {
117 public:
118  /// typedefs for NAA meta-data
119  typedef string TNAA;
120  typedef list<TNAA> TNAAs;
124 
125  /// eutils databases that can be used
126  enum EEntrezDB {
130  EEDB_All ///< try both nucleotide and protein databases, merge results
131  };
132  /// source of obtained NA metadata
133  /// (whether internal caches were hit, partially hit or totally missed during some cached operations)
140  EMetaDataSource_EmptyAfterFilteringRequest
141  };
142 
143  enum EUidsSource {
148  EUidsSource_EmptyRequest
149  };
150 
151  /// Default ctor.
152  /// No Sequence id provided. That means the target named
153  /// annotations won't be filtered by any sequence id.
154  CNAUtils();
155 
156  /// Ctor with target sequence id.
157  /// Only the named annotations associated with the target
158  /// sequence will be returned. However, the target sequence
159  /// is ignored if the requested NAAs are explicitly listed.
160  CNAUtils(const objects::CSeq_id& id);
161 
162  /// Ctor with target sequence id and a scope.
163  /// @sa CNAUtils(const objects::CSeq_id& id).
164  CNAUtils(const objects::CSeq_id& id, objects::CScope& scope);
165 
166  /// Get a list of NAAs associated with the target sequence.
167  /// It returns only the named annotation names (no meta-data).
168  /// If the target sequence is not set, it returns nothing.
169  /// @param naas data structure for returned NA names.
170  /// @param context is a viewer context that predefines a set
171  /// of NAAs applied only under the given viewer context.
172  /// Maybe, we can remove it when TMS is fully integrated.
173  void GetAllNAAs(TNAAs& naas, const string& context = "") const;
174 
175  /// get all GIs for a given NA accession
176  typedef set<TGi> TGis;
177  static void GetAllGIs(const TNAA& naa, EEntrezDB eedb, TGis& gis,
178  EUidsSource* pUidsSource = NULL);
179 
180  /// Get meta-data for a specific NA.
181  /// The meta-data include both the core information from NA
182  /// meta-data and Entrez links.
183  /// @param na the target named annotation.
184  /// @param filtering determines if results are filtered against
185  /// the target sequence
186  void GetNAMetaData(TNAMetaDataSet& md_set,
187  const string& na,
188  bool filtering = false,
189  EMetaDataSource* pMDSource = NULL,
190  EUidsSource* pUidsSource = NULL,
191  bool isGetLinks = true) const;
192 
193  /// Get meta-data for a list of NAAs.
194  /// The meta-data include both the core information from NA
195  /// meta-data and Entrez links.
196  /// @param naas the target input NAA list
197  /// @param md_set data structure for returned na meta-data
198  /// @param filtering determines if results are filtered against
199  /// the target sequence
200  void GetNAMetaData(TNAMetaDataSet& md_set,
201  const TNAAs& naas,
202  bool filtering = false,
203  EMetaDataSource* pMDSource = NULL,
204  EUidsSource* pUidsSource = NULL,
205  bool isGetLinks = true) const;
206 
207  /// Get meta-data for a given NAA associated with seq_id.
208  /// Links stored in NA meta-data will be included, but links
209  /// from Entrez are not.
210  /// @param md_set data structure for returned na meta-data
211  /// @param context is a viewer context that predefines a set
212  /// of NAAs applied only under the given viewer context.
213  /// Maybe, we can remove it when TMS is fully integrated.
214  void GetAllNAMetaData(TNAMetaDataSet& md_set,
215  const string& context = "",
216  ICanceled* canceledCallback = 0) const;
217 
218  /// Do data track discovery for a given NA.
219  /// This method relies on NA meta-data to do data discovery
220  /// completely or partially depending on the annotation type.
221  /// For example, for seq-table, it will need to look at the data
222  /// to report the column names. For alignment, it will need to
223  /// check the alignment type: protein or nucleotide.
224  void GetNATrackInfo(TTrackInfoList& track_info, const string& naa) const;
225 
226  /// Do data track discovery for a given NA with meta-data.
227  /// @sa void GetNATrackInfo(TTrackInfoList& track_info, const string& naa) const;
228  void GetNATrackInfo(TTrackInfoList& track_info,
229  const string& naa, const TNAMetaDataSet& md_set) const;
230 
231  /// Do data track discovery for a list of NAs.
232  /// @sa void GetNATrackInfo(TTrackInfoList& track_info, const string& naa) const;
233  void GetNATrackInfo(TTrackInfoList& track_info,
234  const TNAAs& naas) const;
235 
236  /// Do data track discovery for a list of NAs with meta-data.
237  /// @sa void GetNATrackInfo(TTrackInfoList& track_info, const string& naa) const;
238  void GetNATrackInfo(TTrackInfoList& track_info,
239  const TNAAs& naas, const TNAMetaDataSet& md_set) const;
240 
241  void SetMaxNAs(int size);
242 
243 private:
244  /// Initialize some internal states.
245  void x_Init();
246 
247  /// Get all available NA uids w/wo a context.
248  void x_GetNAIds(TEntrezIds &uids, const string& context = "") const;
249 
250  /// get an id for a given NAA string
251  /// returns false if no id is found
252  static bool x_GetNAId(TEntrezId &uid, const string& naa,
253  EUidsSource* pUidsSource = NULL);
254 
255  /// get all available NA uids for a given gi.
256  void x_GetAllNAIds(TEntrezIds &uids) const;
257 
258  /// filter a given list of NA uids and keep only the ones that are related to a given gi (m_Gi)
259  void x_FilterNAIds(TEntrezIds &uids,
260  EUidsSource* pUidsSource = NULL) const;
261 
262  /// returns true if a uid is related to a given gi (m_Gi) or if m_Gi is ZERO_GI
263  bool x_NAIdMatchesGI(TEntrezId uid,
264  EUidsSource* pUidsSource = NULL) const;
265 
266  /// get all available NA ids applied to a viewer context for a given gi.
267  void x_GetAllNAIdsWithContext(TEntrezIds &uids, const string& context) const;
268 
269  /// Retrieve NAAs for a list of NA uids.
270  void x_GetNAAs(TNAAs& naas, const TEntrezIds &uids) const;
271 
272  /// get all GIs for a given uid
273  /// this needs a given database where to search for GIs
274  /// @param isIncremental if true, the newly found GIs are appended to gis instead of erasing it
275  static void x_GetAllGIs(TEntrezId uid, EEntrezDB eedb, TGis& gis,
276  EUidsSource* pUidsSource = NULL,
277  bool isIncremental = false);
278  /// this uses m_SeqDB
279  void x_GetAllGIs(TEntrezId uid, TGis& gis,
280  EUidsSource* pUidsSource = NULL) const;
281 
282  /// Retrieve NA meta data for a list of NA uids.
283  void x_GetNAMetaData(TNAMetaDataSet& md_set,
284  const TEntrezIds &uids,
285  bool isGetLinks,
286  EMetaDataSource* pSource = NULL) const;
287 
288  /// Search all NAAs for a given term.
289  void x_SearchNAIds(TEntrezIds &uids, const TNAAs& naas, bool filtering,
290  EUidsSource* pUidsSource = NULL) const;
291 
292  /// Parse the meta-data xml DocSum.
293  void x_ParseNAMetaData(CAnnotMetaData& data, const char* xml_str) const;
294 
295  /// Retrieve Entrez links for a given NA.
296  void x_GetNAEntrezLinks(CAnnotMetaData& data) const;
297 
298 private:
299  /// The target sequence associated with the NAAs.
300  /// This is optional. If it is set, we only look at the NAAs
301  /// associated with the target sequence. If it is not set,
302  /// all requested/found NAAs will be returned. However, the
303  /// target sequence may be ignored for some cases, such as
304  /// searching meta-data for a specific NA.
306 
307  /// Scope helping resolve sequences and retrieve annotations.
308  /// If not set, a default one will be created if necessary.
310 
311  /// Derived values which make sense only if m_TargetSeq is set
312  objects::CBioseq_Handle m_BioseqHandle;
313  objects::CSeq_id_Handle m_SeqIdHandle; ///< sequence handles
314  TGi m_Gi; ///< GI of m_TargetSeq; it may be ZERO_GI for one of two reasons: either a target seq is not given at all or
315  ///< it does not have a GI
316  EEntrezDB m_SeqDB{EEDB_Undef}; ///< Entrez db name for m_TargetSeq: nucleotide/protein or Undef if no TargetSeq is given
317 
318  /// maximal number of NAAs for retrieving meta-data.
320 
321  /// caching of eutils-related data:
322  /// x_GetNAId():
323  /// NAA string to its numeric uid
325  DECLARE_CLASS_STATIC_FAST_MUTEX(sm_NAAtoUidCacheMutex);
327 
328  /// x_GetNAMetaData()
329  /// NAA numeric uid to CAnnotMetaData
331  DECLARE_CLASS_STATIC_FAST_MUTEX(sm_UidtoMetaDataCacheMutex);
333 
334  /// x_GetAllGIs():
335  /// NA uid+dbfrom to GIs which are associated with this NA
337  DECLARE_CLASS_STATIC_FAST_MUTEX(sm_UidtoGiCacheMutex);
339 };
340 
341 
343 
344 /* @} */
345 
346 #endif /// GUI_OBJUTILS___NA_UTILS__HPP
Data structure for holding meta information for an annotation.
Definition: annot_info.hpp:85
Utilities for data/track type discovery.
Definition: na_utils.hpp:63
CObject –.
Definition: ncbiobj.hpp:180
CScope –.
Definition: scope.hpp:92
CSeq_annot_Handle –.
Interface for testing cancellation request in a long lasting operation.
Definition: icanceled.hpp:51
Definition: map.hpp:338
SStrictId_Entrez::TId TEntrezId
TEntrezId type for entrez ids which require the same strictness as TGi.
Definition: ncbimisc.hpp:1041
#define NULL
Definition: ncbistd.hpp:225
map< string, TGis > TEntrezIdtoGiCache
x_GetAllGIs(): NA uid+dbfrom to GIs which are associated with this NA
Definition: na_utils.hpp:336
list< TNAA > TNAAs
Definition: na_utils.hpp:120
TGi m_Gi
GI of m_TargetSeq; it may be ZERO_GI for one of two reasons: either a target seq is not given at all ...
Definition: na_utils.hpp:314
CTrackInfo::TTrackInfoList TTrackInfoList
Definition: na_utils.hpp:122
objects::CBioseq_Handle m_BioseqHandle
Derived values which make sense only if m_TargetSeq is set.
Definition: na_utils.hpp:312
static TNAAtoUidCache sm_NAAtoUidCache
Definition: na_utils.hpp:326
static CTrackInfo::TTrackInfoList GetTrackInfo(const objects::CSeq_annot &annot, objects::CScope &scope)
EMetaDataSource
source of obtained NA metadata were internal caches hit, partially hit or totally missed during some ...
Definition: na_utils.hpp:134
vector< TEntrezId > TEntrezIds
Definition: utils.hpp:125
CNAUtils(const objects::CSeq_id &id)
Ctor with target sequence id.
string TNAA
typedefs for NAA meta-data
Definition: na_utils.hpp:119
static TEntrezIdtoGiCache sm_UidtoGiCache
Definition: na_utils.hpp:338
CRef< objects::CScope > m_Scope
Scope helping resolve sequences and retrieve annotations.
Definition: na_utils.hpp:309
map< string, CRef< CAnnotMetaData > > TNAMetaDataSet
Definition: na_utils.hpp:121
map< TEntrezId, CRef< CAnnotMetaData > > TEntrezIdtoMetaDataCache
x_GetNAMetaData() NAA numeric uid to CAnnotMetaData
Definition: na_utils.hpp:330
CNAUtils(const objects::CSeq_id &id, objects::CScope &scope)
Ctor with target sequence id and a scope.
DECLARE_CLASS_STATIC_FAST_MUTEX(sm_UidtoGiCacheMutex)
map< TNAA, TEntrezId > TNAAtoUidCache
caching of eutils-related data: x_GetNAId(): NAA string to its numeric uid
Definition: na_utils.hpp:324
objects::CSeq_id_Handle m_SeqIdHandle
sequence handles
Definition: na_utils.hpp:313
static CTrackInfo::TTrackInfoList GetTrackInfo(objects::CSeq_annot_Handle annot_handle)
Get data track information from a seq-annot.
DECLARE_CLASS_STATIC_FAST_MUTEX(sm_UidtoMetaDataCacheMutex)
DECLARE_CLASS_STATIC_FAST_MUTEX(sm_NAAtoUidCacheMutex)
CSeqUtils::TEntrezIds TEntrezIds
Definition: na_utils.hpp:123
int m_MaxNAMeta
maximal number of NAAs for retrieving meta-data.
Definition: na_utils.hpp:319
static TEntrezIdtoMetaDataCache sm_UidtoMetaDataCache
Definition: na_utils.hpp:332
EEntrezDB
eutils databases that can be used
Definition: na_utils.hpp:126
list< CRef< CTrackInfo > > TTrackInfoList
Definition: annot_info.hpp:60
CConstRef< objects::CSeq_id > m_TargetSeq
The target sequence associating with the NAAs.
Definition: na_utils.hpp:305
set< TGi > TGis
get all GIs for a given NA accession
Definition: na_utils.hpp:176
@ EUidsSource_Undef
Definition: na_utils.hpp:144
@ EUidsSource_Mixed
Definition: na_utils.hpp:146
@ EUidsSource_Eutils
Definition: na_utils.hpp:147
@ EUidsSource_Cache
Definition: na_utils.hpp:145
@ EMetaDataSource_EmptyRequest
Definition: na_utils.hpp:139
@ EMetaDataSource_Cache
Definition: na_utils.hpp:136
@ EMetaDataSource_Mixed
Definition: na_utils.hpp:137
@ EMetaDataSource_Undef
Definition: na_utils.hpp:135
@ EMetaDataSource_Eutils
Definition: na_utils.hpp:138
@ EEDB_Protein
Definition: na_utils.hpp:129
@ EEDB_Undef
Definition: na_utils.hpp:127
@ EEDB_Nucleotide
Definition: na_utils.hpp:128
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_GUIOBJUTILS_EXPORT
Definition: gui_export.h:512
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::SUBKEY subkey
Definition: type.c:6
Modified on Wed Dec 06 07:13:07 2023 by modify_doxy.py rev. 669887