NCBI C++ ToolKit
idmapper.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_IDMAPPER___IDMAPPER_IMPL__HPP
2 #define OBJTOOLS_IDMAPPER___IDMAPPER_IMPL__HPP
3 
4 /* $Id: idmapper.hpp 84612 2018-11-21 14:24:48Z ucko $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Frank Ludwig
30  *
31  * File Description: Definition of the IIdMapper interface and its
32  * implementation
33  *
34  */
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbicntr.hpp>
40 #include <objmgr/scope.hpp>
44 
46 BEGIN_objects_SCOPE // namespace ncbi::objects::
47 
48 /// IdMapper base class implementation
49 ///
50 /// Provides the means to set up and maintain an internal table of mappings
51 /// and to use such table for actual ID mapping.
52 /// Actual initialization of the internal table is left for derived classes to
53 /// implement.
54 ///
56 {
57 
58 public:
59  /// Constructor specifying the mapping context, direction, and error
60  /// handling.
61  /// @param strContext
62  /// the mapping context or genome source IDs will belong to. Something
63  /// like "mm6" or "hg18".
64  /// @param bInvert
65  /// Mapping direction. "true" will map in reverse direction.
66  /// @param pErrors
67  /// Optional error container. If specified, mapping errors will be passed
68  /// to the error container for further processing. If not specified,
69  /// mapping errors result in exceptions that need to be handled.
70  CIdMapper(const std::string& strContext = "",
71  bool bInvert = false,
72  ILineErrorListener* pErrors = 0 );
73 
74  virtual ~CIdMapper() {};
75 
76  /// Add a mapping to the internal mapping table.
77  /// @param from
78  /// source handle, or target handle in the case of reverse mapping
79  /// @param to
80  /// target handle, or source handle in the case of reverse mapping
81  virtual void AddMapping(const CSeq_id_Handle& from,
82  const CSeq_id_Handle& to );
83 
84  virtual void AddMapping(const CSeq_loc& loc_from,
85  const CSeq_loc& loc_to);
86 
87  virtual CSeq_id_Handle Map(const CSeq_id_Handle&);
88 
89  virtual CRef<CSeq_loc> Map(const CSeq_loc& loc);
90 
91  /// Map all embedded IDs in a given object at once.
92  virtual void MapObject(CSerialObject&);
93 
95  {
96  string context;
97  string map_from;
98  string map_to;
99  };
100 
101 protected:
102  static std::string
103  MapErrorString(const CSeq_id_Handle& );
104 
105  static std::string
106  MapErrorString(const CSeq_loc& );
107 
109  const bool m_bInvert;
110 
111  struct SMapper
112  {
115  };
116  typedef std::map<CSeq_id_Handle, SMapper> TMapperCache;
118 
120 };
121 
122 
123 /// IdMapper implementation combining multiple id mappers with the selected
124 /// priorities.
125 ///
127 {
128 public:
129 
130  /// Sub-mapper priority. Default is zero, positive values are for
131  /// higher priorities. Mappers with the same priority are checked
132  /// in the order of their addition to the composite mapper.
133  enum EPriority {
134  kPriority_Default = 0
135  };
136  typedef int TPriority;
137 
139 
140  /// Add sub-mapper. By default the composite mapper owns the added
141  /// object.
142  void AddMapper(IIdMapper* mapper,
143  TPriority priority = kPriority_Default,
144  EOwnership ownership = eTakeOwnership);
145 
146  virtual CSeq_id_Handle Map(const CSeq_id_Handle&);
147 
148  virtual CRef<CSeq_loc> Map(const CSeq_loc& loc);
149 
150 private:
152  struct SNode {
155 
160 
161  SNode(IIdMapper* mapper, TPriority priority, EOwnership ownership)
162  : m_Mapper(mapper, ownership),
163  m_Priority(priority)
164  {
165  m_Order = sm_Counter.Add(1); // next value of the shared counter; operator< uses it to break priority ties
166  }
167 
168  bool operator<(const SNode& node) const
169  {
170  // Primary key: the node with the higher priority sorts first
171  if (m_Priority != node.m_Priority) return m_Priority > node.m_Priority;
172  // Tie-break on equal priority: the smaller m_Order (constructed earlier) sorts first
173  return m_Order < node.m_Order;
174  }
175  };
176 
178 
180 };
181 
182 
183 /// IdMapper implementation using an external configuration file
184 ///
185 /// The internal mapping table will be initialized during IdMapper construction
186 /// from a given input stream (typically, an open configuration file).
187 ///
189 {
190 
191 public:
192  /// Constructor specifying the content of the mapping table, mapping
193  /// context, direction, and error handling.
194  ///
195  /// The configuration-file-based mapper uses a config file to indicate how
196  /// mapping should be performed. Configuration-based mapping is suitable
197  /// for simple id -> id mapping, and cannot generally be used to indicate
198  /// mapping through a complex location, as would be needed when handling
199  /// things such as mapping to UCSC chrRandom.
200  ///
201  /// The format of the configuration file is as a standard Windows .ini
202  /// file, and it should be structured as follows:
203  ///
204  /// \code
205  /// [hg18]
206  /// map_from = UCSC HG18
207  /// map_to = NCBI Human build 36
208  /// 89161185 = chr1 1
209  /// 89161199 = chr2 2
210  /// 89161205 = chr3 3
211  /// 89161207 = chr4 4
212  /// 51511721 = chr5 5
213  /// \endcode
214  ///
215  /// Note that the config file appears backwards! This is intentional, and
216  /// is structured so as to capture a many-to-one synonymy that we often see
217  /// in IDs. The snippet above implies:
218  /// - We are mapping from UCSC build HG18 -> NCBI Human build 36
219  /// - The chromosomes are defined by bare integers, which represent gis
220  /// - The primary aliases all begin 'chr', as 'chr1', 'chr2', etc.
221  /// - Each chromosome is represented by multiple input aliases (chr1, 1,
222  /// etc)
223  /// - We map implicitly from lcl|chr1 -> 89161185, lcl|1 -> 89161185
224  /// - Because of a limitation in processing .ini files, we cannot use a
225  /// full FASTA representation for the key (the gi). We can use one for
226  /// the aliases. Since bare integers are interpreted as gis, it is
227  /// necessary to qualify bare integers as local IDs if you wish to have
228  /// a representation as something other than a gi
229  ///
230  /// @param istr
231  /// open input stream containing tabbed data specifying map sources and
232  /// targets.
233  /// @param strContext
234  /// the mapping context or genome source IDs will belong to. Something
235  /// like "mm6" or "hg18".
236  /// @param bInvert
237  /// Mapping direction. "true" will map in reverse direction.
238  /// @param pErrors
239  /// Optional error container. If specified, mapping errors will be passed
240  /// to the error container for further processing. If not specified,
241  /// mapping errors result in exceptions that need to be handled.
243  const std::string& strContext = "",
244  bool bInvert = false,
245  ILineErrorListener* pErrors = 0);
246 
247  CIdMapperConfig(const std::string& strContext = "",
248  bool bInvert = false,
249  ILineErrorListener* pErrors = 0);
250 
251  void Initialize(CNcbiIstream& istr);
252  static void DescribeContexts(CNcbiIstream& istr,
253  list<SMappingContext>& contexts);
254 
255 protected:
256 
257  void AddMapEntry(const std::string& );
258 
259  void SetCurrentContext(const std::string&,
260  std::string& );
261 
262  CSeq_id_Handle SourceHandle(const std::string& );
263 
264  CSeq_id_Handle TargetHandle(const std::string& );
265 };
266 
267 
268 /// IdMapper implementation using hardcoded values
269 ///
270 /// Mapping targets are fixed at compile time and cannot be modified later.
271 /// Useful for self-contained applications that should work without external
272 /// configuration files or databases.
273 ///
275 {
276 
277 public:
278  /// Constructor specifying the mapping context, direction, and error
279  /// handling.
280  /// @param strContext
281  /// the mapping context or genome source IDs will belong to. Something
282  /// like "mm6" or "hg18".
283  /// @param bInvert
284  /// Mapping direction. "true" will map in reverse direction.
285  /// @param pErrors
286  /// Optional error container. If specified, mapping errors will be passed
287  /// to the error container for further processing. If not specified,
288  /// mapping errors result in exceptions that need to be handled.
289  CIdMapperBuiltin(const std::string& strContext,
290  bool bInvert = false,
291  ILineErrorListener* pErrors = 0 );
292 
293  void Initialize();
294 
295 protected:
296  void AddMapEntry(const std::string&, int);
297 };
298 
299 
300 
301 /// IdMapper implementation using an external database
302 ///
303 /// Mappings will be retrieved from an external database, then cached internally
304 /// for future reuse.
305 ///
307 {
308 public:
309  /// Constructor specifying a database containing the actual mapping, the
310  /// mapping context, direction, and error handling.
311  /// @param strServer
312  /// server on which the mapping database resides.
313  /// @param strDatabase
314  /// the actual database on the specified server.
315  /// @param strContext
316  /// the mapping context or genome source IDs will belong to. Something
317  /// like "mm6" or "hg18".
318  /// @param bInvert
319  /// Mapping direction. "true" will map in reverse direction.
320  /// @param pErrors
321  /// Optional error container. If specified, mapping errors will be passed
322  /// to the error container for further processing. If not specified,
323  /// mapping errors result in exceptions that need to be handled.
325  const std::string& strServer,
326  const std::string& strDatabase,
327  const std::string& strContext,
328  bool bInvert = false,
329  ILineErrorListener* pErrors = 0)
330  : CIdMapper(strContext, bInvert, pErrors),
331  m_strServer(strServer),
332  m_strDatabase(strDatabase)
333  {};
334 
335  virtual CSeq_id_Handle Map(const CSeq_id_Handle& from);
336 
337 protected:
340 };
341 
342 
343 /// IdMapper implementation using CScope.
344 ///
345 /// Non-versioned ids are mapped to versioned ones referencing the focus
346 /// sequence or its parts.
347 ///
349 {
350 public:
351  /// Initialize the mapper using the scope and the focus seq-id handle.
352  CIdMapperScope(CScope& scope, const CSeq_id_Handle& focus_idh);
353 
354  /// Initialize the mapper using the scope and the focus seq-id.
355  CIdMapperScope(CScope& scope, const CSeq_id& focus_id);
356 
357 private:
358  void x_Init(const CSeq_id_Handle& focus_idh);
359  void x_AddMappings(const CBioseq_Handle& bh);
360 
362 };
363 
364 
365 /// IdMapper implementation using a GC-Assembly
366 ///
367 /// Ids are mapped to the selected alias type.
368 ///
370 {
371 public:
373 
374  /// Mapping destination type.
376  eGenBank, ///< GenBank GI
377  eGenBankAcc, ///< GenBank accession
378  eRefSeq, ///< RefSeq GI
379  eRefSeqAcc, ///< RefSeq accession
380  eUCSC, ///< External UCSC id
381  eOther, ///< Private id
382  eAccVer ///< Find versioned accession
383  };
384 
385  /// Establish mappings based on a GC-Assembly
386  ///
387  /// Inputs for this are a full assembly and a concept of what you are
388  /// mapping to. "map to" means "convert to another ID space", defined by
389  /// the enum above. To allow for future extension, eOther maps to the
390  /// GC-Assembly concept of "private" or the named scope provided in the
391  /// extra string parameter (i.e., "UCSC")
393  const CGC_Assembly& assm,
394  EAliasMapping mapping,
395  const string& alias_scope = kEmptyStr);
396 
397  CIdMapperGCAssembly(CScope& scope);
398 
399  void AddAliasMappings(const CGC_Assembly& assm,
400  EAliasMapping mapping,
401  const string& alias_scope = kEmptyStr);
402 
403  /// Map seq-id, throw CIdMapperException if mapping fails.
404  virtual CSeq_id_Handle Map(const CSeq_id_Handle&);
405 
406  using TParent::Map;
407 
408 private:
409  void x_AddUnversionedMapping(const CSeq_id& src_id,
410  const CSeq_id_Handle& dst_id);
411  void x_AddAccessionMapping(const CSeq_id& id);
412  void x_AddAliasMappings(const CGC_Assembly& seq,
413  EAliasMapping mapping,
414  const string& alias_scope);
415  void x_AddAliasMappings(const CGC_AssemblyUnit& seq,
416  EAliasMapping mapping,
417  const string& alias_scope);
418  void x_AddAliasMappings(const CGC_Sequence& seq,
419  EAliasMapping mapping,
420  const string& alias_scope);
421 
423 };
424 
425 
427 {
428 public:
429  enum EErrCode {
431  eOther
432  };
433  virtual const char* GetErrCodeString(void) const override;
435 };
436 
437 
438 END_objects_SCOPE
440 
441 #endif // OBJTOOLS_IDMAPPER___IDMAPPER_IMPL__HPP
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAtomicCounter –.
Definition: ncbicntr.hpp:71
CBioseq_Handle –.
IdMapper implementation using hardcoded values.
Definition: idmapper.hpp:275
void AddMapEntry(const std::string &, int)
IdMapper implementation combining multiple id mappers with the selected priorities.
Definition: idmapper.hpp:127
EPriority
Sub-mapper priority.
Definition: idmapper.hpp:133
CAtomicCounter::TValue TOrder
Definition: idmapper.hpp:151
TMappers m_Mappers
Definition: idmapper.hpp:179
set< SNode > TMappers
Definition: idmapper.hpp:177
CIdMapperComposite(void)
Definition: idmapper.hpp:138
IdMapper implementation using an external configuration file.
Definition: idmapper.hpp:189
IdMapper implementation using an external database.
Definition: idmapper.hpp:307
const std::string m_strDatabase
Definition: idmapper.hpp:339
const std::string m_strServer
Definition: idmapper.hpp:338
CIdMapperDatabase(const std::string &strServer, const std::string &strDatabase, const std::string &strContext, bool bInvert=false, ILineErrorListener *pErrors=0)
Constructor specifying a database containing the actual mapping, the mapping context,...
Definition: idmapper.hpp:324
NCBI_EXCEPTION_DEFAULT(CIdMapperException, CException)
IdMapper implementation using a GC-Assembly.
Definition: idmapper.hpp:370
EAliasMapping
Mapping destination type.
Definition: idmapper.hpp:375
@ eRefSeq
RefSeq GI.
Definition: idmapper.hpp:378
@ eGenBank
GenBank GI.
Definition: idmapper.hpp:376
@ eOther
Private id.
Definition: idmapper.hpp:381
@ eUCSC
External UCSC id.
Definition: idmapper.hpp:380
@ eGenBankAcc
GenBank accession.
Definition: idmapper.hpp:377
@ eRefSeqAcc
RefSeq accession.
Definition: idmapper.hpp:379
CRef< CScope > m_Scope
Definition: idmapper.hpp:422
IdMapper implementation using CScope.
Definition: idmapper.hpp:349
CRef< CScope > m_Scope
Definition: idmapper.hpp:361
IdMapper base class implementation.
Definition: idmapper.hpp:56
ILineErrorListener * m_pErrors
Definition: idmapper.hpp:119
virtual ~CIdMapper()
Definition: idmapper.hpp:74
TMapperCache m_Cache
Definition: idmapper.hpp:117
std::map< CSeq_id_Handle, SMapper > TMapperCache
Definition: idmapper.hpp:116
const std::string m_strContext
Definition: idmapper.hpp:108
const bool m_bInvert
Definition: idmapper.hpp:109
CScope –.
Definition: scope.hpp:92
Base class for all serializable objects.
Definition: serialbase.hpp:150
General IdMapper interface.
Definition: iidmapper.hpp:48
virtual void MapObject(CSerialObject &)=0
Map all embedded IDs in a given object at once.
virtual CSeq_id_Handle Map(const CSeq_id_Handle &id)=0
Map a single given CSeq_id_Handle to another.
Include a standard set of the NCBI C++ Toolkit most basic headers.
CRange< Position > Map(const CRange< Position > &target, const CRange< Position > &range)
Definition: blast_aux.cpp:826
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
string
Definition: cgiapp.hpp:690
TNCBIAtomicValue TValue
Alias TValue for TNCBIAtomicValue.
Definition: ncbicntr.hpp:73
TValue Add(int delta) THROWS_NONE
Atomically add value (=delta), and return new counter value.
Definition: ncbicntr.hpp:278
EErrCode
Error types that an application can generate.
Definition: ncbiexpt.hpp:884
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define kEmptyStr
Definition: ncbistr.hpp:123
enum ENcbiOwnership EOwnership
Ownership relations between objects.
#define NCBI_XOBJREAD_EXPORT
Definition: ncbi_export.h:1315
bool operator<(const SNode &node) const
Definition: idmapper.hpp:168
SNode(IIdMapper *mapper, TPriority priority, EOwnership ownership)
Definition: idmapper.hpp:161
CIdMapperComposite::TPriority TPriority
Definition: idmapper.hpp:153
static CAtomicCounter sm_Counter
Definition: idmapper.hpp:159
CIdMapperComposite::TOrder TOrder
Definition: idmapper.hpp:154
AutoPtr< IIdMapper > m_Mapper
Definition: idmapper.hpp:156
CSeq_id_Handle dest_idh
Definition: idmapper.hpp:113
CRef< CSeq_loc_Mapper > dest_mapper
Definition: idmapper.hpp:114
Modified on Fri Sep 20 14:57:44 2024 by modify_doxy.py rev. 669887