NCBI C++ ToolKit
id_mapper.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_ID_MAPPER___ID_MAPPER__HPP
2 #define ALGO_ID_MAPPER___ID_MAPPER__HPP
3 
4 /* $Id: id_mapper.hpp 93856 2021-05-27 14:33:25Z boukn $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Nathan Bouk
30  *
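31  * File Description:
32  *   Declares CGencollIdMapper, which maps Seq-locs between the sequence-id conventions of a GenColl assembly (see the usage example below).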
33  */
34 
35 
39 
40 
42 
44 class CGC_Assembly;
45 class CGC_AssemblyUnit;
46 class CGC_SeqIdAlias;
47 class CGC_Sequence;
48 class CGC_TypedSeqId;
49 class CSeq_id;
50 class CSeq_interval;
51 class CSeq_loc;
52 class CSeq_loc_Mapper;
54 
55 
56 // Common use case: Going from outside-of-NCBI ids to RefSeq GIs,
57 // then back again
58 //
59 // CRef<CGC_Assembly> GenCollAssembly = ...
60 // CSeq_loc GivenLoc = ...
61 //
62 // CGencollIdMapper Mapper(GenCollAssembly);
63 // CGencollIdMapper::SIdSpec RsSpec;
64 // RsSpec.TypedChoice = CGC_TypedSeqId::e_Refseq;
65 // RsSpec.Alias = CGC_SeqIdAlias::e_Gi;
66 // CRef<CSeq_loc> RefSeqLoc = Mapper.Map(GivenLoc, RsSpec);
67 //
68 // ... process RefSeqLoc somehow ...
69 //
70 // CGencollIdMapper::SIdSpec OrigSpec;
71 // Mapper.Guess(GivenLoc, OrigSpec);
72 // CRef<CSeq_loc> BackAgain = Mapper.Map(*RefSeqLoc, OrigSpec);
73 //
74 
76 {
   // CGencollIdMapper: converts Seq-locs between the different sequence-id
   // conventions of one GenColl assembly. Guess() derives the SIdSpec a
   // location currently uses; Map() rewrites a location to a requested SIdSpec.
   // NOTE(review): the class declaration line itself is not visible in this
   // listing; the name is taken from the usage example that precedes the class.
77 public:
79 
   // SIdSpec describes one id convention: the result of Guess(), or the
   // target requested from Map().
   // NOTE(review): the struct's opening declaration line is not visible in
   // this listing; the name SIdSpec is taken from the constructor below.
81  {
   // Shorthand names for the id-type and alias-type enums.
82  typedef objects::CGC_TypedSeqId::E_Choice E_Choice;
83  typedef objects::CGC_SeqIdAlias::E_AliasTypes E_Alias;
84 
85  bool Primary; // CGC_Sequence::GetSeq_id(), i.e. the id the Gencoll considers most important
86  // When used by Map, Primary == true overrides the next 4 fields
87  // When used by Guess, it tells the caller whether the id is the primary one.
   // NOTE(review): the usage example sets Spec.TypedChoice and Spec.Alias;
   // their declarations are presumably here but are not visible in this listing.
90  string External; // name of the outside organization, e.g. UCSC
91  string Pattern; // id pattern, e.g. 'chr%s': 'chrMT' matches the replicon named 'MT'
92  int Role; // EGC_SequenceRole (presumably also the extra e_Role_* values below -- confirm)
93  bool Top; // set if the sequence has the Top role; kept separate so it does not override the Role value
94 
   // Extra role values defined by this struct in addition to EGC_SequenceRole.
95  enum
96  {
97  // e_Role_ExcludePseudo_Top -- Fake role for Non-Pseudo top (one past eGC_SequenceRole_top_level)
98  e_Role_ExcludePseudo_Top = objects::eGC_SequenceRole_top_level + 1,
   // NOTE(review): presumably the "no role specified" sentinel -- confirm in the .cpp
99  e_Role_NotSet = 10000
100  };
101 
102  SIdSpec();
103 
   // String conversions of the spec.
   // NOTE(review): presumably both produce the same text -- confirm.
104  operator string() const;
105  string ToString(void) const;
   // Ordering and equality comparison between two specs.
106  bool operator<(const SIdSpec& Other) const;
107  bool operator==(const SIdSpec& Other) const;
108 
   // NOTE(review): presumably true when Guessed satisfies this spec -- confirm.
109  bool IsSpecMet(const SIdSpec& Guessed) const;
110  };
111 
112  // Derives the spec from a given loc.
   // Spec is an output parameter.
   // NOTE(review): the bool result presumably reports whether a spec could be derived -- confirm.
113  bool Guess(const objects::CSeq_loc& Loc, SIdSpec& Spec) const;
114 
115  // Returning NULL means requested spec could not be met.
116  // If the given loc already meets the spec, it returns a copy of itself
117  CRef<objects::CSeq_loc> Map(const objects::CSeq_loc& Loc, const SIdSpec& Spec) const;
118 
   // NOTE(review): presumably reports whether Map(Loc, Spec) could succeed -- confirm.
119  bool CanMeetSpec(const objects::CSeq_loc& Loc, const SIdSpec& Spec) const;
120 
   // Result type of IsLocInAGap().
   // NOTE(review): the enumerators between e_None and e_Complicated are not
   // visible in this listing.
121  enum E_Gap
122  {
123  e_None = 0,
127  e_Complicated
128  };
129  E_Gap IsLocInAGap(const objects::CSeq_loc& Loc) const;
130 
   // NOTE(review): presumably returns the assembly this mapper works on -- confirm.
131  CConstRef<objects::CGC_Assembly> GetInternalGencoll(void) const;
132 
   // Synonyms is an output list of ids for BaseId; NcbiOnly defaults to true.
   // NOTE(review): exact filtering done by NcbiOnly is not visible here -- confirm.
133  void GetSynonyms(const objects::CSeq_id& BaseId,
134  list< CConstRef<objects::CSeq_id> >& Synonyms,
135  bool NcbiOnly = true) const;
136 
137 protected:
138  void x_Init(void);
139 
   // const, yet expected to set up the mutable mapper members declared at the
   // end of the class -- presumably on demand; confirm.
140  void x_Init_SeqLocMappers(void) const;
141 
142  bool x_NCBI34_Guess(const objects::CSeq_id& Id, SIdSpec& Spec) const;
   // NOTE(review): the return-type line of the next declaration is not visible in this listing.
144  x_NCBI34_Map_IdFix(CConstRef<objects::CSeq_id> SourceId) const;
145 
   // Non-const helpers that take a mutable CGC_Sequence (or no argument).
   // NOTE(review): presumably run while the mapper is being initialized -- confirm.
146  void x_StripPseudoSeq(objects::CGC_Sequence& Seq);
147  void x_RecursiveSeqFix(objects::CGC_Sequence& Seq);
148  void x_FillGpipeTopRole(objects::CGC_Sequence& Seq);
149  void x_RemoveHiddenAccessions(objects::CGC_Sequence& Seq);
150  void x_FillChromosomeIds(void);
151  void x_PrioritizeIds(void);
152  void x_PrioritizeIds(objects::CGC_Sequence& Sequence);
153 
154 
155  // Is the given CSeq_id found in the given Gencoll, exactly, anywhere
156  bool x_IsExactIdInAssembly(const objects::CSeq_id& Id) const;
157  bool x_IsFuzzyIdInAssembly(const objects::CSeq_id& Id) const;
158  // Fixes locals that should be accessions, and versionless accessions
   // NOTE(review): the first lines (return type, name, leading parameter) of
   // the next two declarations are not visible in this listing.
160  const SIdSpec& Spec
161  ) const;
163  const SIdSpec& Spec
164  ) const;
165 
166  int x_GetRole(const objects::CGC_Sequence& Seq) const;
167 
   // NOTE(review): the second parameter line of x_AddSeqToMap is not visible in this listing.
168  void x_AddSeqToMap(const objects::CSeq_id& Id,
170  );
171 
   // Overloads for each level of the assembly structure; the CGC_Sequence
   // overload carries a Depth that defaults to 1.
172  void x_BuildSeqMap(const objects::CGC_Assembly& assm);
173  void x_BuildSeqMap(const objects::CGC_AssemblyUnit& assm);
174  void x_BuildSeqMap(const objects::CGC_Sequence& Seq, int Depth=1);
175 
176  bool x_DoesSeqContainSyn(const objects::CGC_Sequence& Seq, const objects::CSeq_id& Id) const;
177 
   // NOTE(review): return-type line not visible in this listing.
179  x_GetIdFromSeqAndSpec(const objects::CGC_Sequence& Seq,
180  const SIdSpec& Spec
181  ) const;
182 
   // NOTE(review): presumably the result codes of x_CanSeqMeetSpec() (declared
   // as returning int), selecting between x_Map_OneToOne / x_Map_Up /
   // x_Map_Down -- confirm in the .cpp.
183  enum { e_No, e_Yes, e_Up, e_Down };
184 
185  int x_CanSeqMeetSpec(const objects::CGC_Sequence& Seq,
186  const SIdSpec& Spec,
187  int Level = 0
188  ) const;
189 
   // Spec is an output parameter.
190  bool x_MakeSpecForSeq(const objects::CSeq_id& Id,
191  const objects::CGC_Sequence& Seq,
192  SIdSpec& Spec
193  ) const;
194 
   // NOTE(review): return-type line not visible in this listing.
196  x_FindChromosomeSequence(const objects::CSeq_id& Id,
197  const SIdSpec& Spec
198  ) const;
199 
   // NOTE(review): return-type line not visible in this listing.
201  x_FindParentSequence(const objects::CSeq_id& Id,
202  const objects::CGC_Assembly& Assembly,
203  int Depth = 0
204  ) const;
205 
206  bool x_IsParentSequence(const objects::CSeq_id& Id,
207  const objects::CGC_Sequence& Parent
208  ) const;
209 
210 
   // The three mapping helpers share one signature: source location, the
   // CGC_Sequence involved, and the requested spec.
211  CRef<objects::CSeq_loc> x_Map_OneToOne(const objects::CSeq_loc& SourceLoc,
212  const objects::CGC_Sequence& Seq,
213  const SIdSpec& Spec
214  ) const;
215 
216  CRef<objects::CSeq_loc> x_Map_Up(const objects::CSeq_loc& SourceLoc,
217  const objects::CGC_Sequence& Seq,
218  const SIdSpec& Spec
219  ) const;
220 
221  CRef<objects::CSeq_loc> x_Map_Down(const objects::CSeq_loc& SourceLoc,
222  const objects::CGC_Sequence& Seq,
223  const SIdSpec& Spec
224  ) const;
225 
226 
227  //bool x_IsLocInGap(const objects::CSeq_loc& Loc) const;
   // Gap helpers returning the public E_Gap type.
228  E_Gap x_Merge_E_Gaps(E_Gap First, E_Gap Second) const;
229  E_Gap x_IsLoc_Int_InAGap(const objects::CSeq_interval& Int) const;
230 
231 private:
   // NOTE(review): several typedef and member lines of this section
   // (e.g. the TSeqLocMapperRef typedef used below) are not visible in this listing.
233  string m_SourceAsm;
234 
238 
241 
242  vector<string> m_Chromosomes;
244 
245  // All component IDs to the Parent CGC_Sequence
248 
   // mutable so that const member functions (see x_Init_SeqLocMappers) can assign them.
250  mutable TSeqLocMapperRef m_UpMapper, m_DownMapper_Shallow, m_DownMapper_Deep;
251 };
252 
253 
255 
256 #endif // ALGO_ID_MAPPER___ID_MAPPER__HPP
257 
CGC_TypedSeqId –.
TSeqLocMapperRef m_UpMapper
Definition: id_mapper.hpp:250
void x_BuildSeqMap(const objects::CGC_AssemblyUnit &assm)
map< objects::CSeq_id_Handle, TGC_SequenceCRef > TChildToParentMap
Definition: id_mapper.hpp:246
TAccToVerMap m_AccToVerMap
Definition: id_mapper.hpp:240
void x_PrioritizeIds(objects::CGC_Sequence &Sequence)
CConstRef< objects::CGC_Sequence > TGC_SequenceCRef
Definition: id_mapper.hpp:235
void x_BuildSeqMap(const objects::CGC_Assembly &assm)
vector< string > m_Chromosomes
Definition: id_mapper.hpp:242
void x_BuildSeqMap(const objects::CGC_Sequence &Seq, int Depth=1)
map< objects::CSeq_id_Handle, TGC_SequenceCRef > TIdToSeqMap
Definition: id_mapper.hpp:236
TChildToParentMap m_ChildToParentMap
Definition: id_mapper.hpp:247
TIdToSeqMap m_IdToSeqMap
Definition: id_mapper.hpp:237
CRef< objects::CSeq_loc_Mapper > TSeqLocMapperRef
Definition: id_mapper.hpp:249
CRef< objects::CGC_Assembly > m_Assembly
Definition: id_mapper.hpp:232
map< string, int > TAccToVerMap
Definition: id_mapper.hpp:239
CObject –.
Definition: ncbiobj.hpp:180
CSeq_loc_Mapper –.
bool operator<(const CEquivRange &A, const CEquivRange &B)
bool operator==(const CEquivRange &A, const CEquivRange &B)
CRange< Position > Map(const CRange< Position > &target, const CRange< Position > &range)
Definition: blast_aux.cpp:826
string
Definition: cgiapp.hpp:690
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_XALGOID_MAPPER_EXPORT
Definition: ncbi_export.h:1057
@ eGC_SequenceRole_top_level
#define Loc
objects::CGC_TypedSeqId::E_Choice E_Choice
Definition: id_mapper.hpp:82
objects::CGC_SeqIdAlias::E_AliasTypes E_Alias
Definition: id_mapper.hpp:83
string ToString(const wxRect &rc)
Definition: wx_utils.cpp:773
Modified on Fri Sep 20 14:57:01 2024 by modify_doxy.py rev. 669887