NCBI C++ ToolKit
idmapper_gcassembly.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: idmapper_gcassembly.cpp 99483 2023-04-04 17:43:43Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mike DiCuccio, Aleksey Grichenko
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <objmgr/scope.hpp>
35 
45 
47 #include <objects/seq/Seq_ext.hpp>
49 
50 #include <serial/serial.hpp>
51 #include <serial/iterator.hpp>
53 
54 
57 
58 
60  const CGC_Assembly& assm,
61  EAliasMapping mapping,
62  const string& alias_scope)
63  : m_Scope(&scope)
64 {
    // m_Scope is bound to the caller's scope in the initializer list; the
    // alias mappings for `assm` are then loaded as part of construction,
    // using the requested destination type (`mapping`) and `alias_scope`.
65  AddAliasMappings(assm, mapping, alias_scope);
66 }
67 
68 
70  : m_Scope(&scope)
71 {
    // Only binds m_Scope. No mappings are loaded by this constructor;
    // they have to be added separately (e.g. through AddAliasMappings()).
72 }
73 
74 
76  EAliasMapping mapping,
77  const string& alias_scope)
78 {
    // Thin forwarder: hands the assembly, the destination type and the
    // alias scope unchanged to the x_AddAliasMappings() worker.
79  x_AddAliasMappings(assm, mapping, alias_scope);
80 }
81 
82 
84 {
    // Delegates the lookup to the parent mapper. An empty result is not
    // handed back to the caller: it is turned into a CIdMapperException
    // (eBadSeqId) whose message comes from MapErrorString(from).
85  CSeq_id_Handle id = TParent::Map(from);
86  if ( !id ) {
87  NCBI_THROW(CIdMapperException, eBadSeqId, MapErrorString(from));
88  }
89  return id;
90 }
91 
92 
94  const CSeq_id_Handle& dst_id)
95 {
    // Registers src_id -> dst_id as given. For text-based seq-id types it
    // additionally registers a copy of src_id with version, name and
    // release reset, mapped to the same destination. Any seq-id type that
    // falls into the `default` label only gets the first mapping.
96  AddMapping(CSeq_id_Handle::GetHandle(src_id), dst_id);
97  // Try to create accession-only id, add mapping to the same destination.
98  CSeq_id acc_id;
99  acc_id.Assign(src_id);
100  CTextseq_id* txt_id = nullptr;
    // Obtain a mutable CTextseq_id from the copy. Each case calls the
    // setter of the variant src_id reports through Which().
    // NOTE(review): presumably calling SetXxx() on an already-selected
    // variant keeps the copied data -- confirm, otherwise the accession
    // would be lost here.
101  switch ( src_id.Which() ) {
102  case CSeq_id::e_Genbank:
103  txt_id = &acc_id.SetGenbank();
104  break;
105  case CSeq_id::e_Embl:
106  txt_id = &acc_id.SetEmbl();
107  break;
108  case CSeq_id::e_Pir:
109  txt_id = &acc_id.SetPir();
110  break;
112  txt_id = &acc_id.SetSwissprot();
113  break;
114  case CSeq_id::e_Other:
115  txt_id = &acc_id.SetOther();
116  break;
117  case CSeq_id::e_Ddbj:
118  txt_id = &acc_id.SetDdbj();
119  break;
120  case CSeq_id::e_Prf:
121  txt_id = &acc_id.SetPrf();
122  break;
123  case CSeq_id::e_Tpg:
124  txt_id = &acc_id.SetTpg();
125  break;
126  case CSeq_id::e_Tpe:
127  txt_id = &acc_id.SetTpe();
128  break;
129  case CSeq_id::e_Tpd:
130  txt_id = &acc_id.SetTpd();
131  break;
132  case CSeq_id::e_Gpipe:
133  txt_id = &acc_id.SetGpipe();
134  break;
136  txt_id = &acc_id.SetNamed_annot_track();
137  break;
138  default:
139  return; // Non-text seq-id, ignore.
140  }
141  _ASSERT(txt_id);
    // Drop the version, name and release fields of the copy before
    // registering it as a second source id for dst_id.
142  txt_id->ResetVersion();
143  txt_id->ResetName();
144  txt_id->ResetRelease();
145  AddMapping(CSeq_id_Handle::GetHandle(acc_id), dst_id);
146 }
147 
148 
150 {
152 }
153 
154 
156  EAliasMapping map_to,
157  const string& alias_scope)
158 {
    // Registers id mappings for one sequence record according to map_to,
    // then recurses into the sequences nested under it (bottom of this
    // function). In the lines visible here alias_scope is only passed
    // along to that recursive call.
159  if (map_to == eAccVer) {
161  }
162 

    // Step 1: scan the synonyms for the one matching the requested
    // destination type and remember it in dst_alias. The eAccVer branches
    // call x_AddAccessionMapping() instead and do not set dst_alias.
163  if (seq.IsSetSeq_id_synonyms()) {
164  CSeq_id_Handle dst_alias;
166  switch ((*it)->Which()) {
168  if (map_to == eAccVer) {
169  x_AddAccessionMapping((*it)->GetGenbank().GetPublic());
170  if ( (*it)->GetGenbank().IsSetGpipe() ) {
171  x_AddAccessionMapping((*it)->GetGenbank().GetGpipe());
172  }
173  }
174  else if (map_to == eGenBank && (*it)->GetGenbank().IsSetGi()) {
175  dst_alias =
176  CSeq_id_Handle::GetHandle((*it)->GetGenbank().GetGi());
177  }
178  else if (map_to == eGenBankAcc) {
179  dst_alias =
180  CSeq_id_Handle::GetHandle((*it)->GetGenbank().GetPublic());
181  }
182  break;
184  if (map_to == eAccVer) {
185  x_AddAccessionMapping((*it)->GetRefseq().GetPublic());
186  if ( (*it)->GetRefseq().IsSetGpipe() ) {
187  x_AddAccessionMapping((*it)->GetRefseq().GetGpipe());
188  }
189  }
190  else if (map_to == eRefSeq && (*it)->GetRefseq().IsSetGi()) {
191  dst_alias =
192  CSeq_id_Handle::GetHandle((*it)->GetRefseq().GetGi());
193  }
194  else if (map_to == eRefSeqAcc) {
195  dst_alias =
196  CSeq_id_Handle::GetHandle((*it)->GetRefseq().GetPublic());
197  }
198  break;
200  if (map_to == eAccVer) {
201  x_AddAccessionMapping((*it)->GetExternal().GetId());
202  }
203  else if (map_to == eUCSC &&
204  (*it)->GetExternal().GetExternal() == "UCSC") {
205  dst_alias =
206  CSeq_id_Handle::GetHandle((*it)->GetExternal().GetId());
207  }
208  break;
210  // Ignore private accessions?
211  if (map_to == eOther) {
212  dst_alias =
213  CSeq_id_Handle::GetHandle((*it)->GetPrivate());
214  }
215  break;
216 
217  default:
218  break;
219  }
220  }
221 
    // Step 2: a destination was found, so point the synonyms of this
    // sequence at it.
222  if (dst_alias) {
224 
226  seq.GetSeq_id_synonyms()) {
227  switch ((*it)->Which()) {
229  {{
230  const CGC_SeqIdAlias& alias = (*it)->GetGenbank();
    // The public accession is skipped as a source when it is itself the
    // destination type (eGenBankAcc); likewise the GI for eGenBank.
231  if (map_to != eGenBankAcc) {
232  x_AddUnversionedMapping(alias.GetPublic(), dst_alias);
233  }
234  if (alias.IsSetGpipe()) {
235  x_AddUnversionedMapping(alias.GetGpipe(), dst_alias);
236  }
    // NOTE(review): GetGi() is read below without the IsSetGi() check
    // that step 1 applies -- confirm an alias without a GI cannot reach
    // this line.
237  if (map_to != eGenBank) {
238  AddMapping(CSeq_id_Handle::GetHandle(alias.GetGi()), dst_alias);
239  }
240  }}
241  break;
242 
244  {{
245  const CGC_SeqIdAlias& alias = (*it)->GetRefseq();
246  x_AddUnversionedMapping(alias.GetPublic(), dst_alias);
247  if (alias.IsSetGpipe()) {
248  x_AddUnversionedMapping(alias.GetGpipe(), dst_alias);
249  }
    // NOTE(review): same unguarded GetGi() as in the GenBank case above;
    // unlike that case, nothing here is skipped based on map_to.
250  AddMapping(CSeq_id_Handle::GetHandle(alias.GetGi()), dst_alias);
251  }}
252  break;
253 
255  {{
256  AddMapping(CSeq_id_Handle::GetHandle((*it)->GetPrivate()), dst_alias);
257 
258  /// HACK:
259  /// here go a bunch of scary modifications to the
260  /// input data
261 
    // For a local private id, also register a "chr"-prefixed local id
    // (built from the string "lcl|chr" + the local string) for the same
    // destination.
    // NOTE(review): GetLocal().GetStr() is called without an IsStr()
    // check, unlike the "_random" branch further down -- confirm.
262  if ((*it)->GetPrivate().IsLocal()) {
263  string s =
264  (*it)->GetPrivate().GetLocal().GetStr();
265 
266  CSeq_id id;
267  id.Set("lcl|chr" + s);
268  AddMapping(CSeq_id_Handle::GetHandle(id), dst_alias);
269  }
270 
271  /// END HACK
272  }}
273  break;
274 
276  {{
277  const CSeq_id& id = (*it)->GetExternal().GetId();
278  AddMapping(CSeq_id_Handle::GetHandle(id), dst_alias);
279  }}
280  break;
281 
282  default:
    // Any other synonym type is treated as an error: assert, then throw
    // CIdMapperException (eBadSeqId) with the offending object written
    // out as ASN.1 text in the message.
283  _ASSERT(false);
285  str << MSerial_AsnText << **it;
286  NCBI_THROW(CIdMapperException, eBadSeqId,
287  "Unhandled ID type in GC-Assembly: " +
289  (string)CNcbiOstrstreamToString(str),
291  }
292  }
293  }
294  else if (map_to != eAccVer) {
295  ///
296  /// check for UCSC-style random chromosomes
297  ///
298  const CSeq_id& id = seq.GetSeq_id();
299  if (id.IsLocal() && id.GetLocal().IsStr() &&
300  id.GetLocal().GetStr().find("_random") != string::npos &&
301  seq.IsSetStructure()) {
    // Build a synthetic delta bioseq for the "_random" sequence so that
    // Seq-loc mappers can be created from it below.
302  CRef<CBioseq> bioseq(new CBioseq());
303 
304  /// local ID is our only ID
305  /// HACK: make this 'chr' aware!
306  /// note: this should be replaced with a complete list of
307  /// possible synonyms for this sequence
308  CRef<CSeq_id> lcl(new CSeq_id);
309  lcl->SetLocal().SetStr(id.GetLocal().GetStr());
310 
    // chr_lcl stays null when the name already starts with "chr"; when it
    // is created, it is pushed onto the bioseq id list ahead of lcl.
311  CRef<CSeq_id> chr_lcl;
312  if ( !NStr::StartsWith(id.GetLocal().GetStr(), "chr") ) {
313  chr_lcl.Reset(new CSeq_id);
314  chr_lcl->SetLocal().SetStr("chr" + id.GetLocal().GetStr());
315  bioseq->SetId().push_back(chr_lcl);
316  }
317  bioseq->SetId().push_back(lcl);
318  /// END HACK
319 
320  /// inst should be the delta-seq from the structure
321  bioseq->SetInst().SetRepr(CSeq_inst::eRepr_delta);
322  bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
    // NOTE(review): the const_cast hands seq's own structure object to
    // the new bioseq as a mutable CDelta_ext -- confirm nothing
    // downstream modifies it.
323  bioseq->SetInst().SetExt()
324  .SetDelta(const_cast<CDelta_ext&>(seq.GetStructure()));
325 
    // The synthetic bioseq is added to the scope; the resulting handle is
    // what the eUCSC mapper below is built from.
326  CBioseq_Handle bsh = m_Scope->AddBioseq(*bioseq);
327 
328  /// build the Seq-loc-Mapper for this
329  /// depending on our direction, we may need to map differently
330  /// NOTE: the random delta-seqs are provided as chr*_random ->
331  /// scaffold; we need to map these the other way in some cases
332  switch (map_to) {
333  case eUCSC:
334  {{
    // Segments -> top-level sequence: the mapper is cached under every
    // seq-id that the iterator finds inside the structure.
335  CRef<CSeq_loc_Mapper> mapper
336  (new CSeq_loc_Mapper
337  (1, bsh, CSeq_loc_Mapper::eSeqMap_Up));
339  id_iter(seq.GetStructure());
340 
341  for ( ; id_iter; ++id_iter) {
342  CSeq_id_Handle idh =
343  CSeq_id_Handle::GetHandle(*id_iter);
344  m_Cache[idh].dest_mapper = mapper;
345  }
346  }}
347  break;
348 
349  case eRefSeq:
350  {{
    // Here the mapper is cached under the local id(s) of the synthetic
    // bioseq: lcl always, chr_lcl only when it was created above.
351  CRef<CSeq_loc_Mapper> mapper
352  (new CSeq_loc_Mapper
354  m_Cache[CSeq_id_Handle::GetHandle(*lcl)].dest_mapper = mapper;
355  if (chr_lcl) {
356  m_Cache[CSeq_id_Handle::GetHandle(*chr_lcl)].dest_mapper = mapper;
357  }
358  }}
359  break;
360 
361  default:
362  break;
363  }
364 
365  ///
366  /// END HACK
367  }
368  }
369  }
370 
    // Apply the same processing to every sequence held in the nested
    // tagged sequence sets.
371  if (seq.IsSetSequences()) {
373  ITERATE (CGC_TaggedSequences::TSeqs, i, (*it)->GetSeqs()) {
374  x_AddAliasMappings(**i, map_to, alias_scope);
375  }
376  }
377  }
378 }
379 
380 
382  EAliasMapping map_to,
383  const string& alias_scope)
384 {
    // Walks one assembly unit: first its replicons (Mols), then its
    // other_sequences, forwarding sequences to x_AddAliasMappings()
    // with the same map_to and alias_scope.
385  if (assm.IsSetMols()) {
386  ITERATE (CGC_AssemblyUnit::TMols, iter, assm.GetMols()) {
387  const CGC_Replicon& mol = **iter;
388 
    // A replicon holds either a single sequence or a set of sequences;
    // in the set case each member is processed in turn.
389  if (mol.GetSequence().IsSingle()) {
391  map_to, alias_scope);
392  }
393  else {
395  mol.GetSequence().GetSet()) {
396  x_AddAliasMappings(**it, map_to, alias_scope);
397  }
398  }
399  }
400  }
401 
    // other_sequences is a list of tagged sets; every sequence of every
    // set is processed.
402  if (assm.IsSetOther_sequences()) {
404  assm.GetOther_sequences()) {
405  ITERATE (CGC_TaggedSequences::TSeqs, i, (*it)->GetSeqs()) {
406  x_AddAliasMappings(**i, map_to, alias_scope);
407  }
408  }
409  }
410 }
411 
412 
414  EAliasMapping map_to,
415  const string& alias_scope)
416 {
    // Dispatches on the assembly variant. A unit is handed to the
    // assembly-unit overload directly. For an assembly set, every entry
    // of More_assemblies (when present) is processed recursively.
    // NOTE(review): the call whose argument tail is "map_to, alias_scope"
    // right after the IsAssembly_set() test presumably processes the
    // set's primary assembly -- its first line is not visible here.
417  if (assm.IsUnit()) {
418  x_AddAliasMappings(assm.GetUnit(), map_to, alias_scope);
419  }
420  else if (assm.IsAssembly_set()) {
422  map_to, alias_scope);
423  if (assm.GetAssembly_set().IsSetMore_assemblies()) {
425  iter, assm.GetAssembly_set().GetMore_assemblies()) {
426  x_AddAliasMappings(**iter, map_to, alias_scope);
427  }
428  }
429  }
430 }
431 
432 
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
void x_AddUnversionedMapping(const CSeq_id &src_id, const CSeq_id_Handle &dst_id)
void AddAliasMappings(const CGC_Assembly &assm, EAliasMapping mapping, const string &alias_scope=kEmptyStr)
EAliasMapping
Mapping destination type.
Definition: idmapper.hpp:375
@ eRefSeq
RefSeq GI.
Definition: idmapper.hpp:378
@ eGenBank
GenBank GI.
Definition: idmapper.hpp:376
@ eOther
Private id.
Definition: idmapper.hpp:381
@ eUCSC
External UCSC id.
Definition: idmapper.hpp:380
@ eGenBankAcc
GenBank accession.
Definition: idmapper.hpp:377
@ eAccVer
Find versioned accession.
Definition: idmapper.hpp:382
@ eRefSeqAcc
RefSeq accession.
Definition: idmapper.hpp:379
CRef< CScope > m_Scope
Definition: idmapper.hpp:422
virtual CSeq_id_Handle Map(const CSeq_id_Handle &)
Definition: idmapper.cpp:111
void x_AddAccessionMapping(const CSeq_id &id)
CIdMapperGCAssembly(CScope &scope, const CGC_Assembly &assm, EAliasMapping mapping, const string &alias_scope=kEmptyStr)
Establish mappings based on a GC-Assembly.
void x_AddAliasMappings(const CGC_Assembly &seq, EAliasMapping mapping, const string &alias_scope)
virtual CSeq_id_Handle Map(const CSeq_id_Handle &)
Map a single given CSeq_id_Handle to another.
Definition: idmapper.cpp:111
static std::string MapErrorString(const CSeq_id_Handle &)
Definition: idmapper.cpp:189
TMapperCache m_Cache
Definition: idmapper.hpp:117
virtual void AddMapping(const CSeq_id_Handle &from, const CSeq_id_Handle &to)
Add a mapping to the internal mapping table.
Definition: idmapper.cpp:65
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
CScope –.
Definition: scope.hpp:92
CSeq_loc_Mapper –.
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CSeq_id & Set(const CTempString &the_id, TParseFlags flags=fParse_AnyRaw)
Reassign based on flat specifications; arguments interpreted as with constructors.
Definition: Seq_id.cpp:2457
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
@ eSeqMap_Down
map from a segmented bioseq to segments
@ eSeqMap_Up
map from segments to the top level bioseq
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string PrintableString(const CTempString str, TPrintableMode mode=fNewLine_Quote|fNonAscii_Passthru)
Get a printable version of the specified string.
Definition: ncbistr.cpp:3944
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
@ fNewLine_Quote
Display "\n" instead of actual linebreak.
Definition: ncbistr.hpp:2730
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
bool IsSetGpipe(void) const
The gpipe accession. Check if a value has been assigned to Gpipe data member.
list< CRef< CGC_Sequence > > TSeqs
const TUnit & GetUnit(void) const
Get the variant data.
const TStructure & GetStructure(void) const
Get the Structure member data.
bool IsSingle(void) const
Check if variant Single is selected.
const TGi & GetGi(void) const
Get the Gi member data.
bool IsSetMols(void) const
Collections of molecules for this assembly. Check if a value has been assigned to Mols data member.
list< CRef< CGC_TypedSeqId > > TSeq_id_synonyms
bool IsAssembly_set(void) const
Check if variant Assembly_set is selected.
bool IsSetMore_assemblies(void) const
Check if a value has been assigned to More_assemblies data member.
bool IsSetOther_sequences(void) const
On primary assembly-unit: here will be the unplaced sequences On alt-loci: list of sequences aligned/...
const TOther_sequences & GetOther_sequences(void) const
Get the Other_sequences member data.
const TPrimary_assembly & GetPrimary_assembly(void) const
Get the Primary_assembly member data.
bool IsSetStructure(void) const
locations of ordered scaffolds/components Check if a value has been assigned to Structure data member...
const TSeq_id_synonyms & GetSeq_id_synonyms(void) const
Get the Seq_id_synonyms member data.
list< CRef< CGC_TaggedSequences > > TSequences
const TAssembly_set & GetAssembly_set(void) const
Get the variant data.
const TMols & GetMols(void) const
Get the Mols member data.
list< CRef< CGC_Replicon > > TMols
bool IsSetSequences(void) const
placed: populated both on chromosome and scaffold levels unlocalized: populated on chromosome level C...
const TSequence & GetSequence(void) const
Get the Sequence member data.
const TGpipe & GetGpipe(void) const
Get the Gpipe member data.
const TSequences & GetSequences(void) const
Get the Sequences member data.
bool IsUnit(void) const
Check if variant Unit is selected.
list< CRef< CGC_Assembly > > TMore_assemblies
list< CRef< CGC_Sequence > > TSet
const TSingle & GetSingle(void) const
Get the variant data.
const TPublic & GetPublic(void) const
Get the Public member data.
const TSet & GetSet(void) const
Get the variant data.
const TMore_assemblies & GetMore_assemblies(void) const
Get the More_assemblies member data.
bool IsSetSeq_id_synonyms(void) const
Other known identifiers: Local / gpipe-satellite / genbank / refseq Check if a value has been assigne...
list< CRef< CGC_TaggedSequences > > TOther_sequences
const TSeq_id & GetSeq_id(void) const
Get the Seq_id member data.
TNamed_annot_track & SetNamed_annot_track(void)
Select the variant.
Definition: Seq_id_.cpp:551
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seq_id_.cpp:265
TOther & SetOther(void)
Select the variant.
Definition: Seq_id_.cpp:353
void ResetVersion(void)
Reset Version data member.
TTpe & SetTpe(void)
Select the variant.
Definition: Seq_id_.cpp:485
TTpg & SetTpg(void)
Select the variant.
Definition: Seq_id_.cpp:463
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
void ResetRelease(void)
Reset Release data member.
Definition: Textseq_id_.cpp:62
TPir & SetPir(void)
Select the variant.
Definition: Seq_id_.cpp:287
TTpd & SetTpd(void)
Select the variant.
Definition: Seq_id_.cpp:507
TGpipe & SetGpipe(void)
Select the variant.
Definition: Seq_id_.cpp:529
TDdbj & SetDdbj(void)
Select the variant.
Definition: Seq_id_.cpp:397
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
void ResetName(void)
Reset Name data member.
Definition: Textseq_id_.cpp:50
TPrf & SetPrf(void)
Select the variant.
Definition: Seq_id_.cpp:419
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seq_id_.cpp:243
TSwissprot & SetSwissprot(void)
Select the variant.
Definition: Seq_id_.cpp:309
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
USING_SCOPE(objects)
int i
#define _ASSERT
Modified on Fri Sep 20 14:58:24 2024 by modify_doxy.py rev. 669887