NCBI C++ ToolKit
Seq_id.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: Seq_id.cpp 102245 2024-04-10 19:03:39Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: .......
27  *
28  * File Description:
29  * .......
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using specifications from the ASN data definition file
34  * 'seqloc.asn'.
35  */
36 
37 // standard includes
38 
39 // generated includes
40 #include <ncbi_pch.hpp>
41 #include <corelib/ncbiutil.hpp>
42 #include <corelib/ncbi_param.hpp>
43 #include <util/line_reader.hpp>
44 #include <util/static_map.hpp>
45 #include <util/util_misc.hpp>
47 #include <serial/serialimpl.hpp>
48 
49 #include <objects/seq/Bioseq.hpp>
50 #include <objects/seq/Seq_inst.hpp>
52 
58 
60 
63 #include <objects/general/Date.hpp>
69 #include <corelib/ncbistre.hpp>
70 
71 #include "accguide2.inc"
72 
73 
74 #define NCBI_USE_ERRCODE_X Objects_SeqId
namespace
{
/// Character predicate: operator() returns true for a character that is
/// below 32, above 127, or present in kSymbols, and false otherwise.
///
/// Where plain char is signed, bytes with the high bit set are negative and
/// are therefore caught by the "< 32" test rather than by "> 127".
struct CSeq_id_find_pred
{
    /// NUL-terminated list of extra characters to flag.  A null pointer is
    /// treated as an empty list.
    const char* kSymbols;

    /// @param ch  character to classify
    /// @return true if ch is flagged (see the struct description)
    ///
    /// Declared const so that the predicate can be invoked through a const
    /// object or const reference.
    bool operator()(const char ch) const
    {
        if (ch < 32  ||  ch > 127) { // non-ASCII is not allowed
            return true;
        }
        // Guard against an unset symbol list before handing it to strchr().
        return kSymbols != 0  &&  strchr(kSymbols, ch) != 0;
    }
};

};
89 
90 
91 // generated classes
92 
94 BEGIN_objects_SCOPE // namespace ncbi::objects::
95 
// Database tag names listed as supported in raw form.  The entries are kept
// in their existing order (upper-case names first, then the "db..." names);
// do not reorder them.
static const char* sc_SupportedRawDbtags[] = {
    "ATGC",         "BCMHGSC",      "BERKELEY",     "CELERA",
    "GSDB",         "HOOD",         "LANLCHGS",     "LRG",
    "MIPS",         "NCBI_EXT_ACC", "NCBI_GENOMES", "NCBI_MITO",
    "PGEC",         "PID",          "SGD",          "SHGC",
    "SRA",          "TIGR",         "UOKNOR",       "UWGC",
    "WASHU",        "WIBR",         "WUGSC",
    "dbGSS",        "dbSTS"
};
125 
126 
127 // CSeqIdException
128 const char* CSeqIdException::GetErrCodeString(void) const
129 {
130  switch (GetErrCode()) {
131  case eUnknownType: return "eUnknownType";
132  case eFormat: return "eFormat";
133  default: return CException::GetErrCodeString();
134  }
135 }
136 
137 
138 // constructor
140 {
141  return;
142 }
143 
144 // destructor
146 {
147  return;
148 }
149 
150 
151 static void s_SplitVersion(const CTempString& acc_in, CTempString& acc,
152  int& ver)
153 {
154  CTempString verstr;
155  NStr::SplitInTwo(acc_in, ".", acc, verstr);
156  if (verstr.empty()) {
157  ver = 0;
158  } else {
159  ver = NStr::StringToNonNegativeInt(verstr);
160  if (ver <= 0) {
161  NCBI_THROW(CSeqIdException, eFormat,
162  "Version embedded in accession " + string(acc_in)
163  + " is not a positive integer");
164  }
165  }
166 }
167 
168 
170 {
171  switch ( Which() ) {
172  case e_Genbank:
173  return &GetGenbank();
174  case e_Embl:
175  return &GetEmbl();
176  case e_Ddbj:
177  return &GetDdbj();
178  case e_Pir:
179  return &GetPir();
180  case e_Swissprot:
181  return &GetSwissprot();
182  case e_Other:
183  return &GetOther();
184  case e_Prf:
185  return &GetPrf();
186  case e_Tpg:
187  return &GetTpg();
188  case e_Tpe:
189  return &GetTpe();
190  case e_Tpd:
191  return &GetTpd();
192  case e_Gpipe:
193  return &GetGpipe();
194  case e_Named_annot_track:
195  return &GetNamed_annot_track();
196  default:
197  return 0;
198  }
199 }
200 
201 
202 inline
203 void x_Assign(CObject_id& dst, const CObject_id& src)
204 {
205  switch ( src.Which() ) {
207  dst.Reset();
208  return;
209  case CObject_id::e_Id:
210  dst.SetId(src.GetId());
211  return;
212  case CObject_id::e_Str:
213  dst.SetStr(src.GetStr());
214  return;
215  default:
216  NCBI_THROW(CSeqIdException, eFormat, "invalid Object-id variant");
217  }
218 }
219 
220 
221 inline
222 void x_Assign(CGiimport_id& dst, const CGiimport_id& src)
223 {
224  dst.SetId(src.GetId());
225  if ( src.IsSetDb() ) {
226  dst.SetDb(src.GetDb());
227  }
228  else {
229  dst.ResetDb();
230  }
231  if ( src.IsSetRelease() ) {
232  dst.SetRelease(src.GetRelease());
233  }
234  else {
235  dst.ResetRelease();
236  }
237 }
238 
239 
240 inline
241 void x_Assign(CTextseq_id& dst, const CTextseq_id& src)
242 {
243  if ( src.IsSetName() ) {
244  dst.SetName(src.GetName());
245  }
246  else {
247  dst.ResetName();
248  }
249  if ( src.IsSetAccession() ) {
250  dst.SetAccession(src.GetAccession());
251  }
252  else {
253  dst.ResetAccession();
254  }
255  if ( src.IsSetRelease() ) {
256  dst.SetRelease(src.GetRelease());
257  }
258  else {
259  dst.ResetRelease();
260  }
261  if ( src.IsSetVersion() ) {
262  dst.SetVersion(src.GetVersion());
263  }
264  else {
265  dst.ResetVersion();
266  }
267 }
268 
269 
270 inline
271 void x_Assign(CDbtag& dst, const CDbtag& src)
272 {
273  dst.SetDb(src.GetDb());
274  x_Assign(dst.SetTag(), src.GetTag());
275 }
276 
277 
278 inline
279 void x_Assign(CPatent_seq_id& dst, const CPatent_seq_id& src)
280 {
281  dst.SetSeqid(src.GetSeqid());
282  dst.SetCit().Assign(src.GetCit());
283 }
284 
285 
286 inline
287 void x_Assign(CDate& dst, const CDate& src)
288 {
289  dst.Assign(src);
290 }
291 
292 
293 inline
294 void x_Assign(CPDB_seq_id& dst, const CPDB_seq_id& src)
295 {
296  dst.SetMol().Set(src.GetMol());
297  if ( src.IsSetChain() ) {
298  dst.SetChain(src.GetChain());
299  }
300  else {
301  dst.ResetChain();
302  }
303  if ( src.IsSetChain_id() ) {
304  dst.SetChain_id(src.GetChain_id());
305  }
306  else {
307  dst.ResetChain_id();
308  }
309  if ( src.IsSetRel() ) {
310  dst.SetRel().Assign(src.GetRel());
311  }
312  else {
313  dst.ResetRel();
314  }
315 }
316 
317 
319 {
320  if ( GetTypeInfo() == obj.GetThisTypeInfo() ) {
321  const CSeq_id& id = static_cast<const CSeq_id&>(obj);
322  switch ( id.Which() ) {
323  case e_not_set:
324  Reset();
325  return;
326  case e_Local:
327  x_Assign(SetLocal(), id.GetLocal());
328  return;
329  case e_Gibbsq:
330  SetGibbsq(id.GetGibbsq());
331  return;
332  case e_Gibbmt:
333  SetGibbmt(id.GetGibbmt());
334  return;
335  case e_Giim:
336  x_Assign(SetGiim(), id.GetGiim());
337  return;
338  case e_Pir:
339  x_Assign(SetPir(), id.GetPir());
340  return;
341  case e_Swissprot:
343  return;
344  case e_Patent:
345  x_Assign(SetPatent(), id.GetPatent());
346  return;
347  case e_Other:
348  x_Assign(SetOther(), id.GetOther());
349  return;
350  case e_General:
351  x_Assign(SetGeneral(), id.GetGeneral());
352  return;
353  case e_Gi:
354  SetGi(id.GetGi());
355  return;
356  case e_Prf:
357  x_Assign(SetPrf(), id.GetPrf());
358  return;
359  case e_Pdb:
360  x_Assign(SetPdb(), id.GetPdb());
361  return;
362  case e_Genbank:
363  x_Assign(SetGenbank(), id.GetGenbank());
364  return;
365  case e_Embl:
366  x_Assign(SetEmbl(), id.GetEmbl());
367  return;
368  case e_Ddbj:
369  x_Assign(SetDdbj(), id.GetDdbj());
370  return;
371  case e_Tpg:
372  x_Assign(SetTpg(), id.GetTpg());
373  return;
374  case e_Tpe:
375  x_Assign(SetTpe(), id.GetTpe());
376  return;
377  case e_Tpd:
378  x_Assign(SetTpd(), id.GetTpd());
379  return;
380  case e_Gpipe:
381  x_Assign(SetGpipe(), id.GetGpipe());
382  return;
383  case e_Named_annot_track:
385  return;
386  }
387  }
388  CSerialObject::Assign(obj, how);
389 }
390 
391 
392 inline bool CanCmpAcc(CSeq_id::E_Choice choice)
393 {
394  switch ( choice ) {
395  case CSeq_id::e_Genbank:
396  case CSeq_id::e_Embl:
397  case CSeq_id::e_Ddbj:
398  case CSeq_id::e_Tpg:
399  case CSeq_id::e_Tpe:
400  case CSeq_id::e_Tpd:
401  case CSeq_id::e_Gpipe:
403  return true;
404  default:
405  return false;
406  }
407 }
408 
409 
410 // Compare() - are SeqIds equivalent?
412 {
413  if ( Which() != sid2.Which() ) { // Only one case where this will work
414  if (!CanCmpAcc(Which()) || !CanCmpAcc(sid2.Which())) {
415  return e_DIFF;
416  }
417  const CTextseq_id *tsip1 = GetTextseq_Id();
418  if ( !tsip1 )
419  return e_DIFF;
420 
421  const CTextseq_id *tsip2 = sid2.GetTextseq_Id();
422  if ( !tsip2 )
423  return e_DIFF;
424 
425  if ( tsip1->Match(*tsip2) ) // id Textseq_id match
426  return e_YES;
427  else
428  return e_NO;
429  }
430 
431  switch ( Which() ) { // Now we only need to know one
432  case e_Local:
433  return GetLocal().Match(sid2.GetLocal()) ? e_YES : e_NO;
434  case e_Gibbsq:
435  return GetGibbsq() == sid2.GetGibbsq() ? e_YES : e_NO;
436  case e_Gibbmt:
437  return GetGibbmt() == sid2.GetGibbmt() ? e_YES : e_NO;
438  case e_Giim:
439  return GetGiim().GetId() == sid2.GetGiim().GetId() ? e_YES : e_NO;
440  case e_Pir:
441  return GetPir().Match(sid2.GetPir()) ? e_YES : e_NO;
442  case e_Swissprot:
443  return GetSwissprot().Match(sid2.GetSwissprot()) ? e_YES : e_NO;
444  case e_Patent:
445  return GetPatent().Match(sid2.GetPatent()) ? e_YES : e_NO;
446  case e_Other:
447  return GetOther().Match(sid2.GetOther()) ? e_YES : e_NO;
448  case e_General:
449  if ( GetGeneral().Match(sid2.GetGeneral()) ) {
450  return e_YES;
451  }
452  else if ( NStr::CompareNocase(GetGeneral().GetDb(),
453  sid2.GetGeneral().GetDb()) ) {
454  return e_DIFF;
455  }
456  return e_NO;
457  case e_Gi:
458  return GetGi() == sid2.GetGi() ? e_YES : e_NO;
459  case e_Prf:
460  return GetPrf().Match(sid2.GetPrf()) ? e_YES : e_NO;
461  case e_Pdb:
462  return GetPdb().Match(sid2.GetPdb()) ? e_YES : e_NO;
463  case e_Genbank:
464  return GetGenbank().Match(sid2.GetGenbank()) ? e_YES : e_NO;
465  case e_Embl:
466  return GetEmbl().Match(sid2.GetEmbl()) ? e_YES : e_NO;
467  case e_Ddbj:
468  return GetDdbj().Match(sid2.GetDdbj()) ? e_YES : e_NO;
469  case e_Tpg:
470  return GetTpg().Match(sid2.GetTpg()) ? e_YES : e_NO;
471  case e_Tpe:
472  return GetTpe().Match(sid2.GetTpe()) ? e_YES : e_NO;
473  case e_Tpd:
474  return GetTpd().Match(sid2.GetTpd()) ? e_YES : e_NO;
475  case e_Gpipe:
476  return GetGpipe().Match(sid2.GetGpipe()) ? e_YES : e_NO;
477  case e_Named_annot_track:
479  ? e_YES : e_NO;
480  default:
481  return e_error;
482  }
483 }
484 
485 
486 int CSeq_id::CompareOrdered(const CSeq_id& sid2) const
487 {
488  int ret = Which() - sid2.Which();
489  if ( ret != 0 ) {
490  return ret;
491  }
492  const CTextseq_id *tsip1 = GetTextseq_Id();
493  const CTextseq_id *tsip2 = sid2.GetTextseq_Id();
494  if ( tsip1 && tsip2 ) {
495  return tsip1->Compare(*tsip2);
496  }
497  switch ( Which() ) { // Now we only need to know one
498  case e_Local:
499  return GetLocal().Compare(sid2.GetLocal());
500  case e_Gibbsq:
501  return GetGibbsq() - sid2.GetGibbsq();
502  case e_Gibbmt:
503  return GetGibbmt() - sid2.GetGibbmt();
504  case e_Giim:
505  return GetGiim().GetId() - sid2.GetGiim().GetId();
506  case e_Patent:
507  return GetPatent().Compare(sid2.GetPatent());
508  case e_General:
509  return GetGeneral().Compare(sid2.GetGeneral());
510  case e_Gi:
511  if ( GetGi() < sid2.GetGi() ) {
512  return -1;
513  }
514  else {
515  return GetGi() > sid2.GetGi();
516  }
517  case e_Pdb:
518  return GetPdb().Compare(sid2.GetPdb());
519  default:
520  return 0;
521  }
522 }
523 
525 // used for binary searching; must be in order.
526 static const TChoiceMapEntry sc_ChoiceArray[] = {
527  { "???", CSeq_id::e_not_set },
528  { "bbm", CSeq_id::e_Gibbmt },
529  { "bbs", CSeq_id::e_Gibbsq },
530  { "dbj", CSeq_id::e_Ddbj },
531 // removed aliases, see s_HasFastaTag and x_Init
532 // { "ddbj", CSeq_id::e_Ddbj },
533  { "emb", CSeq_id::e_Embl },
534 // { "embl", CSeq_id::e_Embl },
535  { "gb", CSeq_id::e_Genbank },
536 // { "genbank", CSeq_id::e_Genbank },
537 // { "general", CSeq_id::e_General },
538  { "gi", CSeq_id::e_Gi },
539 // { "gibbmt", CSeq_id::e_Gibbmt },
540  { "gibbsq", CSeq_id::e_Gibbsq },
541 // { "giim", CSeq_id::e_Giim },
542  { "gim", CSeq_id::e_Giim },
543  { "gnl", CSeq_id::e_General },
544 // { "gpipe", CSeq_id::e_Gpipe },
545  { "gpp", CSeq_id::e_Gpipe },
546  { "lcl", CSeq_id::e_Local },
547 // { "local", CSeq_id::e_Local },
548 // { "named_annot_track", CSeq_id::e_Named_annot_track },
549  { "nat", CSeq_id::e_Named_annot_track },
550  { "not_set", CSeq_id::e_not_set },
551 // { "oth", CSeq_id::e_Other }, // deprecated vs. ref
552 // { "other", CSeq_id::e_Other },
553  { "pat", CSeq_id::e_Patent },
554 // { "patent", CSeq_id::e_Patent },
555  { "pdb", CSeq_id::e_Pdb },
556  { "pgp", CSeq_id::e_Patent },
557  { "pir", CSeq_id::e_Pir },
558  { "prf", CSeq_id::e_Prf },
559  { "ref", CSeq_id::e_Other },
560  { "sp", CSeq_id::e_Swissprot },
561 // { "swissprot", CSeq_id::e_Swissprot },
562  { "tpd", CSeq_id::e_Tpd },
563  { "tpe", CSeq_id::e_Tpe },
564  { "tpg", CSeq_id::e_Tpg },
565  { "tr", CSeq_id::e_Swissprot }
566 };
570 
571 
572 static const char* const s_TextId[CSeq_id::e_MaxChoice+1] =
573 { // FASTA_LONG formats
574  "???" , // not-set = ???
575  "lcl", // local = lcl|integer or string
576  "bbs", // gibbsq = bbs|integer
577  "bbm", // gibbmt = bbm|integer
578  "gim", // giim = gim|integer
579  "gb", // genbank = gb|accession|locus
580  "emb", // embl = emb|accession|locus
581  "pir", // pir = pir|accession|name
582  "sp", // swissprot = sp|accession|name *OR* tr|accession|name
583  "pat", // patent = pat|country|patent number (string)|seq number (integer)
584  // *OR* pgp|country|application number|seq number
585  "ref", // other = ref|accession|name|release - changed from oth to ref
586  "gnl", // general = gnl|database(string)|id (string or number)
587  "gi", // gi = gi|integer
588  "dbj", // ddbj = dbj|accession|locus
589  "prf", // prf = prf|accession|name
590  "pdb", // pdb = pdb|entry name (string)|chain id (char)
591  "tpg", // tpg = tpg|accession|name
592  "tpe", // tpe = tpe|accession|name
593  "tpd", // tpd = tpd|accession|name
594  "gpp", // gpipe = gpp|accession|name
595  "nat", // named_annot_track = nat|accession|name
596  "" // Placeholder for end of list
597 };
598 
600 {
601  TChoiceMap::const_iterator it = sc_ChoiceMap.find(SeqIdCode);
602  if (it == sc_ChoiceMap.end()) {
603  return e_not_set;
604  } else {
605  return it->second;
606  }
607 }
608 
609 const char* CSeq_id::WhichFastaTag(E_Choice choice)
610 {
611  if (choice >= 0 && choice < ArraySize(s_TextId)) {
612  return s_TextId[choice];
613  } else {
614  return kEmptyCStr;
615  }
616 }
617 
618 static inline bool s_HasFastaTag(const CTempString& s)
619 {
620  // > rather than >= because there should be content after the bar.
621  if (s.size() > 3 && s[2] == '|') {
622  return true;
623  } else if (s.size() > 4 && s[3] == '|') {
624  return true;
625  } else {
626  return false;
627  }
628 }
629 
630 
631 
633 {
634  // > rather than >= because there should be content after the bar.
635  if (s.size() > 3 && s[2] == '|') {
636  return CSeq_id::WhichInverseSeqId(s.substr(0, 2));
637  } else if (s.size() > 4 && s[3] == '|') {
638  return CSeq_id::WhichInverseSeqId(s.substr(0, 3));
639  } else {
640  return CSeq_id::e_not_set;
641  }
642 }
643 
644 
646 // used for binary searching; must be in order.
648  { "ambiguous_nuc", CSeq_id::eAcc_ambiguous_nuc },
649  { "ddbj_con", CSeq_id::eAcc_ddbj_con },
650  { "ddbj_dirsub", CSeq_id::eAcc_ddbj_dirsub },
651  { "ddbj_est", CSeq_id::eAcc_ddbj_est },
652  { "ddbj_genome", CSeq_id::eAcc_ddbj_genome },
653  { "ddbj_gss", CSeq_id::eAcc_ddbj_gss },
654  { "ddbj_htgs", CSeq_id::eAcc_ddbj_htgs },
655  { "ddbj_mga", CSeq_id::eAcc_ddbj_mga },
656  { "ddbj_mrna", CSeq_id::eAcc_ddbj_mrna },
657  { "ddbj_other", CSeq_id::eAcc_ddbj_other },
658  { "ddbj_other_nuc", CSeq_id::eAcc_ddbj_other_nuc },
659  { "ddbj_patent", CSeq_id::eAcc_ddbj_patent },
660  { "ddbj_prot", CSeq_id::eAcc_ddbj_prot },
661  { "ddbj_targeted_nuc", CSeq_id::eAcc_ddbj_targeted_nuc },
662  { "ddbj_targetedm_nuc", CSeq_id::eAcc_ddbj_targetedm_nuc },
663  { "ddbj_targetedv_nuc", CSeq_id::eAcc_ddbj_targetedv_nuc },
664  { "ddbj_targetedvm_nuc", CSeq_id::eAcc_ddbj_targetedvm_nuc },
665  { "ddbj_tpa_chromosome", CSeq_id::eAcc_ddbj_tpa_chromosome },
666  { "ddbj_tpa_con", CSeq_id::eAcc_ddbj_tpa_con },
667  { "ddbj_tpa_nuc", CSeq_id::eAcc_ddbj_tpa_nuc },
668  { "ddbj_tpa_other", CSeq_id::eAcc_ddbj_tpa_other },
669  { "ddbj_tpa_prot", CSeq_id::eAcc_ddbj_tpa_prot },
670  { "ddbj_tpa_targeted_nuc", CSeq_id::eAcc_ddbj_tpa_targeted_nuc },
671  { "ddbj_tpa_targetedm_nuc", CSeq_id::eAcc_ddbj_tpa_targetedm_nuc },
672  { "ddbj_tpa_targetedv_nuc", CSeq_id::eAcc_ddbj_tpa_targetedv_nuc },
673  { "ddbj_tpa_targetedvm_nuc", CSeq_id::eAcc_ddbj_tpa_targetedvm_nuc },
674  { "ddbj_tpa_tsa_nuc", CSeq_id::eAcc_ddbj_tpa_tsa_nuc },
675  { "ddbj_tpa_tsa_prot", CSeq_id::eAcc_ddbj_tpa_tsa_prot },
676  { "ddbj_tpa_tsam_nuc", CSeq_id::eAcc_ddbj_tpa_tsam_nuc },
677  { "ddbj_tpa_tsam_prot", CSeq_id::eAcc_ddbj_tpa_tsam_prot },
678  { "ddbj_tpa_tsav_nuc", CSeq_id::eAcc_ddbj_tpa_tsav_nuc },
679  { "ddbj_tpa_tsav_prot", CSeq_id::eAcc_ddbj_tpa_tsav_prot },
680  { "ddbj_tpa_tsavm_nuc", CSeq_id::eAcc_ddbj_tpa_tsavm_nuc },
681  { "ddbj_tpa_tsavm_prot", CSeq_id::eAcc_ddbj_tpa_tsavm_prot },
682  { "ddbj_tpa_wgs_nuc", CSeq_id::eAcc_ddbj_tpa_wgs_nuc },
683  { "ddbj_tpa_wgs_prot", CSeq_id::eAcc_ddbj_tpa_wgs_prot },
684  { "ddbj_tpa_wgsm_nuc", CSeq_id::eAcc_ddbj_tpa_wgsm_nuc },
685  { "ddbj_tpa_wgsm_prot", CSeq_id::eAcc_ddbj_tpa_wgsm_prot },
686  { "ddbj_tpa_wgsv_nuc", CSeq_id::eAcc_ddbj_tpa_wgsv_nuc },
687  { "ddbj_tpa_wgsv_prot", CSeq_id::eAcc_ddbj_tpa_wgsv_prot },
688  { "ddbj_tpa_wgsvm_nuc", CSeq_id::eAcc_ddbj_tpa_wgsvm_nuc },
689  { "ddbj_tpa_wgsvm_prot", CSeq_id::eAcc_ddbj_tpa_wgsvm_prot },
690  { "ddbj_tsa_nuc", CSeq_id::eAcc_ddbj_tsa_nuc },
691  { "ddbj_tsa_prot", CSeq_id::eAcc_ddbj_tsa_prot },
692  { "ddbj_tsam_nuc", CSeq_id::eAcc_ddbj_tsam_nuc },
693  { "ddbj_tsam_prot", CSeq_id::eAcc_ddbj_tsam_prot },
694  { "ddbj_tsav_nuc", CSeq_id::eAcc_ddbj_tsav_nuc },
695  { "ddbj_tsav_prot", CSeq_id::eAcc_ddbj_tsav_prot },
696  { "ddbj_tsavm_nuc", CSeq_id::eAcc_ddbj_tsavm_nuc },
697  { "ddbj_tsavm_prot", CSeq_id::eAcc_ddbj_tsavm_prot },
698  { "ddbj_wgs_nuc", CSeq_id::eAcc_ddbj_wgs_nuc },
699  { "ddbj_wgs_prot", CSeq_id::eAcc_ddbj_wgs_prot },
700  { "ddbj_wgsm_nuc", CSeq_id::eAcc_ddbj_wgsm_nuc },
701  { "ddbj_wgsm_prot", CSeq_id::eAcc_ddbj_wgsm_prot },
702  { "ddbj_wgsv_nuc", CSeq_id::eAcc_ddbj_wgsv_nuc },
703  { "ddbj_wgsv_prot", CSeq_id::eAcc_ddbj_wgsv_prot },
704  { "ddbj_wgsvm_nuc", CSeq_id::eAcc_ddbj_wgsvm_nuc },
705  { "ddbj_wgsvm_prot", CSeq_id::eAcc_ddbj_wgsvm_prot },
706  { "embl_con", CSeq_id::eAcc_embl_con },
707  { "embl_ddbj", CSeq_id::eAcc_embl_ddbj },
708  { "embl_dirsub", CSeq_id::eAcc_embl_dirsub },
709  { "embl_est", CSeq_id::eAcc_embl_est },
710  { "embl_genome", CSeq_id::eAcc_embl_genome },
711  { "embl_gss", CSeq_id::eAcc_embl_gss },
712  { "embl_htgs", CSeq_id::eAcc_embl_htgs },
713  { "embl_mga", CSeq_id::eAcc_embl_mga },
714  { "embl_other", CSeq_id::eAcc_embl_other },
715  { "embl_other_nuc", CSeq_id::eAcc_embl_other_nuc },
716  { "embl_patent", CSeq_id::eAcc_embl_patent },
717  { "embl_prot", CSeq_id::eAcc_embl_prot },
718  { "embl_tpa_nuc", CSeq_id::eAcc_embl_tpa_nuc },
719  { "embl_tpa_other", CSeq_id::eAcc_embl_tpa_other },
720  { "embl_tpa_prot", CSeq_id::eAcc_embl_tpa_prot },
721  { "embl_tpa_tsa_nuc", CSeq_id::eAcc_embl_tpa_tsa_nuc },
722  { "embl_tpa_tsa_prot", CSeq_id::eAcc_embl_tpa_tsa_prot },
723  { "embl_tpa_tsam_nuc", CSeq_id::eAcc_embl_tpa_tsam_nuc },
724  { "embl_tpa_tsam_prot", CSeq_id::eAcc_embl_tpa_tsam_prot },
725  { "embl_tpa_tsav_nuc", CSeq_id::eAcc_embl_tpa_tsav_nuc },
726  { "embl_tpa_tsav_prot", CSeq_id::eAcc_embl_tpa_tsav_prot },
727  { "embl_tpa_tsavm_nuc", CSeq_id::eAcc_embl_tpa_tsavm_nuc },
728  { "embl_tpa_tsavm_prot", CSeq_id::eAcc_embl_tpa_tsavm_prot },
729  { "embl_tpa_wgs_nuc", CSeq_id::eAcc_embl_tpa_wgs_nuc },
730  { "embl_tpa_wgs_prot", CSeq_id::eAcc_embl_tpa_wgs_prot },
731  { "embl_tpa_wgsm_nuc", CSeq_id::eAcc_embl_tpa_wgsm_nuc },
732  { "embl_tpa_wgsm_prot", CSeq_id::eAcc_embl_tpa_wgsm_prot },
733  { "embl_tpa_wgsv_nuc", CSeq_id::eAcc_embl_tpa_wgsv_nuc },
734  { "embl_tpa_wgsv_prot", CSeq_id::eAcc_embl_tpa_wgsv_prot },
735  { "embl_tpa_wgsvm_nuc", CSeq_id::eAcc_embl_tpa_wgsvm_nuc },
736  { "embl_tpa_wgsvm_prot", CSeq_id::eAcc_embl_tpa_wgsvm_prot },
737  { "embl_tsa_nuc", CSeq_id::eAcc_embl_tsa_nuc },
738  { "embl_tsa_prot", CSeq_id::eAcc_embl_tsa_prot },
739  { "embl_tsam_nuc", CSeq_id::eAcc_embl_tsam_nuc },
740  { "embl_tsam_prot", CSeq_id::eAcc_embl_tsam_prot },
741  { "embl_tsav_nuc", CSeq_id::eAcc_embl_tsav_nuc },
742  { "embl_tsav_prot", CSeq_id::eAcc_embl_tsav_prot },
743  { "embl_tsavm_nuc", CSeq_id::eAcc_embl_tsavm_nuc },
744  { "embl_tsavm_prot", CSeq_id::eAcc_embl_tsavm_prot },
745  { "embl_wgs_nuc", CSeq_id::eAcc_embl_wgs_nuc },
746  { "embl_wgs_prot", CSeq_id::eAcc_embl_wgs_prot },
747  { "embl_wgsm_nuc", CSeq_id::eAcc_embl_wgsm_nuc },
748  { "embl_wgsm_prot", CSeq_id::eAcc_embl_wgsm_prot },
749  { "embl_wgsv_nuc", CSeq_id::eAcc_embl_wgsv_nuc },
750  { "embl_wgsv_prot", CSeq_id::eAcc_embl_wgsv_prot },
751  { "embl_wgsvm_nuc", CSeq_id::eAcc_embl_wgsvm_nuc },
752  { "embl_wgsvm_prot", CSeq_id::eAcc_embl_wgsvm_prot },
753  { "gb_backbone", CSeq_id::eAcc_gb_backbone },
754  { "gb_cdna", CSeq_id::eAcc_gb_cdna },
755  { "gb_chromosome", CSeq_id::eAcc_gb_chromosome },
756  { "gb_con", CSeq_id::eAcc_gb_con },
757  { "gb_ddbj", CSeq_id::eAcc_gb_ddbj },
758  { "gb_dirsub", CSeq_id::eAcc_gb_dirsub },
759  { "gb_embl", CSeq_id::eAcc_gb_embl },
760  { "gb_embl_ddbj", CSeq_id::eAcc_gb_embl_ddbj },
761  { "gb_est", CSeq_id::eAcc_gb_est },
762  { "gb_genome", CSeq_id::eAcc_gb_genome },
763  { "gb_gsdb", CSeq_id::eAcc_gb_gsdb },
764  { "gb_gss", CSeq_id::eAcc_gb_gss },
765  { "gb_htgs", CSeq_id::eAcc_gb_htgs },
766  { "gb_mga", CSeq_id::eAcc_gb_mga },
767  { "gb_optical_map", CSeq_id::eAcc_gb_optical_map },
768  { "gb_other", CSeq_id::eAcc_gb_other },
769  { "gb_other_nuc", CSeq_id::eAcc_gb_other_nuc },
770  { "gb_patent", CSeq_id::eAcc_gb_patent },
771  { "gb_patent_prot", CSeq_id::eAcc_gb_patent_prot },
772  { "gb_prot", CSeq_id::eAcc_gb_prot },
773  { "gb_segset", CSeq_id::eAcc_gb_segset },
774  { "gb_sts", CSeq_id::eAcc_gb_sts },
775  { "gb_targeted_nuc", CSeq_id::eAcc_gb_targeted_nuc },
776  { "gb_tpa_chromosome", CSeq_id::eAcc_gb_tpa_chromosome },
777  { "gb_tpa_con", CSeq_id::eAcc_gb_tpa_con },
778  { "gb_tpa_nuc", CSeq_id::eAcc_gb_tpa_nuc },
779  { "gb_tpa_other", CSeq_id::eAcc_gb_tpa_other },
780  { "gb_tpa_prot", CSeq_id::eAcc_gb_tpa_prot },
781  { "gb_tpa_segset", CSeq_id::eAcc_gb_tpa_segset },
782  { "gb_tpa_wgs_nuc", CSeq_id::eAcc_gb_tpa_wgs_nuc },
783  { "gb_tpa_wgs_prot", CSeq_id::eAcc_gb_tpa_wgs_prot },
784  { "gb_tpa_wgsm_nuc", CSeq_id::eAcc_gb_tpa_wgsm_nuc },
785  { "gb_tpa_wgsm_prot", CSeq_id::eAcc_gb_tpa_wgsm_prot },
786  { "gb_tpa_wgsv_nuc", CSeq_id::eAcc_gb_tpa_wgsv_nuc },
787  { "gb_tpa_wgsv_prot", CSeq_id::eAcc_gb_tpa_wgsv_prot },
788  { "gb_tpa_wgsvm_nuc", CSeq_id::eAcc_gb_tpa_wgsvm_nuc },
789  { "gb_tpa_wgsvm_prot", CSeq_id::eAcc_gb_tpa_wgsvm_prot },
790  { "gb_tsa_nuc", CSeq_id::eAcc_gb_tsa_nuc },
791  { "gb_tsa_prot", CSeq_id::eAcc_gb_tsa_prot },
792  { "gb_tsam_nuc", CSeq_id::eAcc_gb_tsam_nuc },
793  { "gb_tsam_prot", CSeq_id::eAcc_gb_tsam_prot },
794  { "gb_tsav_nuc", CSeq_id::eAcc_gb_tsav_nuc },
795  { "gb_tsav_prot", CSeq_id::eAcc_gb_tsav_prot },
796  { "gb_tsavm_nuc", CSeq_id::eAcc_gb_tsavm_nuc },
797  { "gb_tsavm_prot", CSeq_id::eAcc_gb_tsavm_prot },
798  { "gb_wgs_nuc", CSeq_id::eAcc_gb_wgs_nuc },
799  { "gb_wgs_prot", CSeq_id::eAcc_gb_wgs_prot },
800  { "gb_wgsm_nuc", CSeq_id::eAcc_gb_wgsm_nuc },
801  { "gb_wgsm_prot", CSeq_id::eAcc_gb_wgsm_prot },
802  { "gb_wgsv_nuc", CSeq_id::eAcc_gb_wgsv_nuc },
803  { "gb_wgsv_prot", CSeq_id::eAcc_gb_wgsv_prot },
804  { "gb_wgsvm_nuc", CSeq_id::eAcc_gb_wgsvm_nuc },
805  { "gb_wgsvm_prot", CSeq_id::eAcc_gb_wgsvm_prot },
806  { "general", CSeq_id::eAcc_general },
807  { "general_nuc", CSeq_id::eAcc_general_nuc },
808  { "general_prot", CSeq_id::eAcc_general_prot },
809  { "gi", CSeq_id::eAcc_gi },
810  { "gibbmt", CSeq_id::eAcc_gibbmt },
811  { "gibbsq", CSeq_id::eAcc_gibbsq },
812  { "giim", CSeq_id::eAcc_giim },
813  { "gpipe_chromosome", CSeq_id::eAcc_gpipe_chromosome },
814  { "gpipe_genomic", CSeq_id::eAcc_gpipe_genomic },
815  { "gpipe_mrna", CSeq_id::eAcc_gpipe_mrna },
816  { "gpipe_ncrna", CSeq_id::eAcc_gpipe_ncrna },
817  { "gpipe_other_nuc", CSeq_id::eAcc_gpipe_other_nuc },
818  { "gpipe_prot", CSeq_id::eAcc_gpipe_prot },
819  { "gpipe_scaffold", CSeq_id::eAcc_gpipe_scaffold },
820  { "gpipe_unreserved", CSeq_id::eAcc_gpipe_unreserved },
821  { "gsdb_dirsub", CSeq_id::eAcc_gsdb_dirsub },
822  { "local", CSeq_id::eAcc_local },
823  { "maybe_ddbj", CSeq_id::eAcc_maybe_ddbj },
824  { "maybe_embl", CSeq_id::eAcc_maybe_embl },
825  { "maybe_gb", CSeq_id::eAcc_maybe_gb },
826  { "named_annot_track", CSeq_id::eAcc_named_annot_track },
827  { "patent", CSeq_id::eAcc_patent },
828  { "pdb", CSeq_id::eAcc_pdb },
829  { "pir", CSeq_id::eAcc_pir },
830  { "prf", CSeq_id::eAcc_prf },
831  { "refseq_chromosome", CSeq_id::eAcc_refseq_chromosome },
832  { "refseq_chromosome_ncbo", CSeq_id::eAcc_refseq_chromosome_ncbo },
833  { "refseq_contig", CSeq_id::eAcc_refseq_contig },
834  { "refseq_contig_ncbo", CSeq_id::eAcc_refseq_contig_ncbo },
835  { "refseq_genome", CSeq_id::eAcc_refseq_genome },
836  { "refseq_genomic", CSeq_id::eAcc_refseq_genomic },
837  { "refseq_mrna", CSeq_id::eAcc_refseq_mrna },
838  { "refseq_mrna_predicted", CSeq_id::eAcc_refseq_mrna_predicted },
839  { "refseq_ncrna", CSeq_id::eAcc_refseq_ncrna },
840  { "refseq_ncrna_predicted", CSeq_id::eAcc_refseq_ncrna_predicted },
841  { "refseq_prot", CSeq_id::eAcc_refseq_prot },
842  { "refseq_prot_predicted", CSeq_id::eAcc_refseq_prot_predicted },
843  { "refseq_unique_prot", CSeq_id::eAcc_refseq_unique_prot },
844  { "refseq_unreserved", CSeq_id::eAcc_refseq_unreserved },
845  { "refseq_wgs_intermed", CSeq_id::eAcc_refseq_wgs_intermed },
846  { "refseq_wgs_nuc", CSeq_id::eAcc_refseq_wgs_nuc },
847  { "refseq_wgs_prot", CSeq_id::eAcc_refseq_wgs_prot },
848  { "refseq_wgsm_intermed", CSeq_id::eAcc_refseq_wgsm_intermed },
849  { "refseq_wgsm_nuc", CSeq_id::eAcc_refseq_wgsm_nuc },
850  { "refseq_wgsm_prot", CSeq_id::eAcc_refseq_wgsm_prot },
851  { "refseq_wgsv_intermed", CSeq_id::eAcc_refseq_wgsv_intermed },
852  { "refseq_wgsv_nuc", CSeq_id::eAcc_refseq_wgsv_nuc },
853  { "refseq_wgsv_prot", CSeq_id::eAcc_refseq_wgsv_prot },
854  { "refseq_wgsvm_intermed", CSeq_id::eAcc_refseq_wgsvm_intermed },
855  { "refseq_wgsvm_nuc", CSeq_id::eAcc_refseq_wgsvm_nuc },
856  { "refseq_wgsvm_prot", CSeq_id::eAcc_refseq_wgsvm_prot },
857  { "swissprot", CSeq_id::eAcc_swissprot },
858  { "unknown", CSeq_id::eAcc_unknown },
859  { "unreserved_nuc", CSeq_id::eAcc_unreserved_nuc },
860  { "unreserved_prot", CSeq_id::eAcc_unreserved_prot }
861 };
865 
// The ten decimal digit characters, as a NUL-terminated string.
static constexpr char kDigits[] = "0123456789";

// Maximum number of varying final digits for which it's practical to
// use a bit vector; this BV-based representation additionally
// requires a constant alphabetical prefix.  (In other situations,
// this code sticks to a traditional representation that has no such
// limits but doesn't scale as well to large numbers of special cases.)
static constexpr unsigned int kMaxSmallSpecialDigits = 9;
874  1,
875  10,
876  100,
877  1000,
878  10000,
879  100000,
880  1000000,
881  10000000,
882  100000000,
883  1000000000
884 };
885 
886 struct SAccGuide : public CObject
887 {
890  typedef pair<string, TAccInfo> TPair;
891  typedef list<TPair> TPairs; // not vector -- need stable ptrs
892  typedef map<string, TPair> TBigSpecialMap; // last -> first -> value
893  typedef pair<bm::bvector<>, TAccInfo> TSmallSpecialOption;
895  typedef unsigned int TFormatCode;
896  typedef pair<string, string> TFallback; // fallback, refinement
898 
899  struct SSubMap {
904  };
906 
907  struct SHints {
909  : prev_type(CSeq_id::eAcc_unknown), prev_submap(NULL),
911  prev_special_type(CSeq_id::eAcc_unknown),
912  prev_special_base_type(CSeq_id::eAcc_unknown),
914  {}
915 
917  TAccInfo FindSpecial(const SAccGuide& guide, TFormatCode fmt,
918  CTempString acc_or_range);
920 
935  unique_ptr<string> special2_old_name;
938  unsigned int version;
939  };
940 
941  SAccGuide(void);
942  SAccGuide(const string& filename)
943  : count(0)
944  { x_Load(filename); }
946  : count(0)
947  { x_Load(lr); }
948 
949  void AddRule(const CTempString& rule, SHints& hints);
950  const TAccInfo& Find(TFormatCode fmt, const CTempString& acc_or_pfx,
951  string* key_used = NULL) const;
952  static TFormatCode s_Key(unsigned short letters, unsigned short digits)
953  { return TFormatCode(letters) << 16 | digits; }
954 
955  unsigned int count;
959 
960 private:
961  void x_Load(const string& filename);
962  void x_Load(ILineReader& lr);
963  void x_InitGeneral(void);
964  void x_AddSpecial(SSubMap& submap, SHints& hints, TFormatCode fmt,
966  const string* old_name, const CTempString& new_name);
968  TFormatCode fmt);
969 };
970 
972  = static_cast<SAccGuide::TAccInfo>(-1);
973 
974 inline
976 {
977  if (name == prev_type_name) {
978  return prev_type;
979  } else {
980  TAccInfoMap::const_iterator it = sc_AccInfoMap.find(name);
981  if (it == sc_AccInfoMap.end()) {
982  return kUnrecognized;
983  } else {
984  prev_special_key.clear();
985  prev_type_name = it->first;
986  return prev_type = it->second;
987  }
988  }
989 }
990 
991 inline
993  TFormatCode fmt,
994  CTempString acc_or_range)
995 {
996  CTempString pfx(acc_or_range, 0, fmt >> 16);
997  if (fmt == prev_special_format) {
998  if (acc_or_range == prev_special_key) {
999  prev_special_type = prev_type;
1000  prev_special_type_name = prev_type_name;
1001  return prev_type;
1002  } else if (pfx == prev_special_base_key) {
1003  return prev_special_base_type;
1004  }
1005  }
1006  prev_special_format = fmt;
1007  prev_special_base_key = pfx;
1008  prev_special_base_type = guide.Find(fmt, pfx);
1009  return prev_special_base_type;
1010 }
1011 
1012 inline
1015 {
1016  if (prev_submap != NULL && prev_submap->first == fmt) {
1017  return prev_submap->second;
1018  } else {
1020  if (it == rules.end() || it->first != fmt) {
1021  it = rules.insert(it, make_pair(fmt, SAccGuide::SSubMap()));
1022  }
1023  prev_submap = &*it;
1024  prev_big_special = it->second.big_specials.end();
1025  prev_small_special = it->second.small_specials.end();
1026  return it->second;
1027  }
1028 }
1029 
1030 void SAccGuide::AddRule(const CTempString& rule, SHints& hints)
1031 {
1032  CTempString tmp1, tmp2;
1033  vector<CTempStringEx> tokens;
1034  SIZE_TYPE pos, pos2;
1035 
1036  ++count;
1037  tmp1.assign(rule, 0, rule.find('#')); // strip comment
1038  if (tmp1.empty())
1039  return;
1040  tokens.reserve(3);
1041  NStr::Split(tmp1, " \t", tokens,
1043  if (tokens.empty()) {
1044  return;
1045  } else if (tokens.size() == 2
1046  && NStr::EqualNocase(tokens[0], "version")) {
1047  hints.special2_submap = nullptr;
1049  if (hints.version > 2 || hints.version < 1) {
1050  ERR_POST_X(2, "SAccGuide::AddRule: " << count
1051  << ": Unsupported version " << tokens[1]);
1052  return;
1053  }
1054  } else if ((pos = tokens[0].find('+')) != NPOS
1055  && (tokens.size() == 3
1056  || (tokens.size() == 4 && tokens[3] == "*"))) {
1057  hints.special2_submap = nullptr;
1058  // _VERIFY(NStr::SplitInTwo(tokens[0], "+", tmp1, tmp2));
1059  tmp1.assign(tokens[0], 0, pos);
1060  tmp2.assign(tokens[0], pos + 1, NPOS);
1061  TFormatCode fmt
1064  TAccInfo value = hints.FindAccInfo(tokens[2]);
1065  unique_ptr<string> old_name;
1066  if (value == kUnrecognized) {
1067  string key_used;
1068  TAccInfo old = Find(fmt, tokens[1], &key_used);
1069  old_name.reset(new string);
1070  if (old != CSeq_id::eAcc_unknown) {
1072  if (old == hints.prev_type) {
1073  *old_name = hints.prev_type_name;
1074  } else {
1075  *old_name = "0x" + NStr::UIntToString(old, 0, 16);
1076  }
1077  if ( !key_used.empty() ) {
1078  key_used = " (per " + key_used + ')';
1079  }
1080  ERR_POST_X(8, Info << "SAccGuide::AddRule: " << count
1081  << ": ignoring refinement of " << tokens[1]
1082  << " from " << *old_name << key_used
1083  << " to unrecognized accession type " << tokens[2]);
1084  } else {
1085  auto it = hints.default_fallbacks.find(tokens[2]);
1086  if (it != hints.default_fallbacks.end()) {
1087  *old_name = it->second;
1088  value = TAccInfo(hints.FindAccInfo(*old_name)
1090  ERR_POST_X(17,
1091  Info << "SAccGuide::AddRule: " << count
1092  << ": using default fallback from " << tokens[2]
1093  << " to " << *old_name << " for " << tokens[1]);
1094  } else {
1095  *old_name = "unknown";
1096  ERR_POST_X(3,
1097  "SAccGuide::AddRule: " << count
1098  << ": unrecognized accession type " << tokens[2]
1099  << " for " << tokens[1]);
1100  }
1101  }
1102  }
1103  if (value != kUnrecognized) {
1104  SSubMap& submap = hints.FindSubMap(rules, fmt);
1105  if (tokens.size() == 4) {
1107  }
1108  const TAccInfo* value_ptr = NULL;
1109  if (tokens[1].find_first_of("?*") == NPOS) {
1110  value_ptr = &(submap.prefixes[tokens[1]] = value);
1111  } else {
1112  // Account for possible refinements of fallback definitions
1113  NON_CONST_ITERATE (TPairs, wit, submap.wildcards) {
1114  if (wit->first == tokens[1]) {
1115  wit->second = value;
1116  value_ptr = &wit->second;
1117  break;
1118  }
1119  }
1120  if (value_ptr == NULL) {
1121  submap.wildcards.push_back(TPair(tokens[1], value));
1122  value_ptr = &submap.wildcards.back().second;
1123  }
1124  }
1125  _ASSERT(*value_ptr == value);
1126  if ((value & CSeq_id::fAcc_fallback) != 0) {
1127  _ASSERT(old_name.get() != NULL && !old_name->empty());
1128  fallbacks[value_ptr] = make_pair(*old_name, tokens[2]);
1129  } else {
1130  _ASSERT(old_name.get() == NULL);
1131  }
1132  }
1133  } else if (tokens.size() == 3 && NStr::EqualNocase(tokens[0], "special")) {
1134  hints.special2_submap = nullptr;
1135  pos = tokens[1].find_first_of(kDigits);
1136  pos2 = tokens[1].find('-', pos);
1137  TFormatCode fmt
1138  = s_Key(pos, ((pos2 == NPOS) ? tokens[1].size() : pos2) - pos);
1139  TAccInfo old = hints.FindSpecial(*this, fmt, tokens[1]);
1140  TAccInfo value = hints.FindAccInfo(tokens[2]);
1141  if ((old & CSeq_id::fAcc_specials) != 0) {
1142  old = TAccInfo(old & ~CSeq_id::fAcc_specials);
1143  } else {
1144  string key_used;
1145  Find(fmt, tokens[1].substr(0, pos2), &key_used);
1146  if ( !key_used.empty() ) {
1147  ERR_POST_X(13, Warning
1148  << "SAccGuide::AddRule: Main listing for special "
1149  << tokens[1]
1150  << " doesn't indicate that specials are present.");
1151  }
1152  }
1153  unique_ptr<string> old_name;
1154  if (value == kUnrecognized) {
1155  string key_used;
1156  Find(fmt, tokens[1].substr(0, pos2), &key_used);
1157  old_name.reset(new string);
1158  if ( !key_used.empty() ) {
1159  auto it = hints.default_fallbacks.find(tokens[2]);
1160  if (it != hints.default_fallbacks.end()) {
1161  old = CSeq_id::eAcc_unknown;
1162  *old_name = it->second;
1163  value = TAccInfo(hints.FindAccInfo(*old_name)
1165  ERR_POST_X(17,
1166  Info << "SAccGuide::AddRule: " << count
1167  << ": using default fallback from " << tokens[2]
1168  << " to " << *old_name << " for " << tokens[1]);
1169  }
1170  }
1171  if (old) {
1173  if (old == hints.prev_type) {
1174  *old_name = hints.prev_type_name;
1175  } else if (old == hints.prev_special_type) {
1176  *old_name = hints.prev_special_type_name;
1177  } else {
1178  *old_name = "0x" + NStr::UIntToString(old, 0, 16);
1179  }
1180  if ( !key_used.empty() ) {
1181  key_used = " (per " + key_used + ')';
1182  }
1183  ERR_POST_X(4, Info << "SAccGuide::AddRule: " << count
1184  << ": unrecognized accession type " << tokens[2]
1185  << " for special case " << tokens[1]
1186  << "; falling back to " << *old_name << key_used);
1187  } else if (old_name->empty()) {
1188  *old_name = "unknown";
1189  ERR_POST_X(9, Warning << "SAccGuide::AddRule: " << count
1190  << ": unrecognized accession type " << tokens[2]
1191  << " for stray(!) special case " << tokens[1]);
1192  }
1193  } else {
1194  _ASSERT(hints.prev_type == value);
1195  hints.prev_special_key = tokens[1];
1196  }
1197  if (value != kUnrecognized) {
1198  SSubMap& submap = hints.FindSubMap(rules, fmt);
1199  if (pos2 == NPOS) {
1200  tmp1 = tmp2 = tokens[1];
1201  } else {
1202  tmp1.assign(tokens[1], 0, pos2);
1203  tmp2.assign(tokens[1], pos2 + 1, NPOS);
1204  }
1205  x_AddSpecial(submap, hints, fmt, tmp1, tmp2, value, old_name.get(),
1206  tokens[2]);
1207  }
1208  hints.prev_special_type_name.clear();
1210  } else if (tokens.size() >= 3
1211  && NStr::EqualNocase(tokens[0], "special2")) {
1212  if (hints.version < 2) {
1213  ERR_POST_X(18,
1214  Warning << "SAccGuide::AddRule: " << count
1215  << ": special2 valid only in version 2+ guides");
1216  }
1217  NStr::SplitInTwo(tokens[1], "+", tmp1, tmp2);
1218  auto digits = NStr::StringToNumeric<unsigned short>(tmp2);
1219  hints.prev_special_format = s_Key(tmp1.size(), digits);
1220  hints.special2_name = tokens[2];
1221  hints.special2_old_name.reset();
1222  hints.special2_submap
1223  = &hints.FindSubMap(rules, hints.prev_special_format);
1224  hints.prev_special2_acc = tmp1 + string(digits, '0');
1225  TAccInfo old = hints.FindSpecial(*this, hints.prev_special_format,
1226  hints.prev_special2_acc);
1227  string why;
1228  if ((old & CSeq_id::fAcc_specials) != 0) {
1229  old = TAccInfo(old & ~CSeq_id::fAcc_specials);
1230  } else {
1231  Find(hints.prev_special_format, hints.prev_special2_acc, &why);
1232  if ( !why.empty() ) {
1233  ERR_POST_X(13, Warning
1234  << "SAccGuide::AddRule: Main listing for special "
1235  << tokens[1]
1236  << " doesn't indicate that specials are present.");
1237  }
1238  }
1239  for (size_t i = 2; i < tokens.size(); ++i) {
1240  hints.special2_type = hints.FindAccInfo(tokens[i]);
1241  if (hints.special2_type != kUnrecognized) {
1242  if (i > 2) {
1243  hints.special2_old_name.reset(new string(tokens[i]));
1244  }
1245  break;
1246  }
1247  }
1248  if (hints.special2_type == kUnrecognized) {
1249  for (size_t i = 2; i < tokens.size(); ++i) {
1250  auto it = hints.default_fallbacks.find(tokens[i]);
1251  if (it != hints.default_fallbacks.end()) {
1252  hints.special2_type = hints.FindAccInfo(it->second);
1254  hints.special2_old_name.reset(new string(it->second));
1255  why = " (per default fallback for " + tokens[i] + ')';
1256  break;
1257  }
1258  }
1259  if (hints.special2_type == kUnrecognized) {
1260  hints.special2_old_name.reset(new string);
1261  if (old != kUnrecognized) {
1262  hints.special2_type = old;
1263  if (old == hints.prev_type) {
1264  *hints.special2_old_name = hints.prev_type_name;
1265  } else {
1266  *hints.special2_old_name
1267  = "0x" + NStr::UIntToString(old, 0, 16);
1268  }
1269  if ( !why.empty() ) {
1270  why = " (per " + why + ')';
1271  }
1272  } else {
1273  *hints.special2_old_name = "unknown";
1274  ERR_POST_X(9, Warning << "SAccGuide::AddRule: " << count
1275  << ": unrecognized accession type " << tokens[2]
1276  << " for stray(!) special case " << tokens[1]);
1277  }
1278  }
1279  }
1280  if (hints.special2_old_name.get() != nullptr) {
1281  hints.special2_type
1283  ERR_POST_X(4,
1284  Info << "SAccGuide::AddRule: " << count
1285  << ": unrecognized accession type " << tokens[2]
1286  << " for special case " << tokens[1]
1287  << "; falling back to " << *hints.special2_old_name
1288  << why);
1289  }
1290  hints.prev_special_type_name.clear();
1292  } else if (tokens.size() >= 2 && tokens[0] == ":") {
1293  if (hints.version < 2) {
1294  ERR_POST_X(19,
1295  Warning << "SAccGuide::AddRule: " << count
1296  << ": special2 continuation lines valid only in"
1297  " version 2+ guides");
1298  }
1299  if (hints.special2_submap == nullptr) {
1300  ERR_POST_X(20,
1301  Warning <<
1302  "SAccGuide::AddRule: " << count
1303  << ": ignoring misplaced special2 ranges line.");
1304  return;
1305  }
1306  string s;
1307  CTempString from;
1308  char *p = &hints.prev_special2_acc[hints.prev_special2_acc.size()];
1309  for (size_t i = 1; i < tokens.size(); ++i) {
1310  NStr::SplitInTwo(tokens[i], "-", tmp1, tmp2);
1311  memcpy(p - tmp1.size(), tmp1.data(), tmp1.size());
1312  if (tmp2.empty()) {
1313  from = hints.prev_special2_acc;
1314  } else {
1315  s = hints.prev_special2_acc;
1316  from = s;
1317  memcpy(p - tmp2.size(), tmp2.data(), tmp2.size());
1318  }
1319  x_AddSpecial(*hints.special2_submap, hints,
1320  hints.prev_special_format, from,
1321  hints.prev_special2_acc, hints.special2_type,
1322  hints.special2_old_name.get(), hints.special2_name);
1323  }
1324  } else if (tokens.size() == 3 && NStr::EqualNocase(tokens[0], "gnl")) {
1325  hints.special2_submap = nullptr;
1326  string key(tokens[1]);
1327  NStr::ToUpper(key);
1328  TAccInfo value = hints.FindAccInfo(tokens[2]);
1329  if (value == kUnrecognized) {
1331  if (it2 == general.end()) {
1332  ERR_POST_X(3, "SAccGuide::AddRule: " << count
1333  << ": unrecognized accession type " << tokens[2]
1334  << " for " << key);
1335  } else {
1336  string old_name;
1337  if (it2->second == hints.prev_type) {
1338  old_name = hints.prev_type_name;
1339  } else {
1340  old_name = "0x" + NStr::UIntToString(it2->second, 0, 16);
1341  }
1342  it2->second = TAccInfo(it2->second | CSeq_id::fAcc_fallback);
1343  fallbacks[&it2->second] = make_pair(old_name, tokens[2]);
1344  ERR_POST_X(8, Info << "SAccGuide::AddRule: " << count
1345  << ": ignoring refinement of " << key << " from "
1346  << old_name << " to unrecognized accession type "
1347  << tokens[2]);
1348  }
1349  } else {
1350  general[key] = value;
1351  }
1352  } else if (tokens.size() == 3
1353  && NStr::EqualNocase(tokens[0], "fallback")) {
1354  hints.special2_submap = nullptr;
1355  if (hints.version < 2) {
1356  ERR_POST_X(21,
1357  Warning << "SAccGuide::AddRule: " << count
1358  << ": default fallbacks valid only in version 2+"
1359  " guides");
1360  }
1361  if (hints.FindAccInfo(tokens[2]) != kUnrecognized) {
1362  hints.default_fallbacks[tokens[1]] = tokens[2];
1363  }
1364  } else {
1365  ERR_POST_X(5, Warning << "SAccGuide::AddRule: " << count
1366  << ": ignoring invalid line: " << rule);
1367  }
1368 }
1369 
1371  const CTempString& acc_or_pfx,
1372  string* key_used) const
1373 {
1374  static const TAccInfo kUnknown = CSeq_id::eAcc_unknown;
1376  if (it == rules.end()) {
1377  return kUnknown;
1378  }
1379 
1380  const SSubMap& submap = it->second;
1381  const TAccInfo* result = &kUnknown;
1382  CTempString pfx (acc_or_pfx, 0, fmt >> 16);
1383  TPrefixes::const_iterator pit = submap.prefixes.find(pfx);
1384  if (pit != submap.prefixes.end()) {
1385  result = &pit->second;
1386  } else {
1387  ITERATE (TPairs, wit, submap.wildcards) {
1388  if (NStr::MatchesMask(pfx, wit->first)) {
1389  bool bad_match = false; // Limit ? to matching letters
1390  SIZE_TYPE pos = wit->first.find('?');
1391  while (pos != NPOS) {
1392  if ( !isalnum(pfx[pos]) && pfx[pos] != '?' ) {
1393  bad_match = true;
1394  break;
1395  } else {
1396  pos = wit->first.find('?', pos + 1);
1397  }
1398  }
1399  if (bad_match) {
1400  continue;
1401  }
1402  if (key_used && acc_or_pfx != wit->first) {
1403  *key_used = wit->first;
1404  }
1405  result = &wit->second;
1406  break;
1407  }
1408  }
1409  }
1410  if (acc_or_pfx != pfx && (*result & CSeq_id::fAcc_specials) != 0) {
1411  pfx = acc_or_pfx;
1412  auto n = x_SplitSpecial(pfx, fmt);
1413  for (auto ssit = submap.small_specials.lower_bound(pfx);
1414  ssit != submap.small_specials.end() && ssit->first == pfx;
1415  ++ssit) {
1416  if (ssit->second.first[n]) {
1417  if (key_used) {
1418  key_used->erase();
1419  }
1420  return ssit->second.second;
1421  }
1422  }
1424  = submap.big_specials.lower_bound(acc_or_pfx);
1425  if (bsit != submap.big_specials.end()
1426  && !(acc_or_pfx < bsit->second.first) ) {
1427  if (key_used) {
1428  key_used->erase();
1429  }
1430  return bsit->second.second;
1431  } else {
1432  if (key_used && key_used->empty()) {
1433  *key_used = pfx.substr(0, fmt >> 16);
1434  }
1435  return *result;
1436  }
1437  } else /* if (*result != CSeq_id::eAcc_unknown) */ {
1438  return *result;
1439  }
1440 }
1441 
1442 
1444  : count(0)
1445 {
1446  bool file_is_old = false;
1447  {{
1448  string file = g_FindDataFile("accguide2.txt");
1449  CTime builtin_timestamp(static_cast<time_t>(kBuiltInGuide_Timestamp));
1450  if ( !file.empty() &&
1451  !(file_is_old = g_IsDataFileOld(file, builtin_timestamp)) ) {
1452  try {
1453  x_Load(file);
1454  } STD_CATCH_ALL_X(1, "SAccGuide::SAccGuide")
1455  }
1456  }}
1457  if (count == 0) {
1458  if (file_is_old) {
1459  ERR_POST_X(12, Info << "CSeq_id::IdentifyAccession: " // minor lie
1460  "using built-in rules because accguide.txt is older.");
1461  } else {
1462  ERR_POST_X(6, Info << "CSeq_id::IdentifyAccession: "
1463  "falling back on built-in rules.");
1464  }
1465  static const unsigned int kNumBuiltInRules
1466  = sizeof(kBuiltInGuide) / sizeof(*kBuiltInGuide);
1467  SHints hints;
1468  for (unsigned int i = 0; i < kNumBuiltInRules; ++i) {
1469  AddRule(kBuiltInGuide[i], hints);
1470  }
1471  }
1472  for (auto &rit : rules) {
1473  ERASE_ITERATE(TSmallSpecialMap, sit, rit.second.small_specials) {
1474  if (sit->second.first.any()) {
1475  sit->second.first.optimize();
1476  } else {
1477  rit.second.small_specials.erase(sit);
1478  }
1479  }
1480  }
1481  x_InitGeneral();
1482 }
1483 
1485 {
1486  if (general.empty()) {
1487  // Populate with a hard-coded list by default; there are only
1488  // a few tags to worry about, but listing them in accguide.txt
1489  // right away would yield warnings from old Toolkit versions.
1490  static const char* const kNucDBs[] = {
1491  "SRA", "TI", "TR_ASSM_CH", "TRACE_ASSM", "TRACE_CHGR", NULL
1492  };
1493  for (const char* const* p = kNucDBs; *p; ++p) {
1495  }
1496  }
1497 }
1498 
1499 void SAccGuide::x_Load(const string& filename)
1500 {
1502  x_Load(*in);
1503 }
1504 
1506 {
1507  SHints hints;
1508  do {
1509  AddRule(*++in, hints);
1510  } while ( !in.AtEOF() );
1511 }
1512 
1515  const string* old_name,
1516  const CTempString& new_name)
1517 {
1518  CTempString from_pfx = from, to_pfx = to;
1519  auto left = x_SplitSpecial(from_pfx, fmt),
1520  right = x_SplitSpecial(to_pfx, fmt);
1521  const TAccInfo* value_ptr = nullptr;
1522  if (from_pfx != to_pfx) {
1523  hints.prev_big_special
1524  = submap.big_specials.insert(hints.prev_big_special,
1525  make_pair(to, TPair(from, value)));
1526  // Account for possible refinement.
1527  hints.prev_big_special->second.second = value;
1528  /*
1529  if (pos2 == NPOS) {
1530  submap.big_specials[tokens[1]] = TPair(tokens[1], value);
1531  } else {
1532  // _VERIFY(NStr::SplitInTwo(tokens[1], "-", from, to));
1533  from.assign(tokens[1], 0, pos2);
1534  to.assign(tokens[1], pos2 + 1, NPOS);
1535  submap.big_specials[to] = TPair(from, value);
1536  }
1537  */
1538  if ((value & CSeq_id::fAcc_fallback) != 0) {
1539  value_ptr = &hints.prev_big_special->second.second;
1540  }
1541  } else {
1543  if (hints.prev_small_special != submap.small_specials.end()
1544  && hints.prev_small_special->first == from_pfx) {
1545  it = hints.prev_small_special;
1546  it->second.first.clear_range(left, right);
1547  while ((it->second.second & ~CSeq_id::fAcc_fallback)
1548  != (value & ~CSeq_id::fAcc_fallback)) {
1549  if (it == submap.small_specials.begin()
1550  || (--it)->first != from_pfx) {
1551  it = hints.prev_small_special;
1552  ++it;
1553  break;
1554  }
1555  }
1556  } else {
1557  it = submap.small_specials.lower_bound(from_pfx);
1558  }
1559  while (it != submap.small_specials.end()) {
1560  if (it->first != from_pfx) {
1561  it = submap.small_specials.end();
1562  break;
1563  } else if ((it->second.second & ~CSeq_id::fAcc_fallback)
1564  == (value & ~CSeq_id::fAcc_fallback)) {
1565  break;
1566  } else {
1567  ++it;
1568  }
1569  }
1570  if (it != submap.small_specials.end()) {
1571  _ASSERT(it->first == from_pfx);
1572  _ASSERT((it->second.second & ~CSeq_id::fAcc_fallback)
1573  == (value & ~CSeq_id::fAcc_fallback));
1574  hints.prev_small_special = it;
1575  } else {
1576  auto size = kBVSizes[min(fmt & 0xffff, kMaxSmallSpecialDigits)];
1577  hints.prev_small_special =
1578  submap.small_specials.emplace(
1579  from_pfx, make_pair(bm::bvector<>(size), value));
1580  }
1581  hints.prev_small_special->second.first.set_range(left, right);
1582  // Account for possible refinement.
1583  hints.prev_small_special->second.second = value;
1584  if ((value & CSeq_id::fAcc_fallback) != 0) {
1585  value_ptr = &hints.prev_small_special->second.second;
1586  }
1587  }
1588  if (value_ptr != nullptr) {
1589  _ASSERT(old_name != nullptr && !old_name->empty());
1590  fallbacks[value_ptr] = make_pair(*old_name, new_name);
1591  } else {
1592  _ASSERT(old_name == nullptr);
1593  }
1594 }
1595 
1597  TFormatCode fmt)
1598 {
1599  auto raw_digits = fmt & 0xffff, digits = raw_digits;
1600  auto normal_size = (fmt >> 16) + digits;
1601  if (digits == kMaxSmallSpecialDigits + 1) {
1602  digits -= 2;
1603  } else if (digits > kMaxSmallSpecialDigits) {
1604  digits = kMaxSmallSpecialDigits;
1605  }
1606  SIZE_TYPE pos;
1608  if (acc.size() == normal_size) {
1609  pos = acc.size() - digits;
1610  NStr::StringToNumeric(acc.substr(pos), &result);
1611  } else {
1612  _ASSERT(acc.size() == normal_size + 1);
1613  _ASSERT(digits >= 3);
1614  pos = (fmt >> 16) + 2;
1615  _ASSERT(isalpha(static_cast<unsigned char>(acc[pos])));
1616  NStr::StringToNumeric(acc.substr(pos + 1), &result);
1617  if (digits == raw_digits) {
1618  pos -= 2;
1619  result += (NStr::StringToNumeric<Uint1>(acc.substr(pos, 2))
1620  * kBVSizes[digits - 2]);
1621  }
1622  }
1623  acc.erase(pos);
1624  return result;
1625 }
1626 
1628 {
1629  return new CRef<SAccGuide>(new SAccGuide);
1630 }
1631 
1633 
1636 {
1637  SIZE_TYPE main_size = acc.find('.');
1638  bool has_version = true;
1639  if (main_size == NPOS) {
1640  has_version = false;
1641  main_size = acc.size();
1642  } else if (main_size >= acc.size() - 1
1643  || acc.find_first_not_of(kDigits, main_size + 1) != NPOS) {
1644  return eAcc_unknown; // non-numeric "version"
1645  }
1646 
1647  static const SIZE_TYPE kMainAccBufSize = 32;
1648  if (main_size <= kMainAccBufSize) {
1649  const unsigned char* ucdata = (const unsigned char*)acc.data();
1650  char main_acc_buf[kMainAccBufSize];
1651  for (SIZE_TYPE i = 0; i < main_size; ++i) {
1652  main_acc_buf[i] = toupper(ucdata[i]);
1653  }
1654  CTempString main_acc(main_acc_buf, main_size);
1655  return x_IdentifyAccession(main_acc, flags, has_version);
1656  } else {
1657  // Unlikely to prove recognizable (far too long for any standard
1658  // format as of January 2016), but try anyway.
1659  string main_acc(acc, 0, main_size);
1660  NStr::ToUpper(main_acc);
1661  return x_IdentifyAccession(main_acc, flags, has_version);
1662  }
1663 }
1664 
1667  bool has_version)
1668 {
1669  SIZE_TYPE digit_pos = main_acc.find_first_of(kDigits),
1670  main_size = main_acc.size();
1671  char flag_char = '\0';
1672  if (digit_pos == NPOS) {
1673  return eAcc_unknown;
1674  } else {
1675  SIZE_TYPE non_dig_pos = main_acc.find_first_not_of(kDigits, digit_pos);
1676  const unsigned char* ucdata = (const unsigned char*)main_acc.data();
1677  if (non_dig_pos != NPOS && (flags & fParse_RawText) != 0) {
1678  if ( !has_version && digit_pos == 0 && main_size >= 4
1679  && non_dig_pos < 5 && isalnum(ucdata[1])
1680  && isalnum(ucdata[2]) && isalnum(ucdata[3])) {
1681  // Possible PDB (always unversioned); examine further
1682  // to avoid false positives.
1683  if (main_size > 4 && main_size <= 17
1684  && strchr("|-_", main_acc[4])
1685  && (main_size <= 6 || isalnum(ucdata[5]))) {
1686  // Conventionally delimited
1687  return eAcc_pdb;
1688  } else switch (main_size) {
1689  /*
1690  case 7:
1691  if ((main_acc[5] != main_acc[6]
1692  && (main_acc[5] != 'V' || main_acc[6] != 'B'))
1693  || !isalpha(ucdata[5])) {
1694  break;
1695  } // else fall through
1696  case 6:
1697  // Be extra strict when the potential molecule ID
1698  // could simply be a year. (NB: *insisting* on a
1699  // non-digit would rule out 1914|A, gi 157829621.)
1700  if ((non_dig_pos < 4 && ispunct(ucdata[4]))) {
1701  return eAcc_pdb;
1702  }
1703  break;
1704  case 5:
1705  if ((flags & fParse_ValidLocal) == 0) {
1706  break;
1707  } // else fall through
1708  */
1709  case 4:
1710  return eAcc_pdb;
1711  }
1712  }
1713  if (digit_pos == 1 && main_size == 6
1714  && (main_acc[0] == 'O' || main_acc[0] == 'P'
1715  || main_acc[0] == 'Q' || isalpha(ucdata[2]))
1716  && isdigit(ucdata[1]) && isalnum(ucdata[2])
1717  && isalnum(ucdata[3]) && isalnum(ucdata[4])
1718  && isdigit(ucdata[5])) {
1719  return eAcc_swissprot;
1720  } else if (digit_pos == 1 && main_size == 10
1721  && main_acc[0] != 'O' && main_acc[0] != 'P'
1722  && main_acc[0] != 'Q'
1723  && isalpha(ucdata[2]) && isalnum(ucdata[3])
1724  && isalnum(ucdata[4]) && isdigit(ucdata[5])
1725  && isalpha(ucdata[6]) && isalnum(ucdata[7])
1726  && isalnum(ucdata[8]) && isdigit(ucdata[9])) {
1727  return eAcc_swissprot;
1728  } else if ( !has_version && digit_pos == 0
1729  && (non_dig_pos == 6 || non_dig_pos == 7)
1730  && (main_size == non_dig_pos + 1
1731  || main_acc[non_dig_pos + 1] == ':'
1732  || (isalpha(ucdata[non_dig_pos + 1])
1733  && (main_size == non_dig_pos + 2
1734  || main_acc[non_dig_pos + 2] == ':')))) {
1735  // A formal spec appears to be elusive, but all examples in ID
1736  // contain six or seven digits followed by one or two letters,
1737  // followed in some rare cases by a tag such as :PDB=...
1738  return eAcc_prf;
1739  } else if (digit_pos >= 4 && non_dig_pos == digit_pos + 2
1740  && main_size - non_dig_pos >= 6 && main_acc[3] != '_'
1741  && (main_acc[non_dig_pos] == 'S'
1742  || main_acc[non_dig_pos] == 'P')
1743  && (main_acc.find_first_not_of
1744  (kDigits, non_dig_pos + 1) == NPOS)) {
1745  flag_char = main_acc[non_dig_pos];
1746  } else {
1747  return eAcc_unknown;
1748  }
1749  }
1750  }
1751 
1752  if (digit_pos == 0) {
1753  if ((flags & fParse_RawGI) != 0 && !has_version
1754  && main_acc[0] != '0'
1755  && main_acc.find_first_not_of(kDigits) == NPOS) {
1756  return eAcc_gi; // just digits
1757  } else {
1758  return eAcc_unknown; // PDB already handled
1759  }
1760  } else if ((flags & fParse_RawText) == 0) {
1761  return eAcc_unknown;
1762  }
1763 
1764  SIZE_TYPE flag_len = (flag_char == '\0') ? 0 : 1;
1765  SIZE_TYPE digit_count = main_size - digit_pos - flag_len;
1766  auto& guide = *s_Guide;
1767  const EAccessionInfo& found_ai
1768  = guide->Find(SAccGuide::s_Key(digit_pos, digit_count), main_acc);
1769  EAccessionInfo ai = found_ai;
1770  if ((ai & fAcc_specials) != 0) {
1771  ai = EAccessionInfo(ai & ~fAcc_specials);
1772  }
1773  if ((ai & fAcc_fallback) != 0) {
1774  ai = EAccessionInfo(ai & ~fAcc_fallback);
1775  static bool s_ReportedFallback;
1776  if ((flags & fParse_FallbackOK) == 0 && !s_ReportedFallback ) {
1777  // TODO - arrange to skip when only interested in the overall type
1778  s_ReportedFallback = true;
1779  auto it = guide->fallbacks.find(&found_ai);
1780  if (it != guide->fallbacks.end()) {
1781  ERR_POST_X(14, Warning << "CSeq_id::IdentifyAccession:"
1782  " Returning fallback type "
1783  << it->second.first << " for accession "
1784  << main_acc << ". (Preferred type "
1785  << it->second.second << " unrecognized.)");
1786  } else {
1787  ERR_POST_X(15, Warning << "CSeq_id::IdentifyAccession:"
1788  " Returning fallback type 0x"
1789  << NStr::UIntToString(ai, 0, 16)
1790  << " for accession " << main_acc
1791  << ". (Internal error looking up names of"
1792  " fallback and preferred types.)");
1793  }
1794  }
1795  }
1796  if (flag_char == 'P') {
1797  switch (ai & eAcc_division_mask) {
1798  case eAcc_targeted:
1799  case eAcc_wgs:
1800  // case eAcc_wgs_intermed:
1802  break;
1803  case eAcc_tsa:
1805  break;
1806  default:
1807  ERR_POST_X(11,
1808  Warning << main_acc
1809  << ": Protein flag found with unexpected division "
1810  << ((ai & eAcc_division_mask) >> 8));
1812  | fAcc_prot);
1813  break;
1814  }
1815  }
1816  switch (ai & eAcc_division_mask) {
1817  case eAcc_targeted:
1818  case eAcc_tsa:
1819  case eAcc_wgs:
1820  case eAcc_wgs_intermed:
1821  if (digit_pos >= 4
1822  && (main_acc.find_first_not_of
1823  ("0", digit_pos /* + flag_len */ + 2) == NPOS)) {
1824  return EAccessionInfo(ai | fAcc_master);
1825  }
1826  default:
1827  break;
1828  }
1829  return ai;
1830 }
1831 
1832 
1834 {
1835  E_Choice type = Which();
1836  switch (type) {
1837  case e_Pir: case e_Swissprot: case e_Prf: // but *NOT* e_Pdb
1838  // always just protein
1840 
1841  case e_Genbank: case e_Embl: case e_Ddbj:
1842  case e_Tpg: case e_Tpe: case e_Tpd:
1843  case e_Other: case e_Gpipe: case e_Named_annot_track:
1844  {
1845  const CTextseq_id* tsid = GetTextseq_Id();
1846  if (tsid->IsSetAccession()) {
1847  // Can't necessarily go straight to x_IdentifyAccession, as
1848  // the accession may contain lowercase letters.
1850  E_Choice type2 = GetAccType(ai);
1851  auto div2 = ai & eAcc_division_mask;
1852  if (type2 == e_not_set) {
1853  // We *know* what the type should be....
1854  return (EAccessionInfo)((ai & eAcc_flag_mask) | type);
1855  } else if (type2 == type) {
1856  return ai;
1857  } else if (type == e_Tpe && type2 == e_Embl
1858  && (div2 == eAcc_other || div2 == eAcc_wgs)) {
1859  return (EAccessionInfo)((ai & ~eAcc_type_mask) | type);
1860  } else { // misidentified or mislabeled; assume the former
1861  return static_cast<EAccessionInfo>(type);
1862  }
1863  } else {
1864  return static_cast<EAccessionInfo>(type);
1865  }
1866  }
1867 
1868  case e_General:
1869  {
1870  string db = GetGeneral().GetDb();
1871  NStr::ToUpper(db);
1872  SAccGuide::TPrefixes::const_iterator it = (*s_Guide)->general.find(db);
1873  return it == (*s_Guide)->general.end() ? eAcc_general : it->second;
1874  }
1875 
1876  default:
1877  return static_cast<EAccessionInfo>(type);
1878  }
1879 }
1880 
1881 
1882 void CSeq_id::LoadAccessionGuide(const string& filename)
1883 {
1884  s_Guide->Reset(new SAccGuide(filename));
1885 }
1886 
1888 {
1889  s_Guide->Reset(new SAccGuide(in));
1890 }
1891 
1892 
1893 static inline
1894 void x_GetLabel_Type(const CSeq_id& id, string* label,
1896 {
1897  unsigned choice = id.Which();
1898  _ASSERT(choice < CSeq_id::e_MaxChoice);
1899  if (choice >= CSeq_id::e_MaxChoice) {
1900  return;
1901  }
1902 
1903  switch (choice) {
1904  case CSeq_id::e_General:
1905  // we may encode 'gnl' or the database name as requested
1907  *label += id.GetGeneral().GetDb();
1908  } else {
1909  *label += "gnl";
1910  }
1911  break;
1912 
1913  case CSeq_id::e_Patent:
1914  *label += "pat";
1915  break;
1916 
1917  default:
1918  *label += s_TextId[choice];
1919  break;
1920  }
1921 
1922  // no extra flag interpretation currently
1923 }
1924 
1925 
1926 static inline
1927 void x_GetLabel_Content(const CSeq_id& id, string* label,
1929 {
1930  const CTextseq_id* tsid = id.GetTextseq_Id();
1931 
1932  if (version != NULL) {
1933  *version = 0;
1934  }
1935 
1936  //text id
1937  if (tsid) {
1938  string str;
1939  if (tsid->IsSetAccession()) {
1940  str = tsid->GetAccession();
1941  NStr::ToUpper(str);
1942  } else if (tsid->IsSetName()) {
1943  str = tsid->GetName();
1944  }
1945 
1946  if (version != NULL && tsid->IsSetVersion()) {
1947  *version = tsid->GetVersion();
1948  }
1949  if ( !str.empty() ) {
1950  if ( (flags & CSeq_id::fLabel_Version) && tsid->IsSetVersion()) {
1951  str += "." + NStr::IntToString(tsid->GetVersion());
1952  }
1953  }
1954  *label += str;
1955 
1956  } else { //non-text id
1957  switch (id.Which()) {
1958  case CSeq_id::e_not_set:
1959  break;
1960 
1961  case CSeq_id::e_Local:
1962  {{
1963  const CObject_id& oid = id.GetLocal();
1964  if (oid.IsId()) {
1965  *label += NStr::IntToString(oid.GetId());
1966  } else if (oid.IsStr()) {
1967  *label += oid.GetStr();
1968  }
1969  }}
1970  break;
1971 
1972  case CSeq_id::e_Gibbsq:
1973  *label += NStr::IntToString(id.GetGibbsq());
1974  break;
1975 
1976  case CSeq_id::e_Gibbmt:
1977  *label += NStr::IntToString(id.GetGibbmt());
1978  break;
1979 
1980  case CSeq_id::e_Giim:
1981  *label += NStr::IntToString(id.GetGiim().GetId());
1982  break;
1983 
1984  case CSeq_id::e_General:
1985  {{
1986  const CDbtag& dbt = id.GetGeneral();
1988  *label += dbt.GetDb() + ':';
1989  }
1990  if (dbt.GetTag().IsId()) {
1991  *label += NStr::IntToString(dbt.GetTag().GetId());
1992  } else if (dbt.GetTag().IsStr()) {
1993  *label += dbt.GetTag().GetStr();
1994  }
1995  }}
1996  break;
1997 
1998  case CSeq_id::e_Patent:
1999  {{
2000  const CId_pat& idp = id.GetPatent().GetCit();
2001  *label += idp.GetCountry();
2002  // *label += '|';
2003  *label += (idp.GetId().IsNumber() ?
2004  idp.GetId().GetNumber() :
2005  idp.GetId().GetApp_number());
2006  *label += '_'; // |
2007  *label += NStr::IntToString(id.GetPatent().GetSeqid());
2008  }}
2009  break;
2010 
2011  case CSeq_id::e_Gi:
2012  *label += NStr::NumericToString(id.GetGi());
2013  break;
2014 
2015  case CSeq_id::e_Pdb:
2016  {{
2017  const CPDB_seq_id& pid = id.GetPdb();
2018  *label += pid.GetMol().Get();
2019  if (pid.IsSetChain_id()) {
2020  *label += '_';
2021  *label += pid.GetChain_id();
2022  } else if (pid.IsSetChain()) {
2023  unsigned char chain = static_cast<unsigned char>(pid.GetChain());
2024  if (chain > ' ') {
2025  *label += '_';
2026  // previously if: (islower(chain)) then doubled the upper-case version with: *label += string(SIZE_TYPE(2), static_cast<char>(toupper(chain)));
2027  *label += static_cast<char>(chain);
2028  }
2029  }
2030  }}
2031  break;
2032 
2033  default:
2034  break;
2035  }
2036  }
2037 }
2038 
2039 
// Appends a textual label for this Seq-id to *label; a null `label` is a no-op.
// `type` selects the form: full FASTA string, FASTA content only, type tag,
// content, or tag + '|' + content.  `flags` adjusts the tag/content output and
// may request trimming of trailing '|' characters for the two FASTA forms.
// NOTE(review): the signature line (listing line 2040) is absent from this
// extract; the parameter names above are taken from their uses in the body.
2041 {
2042  if ( !label ) {
2043  return;
2044  }
2045 
2046  switch (type) {
2047  case eFasta:
2048  *label += AsFastaString();
2049  break;
2050 
2051  case eFastaContent:
2052  {
2053  CNcbiOstrstream oss;
2054  x_WriteContentAsFasta(oss);
2055  *label += CNcbiOstrstreamToString(oss);
2056  break;
2057  }
2058 
2059  case eBoth:
2060  x_GetLabel_Type(*this, label, flags);
2061  *label += "|";
2062  if (flags & fLabel_UpperCase) {
2063  NStr::ToUpper(*label);
2064  // ID-5290 : This function may be called for primary or secondary
2065  // Seq-ids (e.g. gis), so need to check both primary and secondary id
2066  // values returned from the ComposeOSLT function. In the latter case,
2067  // always look at the first secondary ID in the list (there's almost
2068  // always just one anyway).
2069  // CXX-10440 : Original default version of ComposeOSLT function returns
2070  // empty string for local ids, but in this context local Seq-ids must
2071  // be parsed, hence use a special flag.
2072  string primary_id;
2073  list<string> secondary_id_list;
2074  primary_id = ComposeOSLT(&secondary_id_list, fAllowLocalId);
2075  if (!primary_id.empty())
2076  *label += primary_id;
2077  else if (secondary_id_list.size() > 0)
2078  *label += *secondary_id_list.begin();
2079  if (flags & fLabel_Version) {
2080  const CTextseq_id* tsid = GetTextseq_Id();
2081  if (tsid && tsid->IsSetVersion())
2082  *label += "." + NStr::IntToString(tsid->GetVersion());
2083  }
2084  } else {
2085  x_GetLabel_Content(*this, label, flags, NULL);
2086  }
2087  break;
2088 
2089  case eType:
2090  x_GetLabel_Type(*this, label, flags);
2091  break;
2092 
2093  case eContent:
2094  x_GetLabel_Content(*this, label, flags, NULL);
2095  break;
2096  }
2097 
2098  if ((flags & fLabel_Trimmed) != 0
2099  && (type == eFasta || type == eFastaContent)) {
// Stop once the string is empty: indexing size() - 1 on an empty string
// wraps around and reads out of range.
2100  while (!label->empty() && (*label)[label->size() - 1] == '|') {
2101  label->resize(label->size() - 1);
2102  }
2103  }
2104 }
2105 
2106 void CSeq_id::GetLabel(string* label, int* version, ELabelType type) const
2107 {
2108  if ( !label ) {
2109  return;
2110  }
2111 
2112  switch (type) {
2113  case eFasta:
2114  *label += AsFastaString();
2115  break;
2116 
2117  case eFastaContent:
2118  {
2119  CNcbiOstrstream oss;
2120  x_WriteContentAsFasta(oss);
2121  *label += CNcbiOstrstreamToString(oss);
2122  break;
2123  }
2124 
2125  case eBoth:
2126  x_GetLabel_Type(*this, label, 0);
2127  *label += "|";
2128  x_GetLabel_Content(*this, label, 0, version);
2129  break;
2130 
2131  case eType:
2132  x_GetLabel_Type(*this, label, 0);
2133  break;
2134 
2135  case eContent:
2136  x_GetLabel_Content(*this, label, 0, version);
2137  break;
2138  }
2139 }
2140 
2141 
2142 
2143 /*Return seqid string with optional version for text seqid type
2144 (default no version).*/
2145 string CSeq_id::GetSeqIdString(bool with_version) const
2146 {
2147  string label;
// Start with no label flags and add the version flag only on request.
2148  TLabelFlags flags = 0;
2149  if (with_version) {
2150  flags |= fLabel_Version;
2151  }
// NOTE(review): listing line 2152 is absent from this extract, so the
// statement that fills `label` using `flags` is not visible here.
2153  return label;
2154 }
2155 
// NOTE(review): the signature (listing line 2156) and one statement (listing
// line 2159) are absent from this extract.  What remains declares a local
// string and returns it; presumably the missing statement fills it -- confirm
// against the full file.
2157 {
2158  string label;
2160  return label;
2161 }
2162 
2163 
// Writes the FASTA type tag of this Seq-id, followed by '|', to `out`.
// Two variants get a dedicated tag: patents whose citation id is not a number
// (written as "pgp|") and Swiss-Prot ids whose release is "unreviewed"
// (written as "tr|").  All others use the s_TextId entry for the choice.
// NOTE(review): the signature (listing line 2164) and the final statement
// (listing line 2183) are absent from this extract.
2165  const
2166 {
2167  unsigned the_type = Which();
2168  if (the_type >= e_MaxChoice) // New SeqId type
2169  the_type = e_not_set;
2170 
2171  if (IsPatent() && !GetPatent().GetCit().GetId().IsNumber() ) {
// Use the "pgp" tag here: x_IdentifyTypeVariant and Set() recognise "pgp"
// when reading, and writing "pat|" would make this branch identical to the
// generic one below.
2172  const char pgp[] = "pgp|";
2173  out.write(pgp, sizeof(pgp) - 1);
2174  } else if (IsSwissprot() && GetSwissprot().IsSetRelease()
2175  && GetSwissprot().GetRelease() == "unreviewed") {
2176  const char tr[] = "tr|";
2177  out.write(tr, sizeof(tr) - 1);
2178  } else {
2179  out.write(s_TextId[the_type], strlen(s_TextId[the_type]));
2180  out.put('|');
2181  }
2182 
2184 }
2185 
// Writes the content part of this Seq-id (everything after the type tag) to
// `out`, dispatching on the selected choice.  Choices at or beyond
// e_MaxChoice are treated as e_not_set, which writes nothing.
// NOTE(review): the signature (listing line 2186) and the statements of most
// text-id cases (e.g. listing lines 2208, 2211, 2214, ...) are absent from
// this extract; those cases show only their `break`.
2187 {
2188  unsigned the_type = Which();
2189  if (the_type >= e_MaxChoice) // New SeqId type
2190  the_type = e_not_set;
2191 
2192  switch (the_type) {
2193  case e_not_set:
2194  break;
2195  case e_Local:
2196  GetLocal().AsString(out);
2197  break;
2198  case e_Gibbsq:
2199  out << GetGibbsq();
2200  break;
2201  case e_Gibbmt:
2202  out << GetGibbmt();
2203  break;
2204  case e_Giim:
2205  out << (GetGiim().GetId());
2206  break;
2207  case e_Genbank:
2209  break;
2210  case e_Embl:
2212  break;
2213  case e_Pir:
2215  break;
2216  case e_Swissprot:
2218  break;
2219  case e_Patent:
2221  break;
2222  case e_Other:
2224  break;
2225  case e_General:
2226  {
// General ids are written as "<db>|<tag>" with the db name left as stored.
2227  const CDbtag& dbt = GetGeneral();
2228  out << (dbt.GetDb()) << '|'; // no Upcase per Ostell - Karl 7/2001
2229  dbt.GetTag().AsString(out);
2230  }
2231  break;
2232  case e_Gi:
2233  out << GetGi();
2234  break;
2235  case e_Ddbj:
2237  break;
2238  case e_Prf:
2240  break;
2241  case e_Pdb:
2243  break;
2244  case e_Tpg:
2246  break;
2247  case e_Tpe:
2249  break;
2250  case e_Tpd:
2252  break;
2253  case e_Gpipe:
2254  // don't suppress version after all
2255  GetGpipe().AsFastaString(out /*, false */);
2256  break;
2257  case e_Named_annot_track:
2259  break;
2260  default:
2261  out << "[UnknownSeqIdType]";
2262  break;
2263  }
2264 }
2265 
// Returns this Seq-id formatted by WriteAsFasta as a string.
// When HAVE_THREAD_LOCAL is defined, a thread_local stream is reused across
// calls and rewound before each use.
// NOTE(review): listing line 2279 (the #else branch, which presumably declares
// a plain local `str`) is absent from this extract.
2266 const string CSeq_id::AsFastaString(void) const
2267 {
2268 #ifdef HAVE_THREAD_LOCAL
2269  thread_local static CNcbiOstrstream str;
// Rewind the reused stream so the new output overwrites the previous one.
2270  str.seekp(0);
2271 #if NCBI_SHUN_OSTRSTREAM
2272  str.str("");
2273 #endif
2274 
2275  // VS2017 needs this call presumably because the first time seekp(0) is
2276  // called on an empty stream and thus a failbit is set.
2277  str.clear();
2278 #else
2280 #endif
2281  WriteAsFasta(str);
2282  return CNcbiOstrstreamToString(str);
2283 }
2284 
2285 
2286 //
2287 // GetStringDescr()
2288 // Given a bioseq, return the best possible ID description, in a number of
2289 // appealing formats. This function can produce FastA-formatted titles or a
2290 // number of sub-titles (GI only, Best Accession with or without version).
2291 //
// Returns an empty string when the requested format cannot be produced (no GI
// for eFormat_ForceGI, or no best id was found).
// NOTE(review): the signature (listing line 2292), the end of the
// FindBestChoice call (2303) and two case labels (2319, 2329) are absent from
// this extract.
2293 {
2294  if (fmt == eFormat_FastA) {
2295  CNcbiOstrstream ostr;
2296  WriteAsFasta(ostr, bioseq);
2297  return CNcbiOstrstreamToString(ostr);
2298  }
2299 
// Anything that is not an amino-acid molecule is ranked as nucleotide.
2300  bool is_na = bioseq.GetInst().GetMol() != CSeq_inst::eMol_aa;
2301  CRef<CSeq_id> best_id = FindBestChoice(bioseq.GetId(),
2302  is_na ? CSeq_id::FastaNARank
2304  switch (fmt) {
2305  case eFormat_ForceGI:
2306  // eForceGI produces a string containing only the GI in FastA format
2307  // so we have:
2308  // gi|####
2309  ITERATE (CBioseq::TId, iter, bioseq.GetId()) {
2310  if ( (*iter)->IsGi() ) {
2311  CNcbiOstrstream out_str;
2312  (*iter)->WriteAsFasta(out_str);
2313 
2314  return CNcbiOstrstreamToString(out_str);
2315  }
2316  }
2317  break;
2318 
2320  // eBestWithVersion produces only the 'best' accession name, with
2321  // its version indicator
2322  if (best_id.NotEmpty()) {
2323  string label;
2324  best_id->GetLabel(&label, eDefault, fLabel_Version);
2325  return label;
2326  }
2327  break;
2328 
2330  // eBestWithoutVersion produces only the 'best' accession name,
2331  // without its version indicator
2332  if (best_id.NotEmpty()) {
2333  string label;
2334  best_id->GetLabel(&label, eDefault, 0);
2335  return label;
2336  }
2337  break;
2338 
2339  default:
2340  break;
2341  }
2342 
2343  // catch-all for unusual events
2344  return "";
2345 }
2346 
// Writes a FASTA-style id line for `bioseq` to `ostr` and returns `ostr`:
// the first GI found (if any), then the best-ranked id when it is not itself
// a GI, separated by '|' when both are written.
// NOTE(review): the signature (listing line 2347) and the end of the
// FindBestChoice call (2352) are absent from this extract.
2348 {
2349  bool is_na = bioseq.GetInst().GetMol() != CSeq_inst::eMol_aa;
2350  CRef<CSeq_id> best_id = FindBestChoice(bioseq.GetId(),
2351  is_na ? CSeq_id::FastaNARank
2353 
2354  // FastA format
2355  // Here we have something like:
2356  // gi|###|SOME_ACCESSION|title
2357  bool found_gi = false;
2358 
// Only the first GI in the id list is written.
2359  ITERATE (CBioseq::TId, id, bioseq.GetId()) {
2360  if ((*id)->IsGi()) {
2361  (*id)->WriteAsFasta(ostr);
2362  found_gi = true;
2363  break;
2364  }
2365  }
2366 
2367  if (best_id.NotEmpty() && !best_id->IsGi() ) {
2368  if (found_gi) {
2369  ostr << '|';
2370  }
2371 
2372  best_id->WriteAsFasta(ostr);
2373  }
2374 
2375  return ostr;
2376 }
2377 
2378 
2379 CSeq_id::CSeq_id(const CDbtag& dbtag, bool set_as_general)
2380 {
2381  Set(dbtag, set_as_general);
2382 }
2383 
// Initializes this Seq-id from a Dbtag and returns *this.
// The tag (string or integer) is first turned into an accession[.version]
// string; the Dbtag's type then selects the Seq-id choice.  Tags of a type not
// handled here are stored as a general id when `set_as_general` is true;
// otherwise a CSeqIdException (eFormat) is thrown.  A tag that is neither a
// string nor an integer also throws.
// NOTE(review): three case labels (listing lines 2407, 2411, 2423) are absent
// from this extract, so the labels guarding the SetEmbl/SetDdbj statements and
// the label sharing the `default` branch are not visible.
2384 CSeq_id& CSeq_id::Set(const CDbtag& dbtag, bool set_as_general)
2385 {
2386  int version = -1;
2387  CTempString acc;
2388  string accver;
2389 
2390  switch (dbtag.GetTag().Which()) {
2391  case CObject_id::e_Str:
2392  accver = dbtag.GetTag().GetStr();
2393  s_SplitVersion(accver, acc, version);
2394  break;
2395  case CObject_id::e_Id:
2396  acc = accver = NStr::IntToString(dbtag.GetTag().GetId());
2397  break;
2398  default:
2399  NCBI_THROW(CSeqIdException, eFormat,
2400  "Bad CDbtag tag type "
2401  + CObject_id::SelectionName(dbtag.GetTag().Which()));
2402  break;
2403  }
2404 
2405  CDbtag::EDbtagType type = dbtag.GetType();
2406  switch (type) {
2408  SetEmbl().Set(accver);
2409  break;
2410 
2412  SetDdbj().Set(accver);
2413  break;
2414 
2415  case CDbtag::eDbtagType_GI:
// A string-valued GI tag goes through the string parser; an integer one is
// converted directly.
2416  if (dbtag.GetTag().IsStr()) {
2417  Set(e_Gi, dbtag.GetTag().GetStr());
2418  } else {
2419  SetGi(GI_FROM(CObject_id::TId, dbtag.GetTag().GetId()));
2420  }
2421  break;
2422 
2424  default:
2425  // not understood as a sequence id
2426  if (set_as_general) {
2427  SetGeneral().Assign(dbtag);
2428  } else {
2429  NCBI_THROW(CSeqIdException, eFormat,
2430  "Unrecognized Dbtag DB " + dbtag.GetDb());
2431  }
2432  break;
2433  }
2434 
2435  return *this;
2436 }
2437 
// Maps a FASTA tag to a type variant: "tr" on a Swiss-Prot id gives eTV_tr,
// "pgp" on a patent id gives eTV_pgp, anything else gives eTV_plain.
// Tag comparison is case-insensitive.
// NOTE(review): the first signature line (listing line 2439) is absent from
// this extract.
2438 inline
2440  const CTempString& str)
2441 {
2442  if (type == CSeq_id::e_Swissprot && NStr::EqualNocase(str, "tr")) {
2443  return eTV_tr;
2444  } else if (type == CSeq_id::e_Patent && NStr::EqualNocase(str, "pgp")) {
2445  return eTV_pgp;
2446  } else {
2447  return eTV_plain;
2448  }
2449 }
2450 
2451 //SeqIdFastAConstructors
// Forwards `the_id` and `flags` to Set().
// NOTE(review): the signature (listing line 2452) is absent from this extract;
// presumably this is the string-parsing constructor -- confirm.
2453 {
2454  Set(the_id, flags);
2455 }
2456 
// Parses a textual id into this Seq-id and returns *this.
// Unless fParse_NoFASTA is set, the text is first checked for a FASTA tag.
// Text without a recognised type is treated as a bare accession (GI, general
// "DB:tag" with a supported DB, local id, PRF name, PDB mol+chain, or a text
// accession with optional version); otherwise it is split on '|' and handed
// to x_Init.  Malformed input raises CSeqIdException (eFormat).
// NOTE(review): the signature (listing line 2457) and several statements
// (listing lines 2460-2461, 2474-2475, 2497) are absent from this extract,
// including the declaration of `type`.
2458 {
2459  CTempString the_id = NStr::TruncateSpaces_Unsafe(the_id_in,
2462 
2463  if ((flags & fParse_NoFASTA) == 0) {
2464  type = s_CheckForFastaTag(the_id);
2465  }
2466  if (type == e_not_set) {
2467  if (the_id.empty()) {
2468  NCBI_THROW(CSeqIdException, eFormat,
2469  "Empty bare accession supplied");
2470  }
2471  // If no (attempt at a) valid tag, tries to interpret the string
2472  // as a pure accession.
2473  if ((flags & fParse_AnyRaw) != 0) {
2476  }
2477  switch (type) {
2478  case e_Gi:
2479  return Set(type, the_id);
2480  case e_not_set:
2481  {
2482  // Check for general IDs, albeit only with well-known
2483  // database names like SRA.
2484  SIZE_TYPE colon_pos = the_id.find(':');
2485  if (colon_pos != NPOS) {
// The lookup uses an upper-cased copy of the prefix.
2486  string db = the_id.substr(0, colon_pos);
2487  NStr::ToUpper(db);
2488  // const auto& whitelist = (*s_Guide)->general;
2489  const auto& whitelist = kSupportedRawDbtags;
2490  if (whitelist.find(db) != whitelist.end()) {
2491  // Reextract prefix to preserve case.
2492  return Set(e_General, the_id.substr(0, colon_pos),
2493  the_id.substr(colon_pos + 1));
2494  }
2495  }
2496  if ((flags & fParse_ValidLocal) != 0
2498  || IsValidLocalID(the_id))) {
2499  return Set(e_Local, the_id);
2500  } else {
2501  NCBI_THROW(CSeqIdException, eFormat,
2502  "Malformatted ID " + string(the_id));
2503  }
2504  }
2505  case e_Prf:
2506  // technically a name/locus, not an accession!
2507  return Set(type, kEmptyStr, the_id);
2508  case e_Pdb:
2509  {
// First four characters are the molecule; the chain is whatever follows
// a one-character delimiter, or a lone fifth character that is not '|'.
2510  string mol(the_id, 0, 4), chain;
2511  // NStr::SplitInTwo(the_id, "|", mol, chain);
2512  if (the_id.size() > 5) {
2513  chain = the_id.substr(5);
2514  } else if (the_id.size() == 5 && the_id[4] != '|') {
2515  chain = the_id[4];
2516  }
2517  return Set(type, mol, chain);
2518  }
2519  default:
2520  {
2521  CTempString acc;
2522  int ver;
2523  s_SplitVersion(the_id, acc, ver);
2524  return Set(type, acc, kEmptyStr, ver);
2525  }
2526  }
2527  } else {
2528  list<CTempString> fasta_pieces;
2529  NStr::Split(the_id, "|", fasta_pieces);
2530  ETypeVariant tv = x_IdentifyTypeVariant(type, fasta_pieces.front());
2531  fasta_pieces.pop_front();
2532  x_Init(fasta_pieces, type, tv);
2533  if ( !fasta_pieces.empty() ) {
2534  // tolerate trailing parts if they're all empty.
// NOTE(review): with fParse_PartialOK the loop does not stop at the first
// non-empty piece, so one warning is posted per leftover piece.
2535  ITERATE(list<CTempString>, it, fasta_pieces) {
2536  if ( !it->empty() ) {
2537  if ((flags & fParse_PartialOK) != 0) {
2538  ERR_POST_X(10, Warning << "Ignoring extra parts"
2539  " (synonyms?) in FASTA-style ID "
2540  << the_id);
2541  } else {
2542  NCBI_THROW(CSeqIdException, eFormat,
2543  "FASTA-style ID " + string(the_id)
2544  + " has too many parts.");
2545  }
2546  }
2547  }
2548  }
2549  return *this;
2550  }
2551 }
2552 
2553 
// Forwards `f`, `the_type` and `the_content` to the matching Set() overload.
// NOTE(review): the first signature line (listing line 2554) is absent from
// this extract.
2555  const CTempString& the_content)
2556 {
2557  Set(f, the_type, the_content);
2558 }
2559 
// Initializes this Seq-id of choice `the_type` from '|'-separated content and
// returns *this.  If the first piece is itself a tag that maps to `the_type`,
// it is consumed and used to pick the type variant; otherwise the plain
// variant is assumed.
// NOTE(review): the first signature line (listing line 2560) is absent from
// this extract.
2561  const CTempString& the_content)
2562 {
2563  list<CTempString> fasta_pieces;
2564  ETypeVariant tv = eTV_plain; // default assumption
2565  NStr::Split(the_content, "|", fasta_pieces);
2566  if ( !fasta_pieces.empty()
2567  && WhichInverseSeqId(fasta_pieces.front()) == the_type) {
2568  tv = x_IdentifyTypeVariant(the_type, fasta_pieces.front());
2569  fasta_pieces.pop_front();
2570  }
2571  x_Init(fasta_pieces, the_type, tv);
2572  return *this;
2573 }
2574 
2575 
// True when CheckLocalID(s) reports no error flags at all.
// NOTE(review): the signature (listing line 2576) is absent from this extract.
2577 {
2578  return (fNoError == CheckLocalID(s));
2579 }
2580 
2581 
// Validates a candidate local id and returns a bit set of problems found.
// A blank string returns fEmptyId immediately; otherwise the result combines
// fExceedsMaxLength (longer than kMaxLocalIDLength) and fInvalidChar (contains
// a space or any of > [ ] | "), or is fNoError.
// NOTE(review): the signature (listing lines 2582-2583) is absent from this
// extract.
2584 {
2585  if (NStr::IsBlank(s)) {
2586  return fEmptyId;
2587  }
2588 
2589  TErrorFlags error_flags = fNoError;
2590  if (s.length() > kMaxLocalIDLength) {
2591  error_flags |= fExceedsMaxLength;
2592  }
2593 
2594  static const char* kIllegal = " >[]|\"";
2595  CSeq_id_find_pred pred; pred.kSymbols = kIllegal;
2596  if (find_if(s.begin(), s.end(), pred) != s.end()) {
2597  error_flags |= fInvalidChar;
2598  }
2599  return error_flags;
2600 }
2601 
2602 
// Thin wrapper that builds parse flags and delegates to ParseIDs(ids, s, flags).
// NOTE(review): the first signature line (listing line 2603), the declaration
// of `flags` (2606) and the body of the `if` (2608) are absent from this
// extract; presumably the `if` adds a partial-failure flag -- confirm.
2604  bool allow_partial_failure)
2605 {
2607  if (allow_partial_failure) {
2609  }
2610  return ParseIDs(ids, s, flags);
2611 }
2612 
// Parses one or more '|'-separated FASTA-style ids, appends each to `ids`,
// and returns how many were added (0 for empty input).
// Input with no FASTA tag, or with a single piece, is parsed as one bare id
// with fParse_NoFASTA added.  Otherwise ids are peeled off one at a time with
// x_Init.  With fParse_PartialOK a failing id is logged as a warning and
// pieces are skipped until one of length 2 or 3 (a plausible tag); without it
// the exception propagates.
// NOTE(review): the signature (listing lines 2613-2614) and the definition of
// `ss` (2616) are absent from this extract.
2615 {
2617  if (ss.empty()) {
2618  return 0;
2619  }
2620 
2621  // first simple check to make it faster
2622  if (!s_HasFastaTag(ss)) {
2623  CRef<CSeq_id> id(new CSeq_id(ss, flags | fParse_NoFASTA));
2624  ids.push_back(id);
2625  return 1;
2626  }
2627 
2628  SIZE_TYPE count = 0;
2629  list<CTempString> fasta_pieces;
2630  NStr::Split(ss, "|", fasta_pieces);
2631  _ASSERT(fasta_pieces.size() > 0);
2632  if (fasta_pieces.size() == 1)
2633  {
2634  CRef<CSeq_id> id(new CSeq_id(ss, flags | fParse_NoFASTA));
2635  ids.push_back(id);
2636  count = 1;
2637  }
2638  else
2639  {
2640  E_Choice type = WhichInverseSeqId(fasta_pieces.front());
2641  ETypeVariant tv;
2642  if (type == e_not_set) {
2643  if (fasta_pieces.size() == 2) {
2644  // unknown database are reported as 'general'
2645  type = e_General;
2646  }
2647  tv = eTV_plain;
2648  } else {
2649  tv = x_IdentifyTypeVariant(type, fasta_pieces.front());
2650  fasta_pieces.pop_front();
2651  }
2652  while ( !fasta_pieces.empty() ) {
2653  try {
2654  CRef<CSeq_id> id(new CSeq_id);
// x_Init consumes this id's fields and returns the type of the next id when
// it recognised the following piece as a tag, or e_not_set otherwise.
2655  if (type != e_not_set) {
2656  type = id->x_Init(fasta_pieces, type, tv);
2657  }
2658  if (type == e_not_set && !fasta_pieces.empty() ) {
2659  type = WhichInverseSeqId(fasta_pieces.front());
2660  if (type == e_not_set) {
2661  CTempString typestr = fasta_pieces.front();
2662  fasta_pieces.pop_front();
2663  NCBI_THROW(CSeqIdException, eFormat,
2664  "Unsupported ID type " + typestr);
2665  }
2666  }
// Consume the tag of the next id before looping.
2667  if (type != e_not_set) {
2668  _ASSERT( !fasta_pieces.empty() );
2669  tv = x_IdentifyTypeVariant(type, fasta_pieces.front());
2670  fasta_pieces.pop_front();
2671  }
2672  ids.push_back(id);
2673  ++count;
2674  } catch (std::exception& e) {
// Nothing left to resynchronise on: rethrow even with fParse_PartialOK.
// This also guarantees front() is valid in the do-loop below.
2675  if (fasta_pieces.empty()) {
2676  throw;
2677  }
2678  if ((flags & fParse_PartialOK) != 0) {
2679  ERR_POST_X(7, Warning << e.what());
2680  do {
2681  auto l = fasta_pieces.front().size();
2682  if (l != 2 && l != 3) {
2683  fasta_pieces.pop_front();
2684  } else {
2685  break;
2686  }
2687  } while ( !fasta_pieces.empty() );
2688  } else {
2689  throw;
2690  }
2691  }
2692  }
2693  }
2694  return count;
2695 }
2696 
2697 
2698 CSeq_id::E_Choice CSeq_id::x_Init(list<CTempString>& fasta_pieces,
2700 {
2701  _ASSERT(!fasta_pieces.empty());
2702  _ASSERT(type != e_not_set);
2703 
2704  vector<CTempString> fields(3);
2705  SIZE_TYPE min_fields, max_fields;
2706  E_Choice next_type = e_not_set;
2707  switch (type) {
2708  case e_Local:
2709  case e_Gibbsq:
2710  case e_Gibbmt:
2711  case e_Giim:
2712  case e_Gi:
2713  min_fields = max_fields = 1;
2714  break;
2715  case e_Patent:
2716  min_fields = max_fields = 3;
2717  break;
2718  case e_General:
2719  min_fields = max_fields = 2;
2720  break;
2721 #if 0 // release no longer used
2722  case e_Other:
2723  min_fields = 1;
2724  max_fields = 3;
2725  break;
2726 #endif
2727  default: // text seqid: accession and optional name
2728  min_fields = 1;
2729  max_fields = 2;
2730  break;
2731  }
2732 
2733  for (SIZE_TYPE i = 0; i < max_fields; ++i) {
2734  if (fasta_pieces.empty()) {
2735  if (i >= min_fields) {
2736  break;
2737  } else {
2738  NCBI_THROW(CSeqIdException, eFormat,
2739  "Not enough fields for ID of type "
2740  + string(s_TextId[type]));
2741  }
2742  } else {
2743  if (i >= min_fields && fasta_pieces.size() > 1
2744  && (fasta_pieces.front().size() == 2
2745  || fasta_pieces.front().size() == 3)
2746  && ((next_type = WhichInverseSeqId(fasta_pieces.front()))
2747  != e_not_set)) {
2748  // Likely mid-string optional-field omission; look ahead
2749  // more to see whether the piece works better as a field
2750  // or a tag.
2751  list<CTempString>::iterator it = fasta_pieces.begin();
2752  ++it;
2753  _ASSERT(it != fasta_pieces.end());
2754  E_Choice next_type_2;
2755  if ((it->size() == 2 || it->size() == 3)
2756  && (next_type_2 = WhichInverseSeqId(*it)) != e_not_set) {
2757  next_type = next_type_2;
2758  } else if (it->find_first_not_of(" \t\n") == NPOS
2759  && ++it == fasta_pieces.end()) {
2760  next_type = e_not_set;
2761  } else {
2762  break;
2763  }
2764  }
2765  fields[i] = fasta_pieces.front();
2766  fasta_pieces.pop_front();
2767  }
2768  }
2769 
2770  // Special case -- dbSNP IDs have historically contained internal
2771  // vertical bars, so we have to parse them greedily.
2772  string snp_name; // must survive until the end of the function
2773  if (type == e_General && NStr::EqualNocase(fields[0], "dbSNP")
2774  && !fasta_pieces.empty() ) {
2775  snp_name = string(fields[1]) + '|' + NStr::Join(fasta_pieces, "|");
2776  fields[1] = snp_name;
2777  fasta_pieces.clear();
2778  }
2779 
2780  // Clear out extra empty pieces
2781  while ( !fasta_pieces.empty() && fasta_pieces.front().empty() ) {
2782  fasta_pieces.pop_front();
2783  }
2784 
2785  int ver = 0;
2786  switch (type) {
2787  case e_Swissprot:
2788  if (tv == eTV_tr) {
2789  fields[2] = "unreviewed";
2790  } else {
2791  fields[2] = "reviewed";
2792  }
2793  break;
2794 
2795  case e_Patent:
2796  // "version" actually sequence number within patent, but whatever...
2797  ver = NStr::StringToNonNegativeInt(fields[2]);
2798  if (ver < 0) {
2799  NCBI_THROW(CSeqIdException, eFormat,
2800  "Bad sequence number " + string(fields[2]) + " for "
2801  + string(fields[0]) + " patent " + string(fields[1]));
2802  }
2803  // to distinguish applications from granted patents; the numeric
2804  // content has already made its way into ver.
2805  fields[2] = "pat";
2806  break;
2807 
2808  case e_Pdb:
2809  if (fields[0].size() < 4
2810  || (fields[0].size() > 5
2811  && ( !fields[1].empty()
2812  || strchr("|-_", fields[0][4]) == NULL))) {
2813  NCBI_THROW(CSeqIdException, eFormat,
2814  "Malformatted PDB ID " + string(fields[0]));
2815  }
2816  if (fields[0].size() > 4 && fields[1].empty()) { // misdelimited
2817  if (fields[0].size() > 5) {
2818  fields[1] = fields[0].substr(5);
2819  } else {
2820  _ASSERT(fields[0][4] != '|');
2821  fields[1] = fields[0].substr(4);
2822  }
2823  fields[0] = fields[0].substr(0, 4);
2824  }
2825  break;
2826 
2827  default:
2828  break; // avoid compiler warnings
2829  }
2830 
2831  Set(type, fields[0] /* acc */, fields[1] /* name */, ver,
2832  fields[2] /* rel */);
2833 
2834  return next_type;
2835 }
2836 
2837 
// Forwards `the_type` and `the_id` to Set().
// NOTE(review): the signature (listing line 2838) is absent from this extract.
2839 {
2840  Set(the_type, the_id);
2841 }
2842 
// Compiled only when NCBI_STRICT_GI is defined: converts `gi` to TIntId and
// forwards it with `the_type` to Set().
// NOTE(review): the signature (listing line 2844) is absent from this extract.
2843 #ifdef NCBI_STRICT_GI
2845 {
2846  Set(the_type, GI_TO(TIntId, gi));
2847 }
2848 #endif
2849 
// Initializes this Seq-id from a numeric id of the given choice and returns
// *this.  Throws CSeqIdException (eFormat) for a negative id, for a zero
// local id, or for a choice that does not take a numeric id.
// NOTE(review): the signature (listing line 2850) and the statements of the
// e_Local, e_Gibbsq and e_Gibbmt cases (2860, 2863, 2866) are absent from
// this extract.
2851 {
2852 // see CSeq_id::Set below, it prohibits lcl|0, but allows gi|0
2853  if ((the_id < 0) || (the_type == e_Local && the_id == 0)) {
2854  NCBI_THROW(CSeqIdException, eFormat,
2855  "Non-positive numeric ID " + NStr::NumericToString(the_id));
2856  }
2857 
2858  switch (the_type) {
2859  case e_Local:
2861  break;
2862  case e_Gibbsq:
2864  break;
2865  case e_Gibbmt:
2867  break;
2868  case e_Giim:
2869  {
2870  CGiimport_id& giim = SetGiim();
2871  giim.SetId(INT_ID_TO(CGiimport_id::TId, the_id));
2872  giim.ResetDb();
2873  giim.ResetRelease();
2874  break;
2875  }
2876  case e_Gi:
2877  SetGi(GI_FROM(TIntId, the_id));
2878  break;
2879  default:
// Trailing space keeps the appended choice name from running into "type".
2880  NCBI_THROW(CSeqIdException, eFormat,
2881  "Invalid numeric ID type " + SelectionName(the_type));
2882  }
2883  return *this;
2884 }
2885 
2886 
// Forwards all five arguments unchanged to the matching Set() overload.
// NOTE(review): the first signature line (listing line 2887) is absent from
// this extract.
2888  const CTempString& acc_in,
2889  const CTempString& name_in,
2890  int version,
2891  const CTempString& release_in)
2892 {
2893  Set(the_type, acc_in, name_in, version, release_in);
2894 }
2895 
2896 // Karl Sirotkin 7/2001
2897 
// Initializes this Seq-id of choice `the_type` from its textual components
// and returns *this.  Numeric choices parse the accession as a number; text
// choices are filled through CTextseq_id::Set at the end; patent, general and
// PDB ids have dedicated handling.  Throws CSeqIdException (eFormat) for a
// non-numeric value of a numeric choice or an unsupported choice.
// NOTE(review): the first signature line (listing line 2898) and the
// definitions of the trimmed locals `acc`, `name` and `release` (listing
// lines 2904-2905, 2965, 2967, 2985-2986, 2995-2996) are absent from this
// extract.
2899  const CTempString& acc_in,
2900  const CTempString& name_in,
2901  int version,
2902  const CTempString& release_in)
2903 {
2906 
2907  int the_id;
2908  CTextseq_id* tsid = 0;
2909  bool allow_dot = true;
2910 
2911  switch (the_type) {
2912  case e_not_set: // Will cause unspecified SeqId to be returned.
2913  break;
2914 
2915  case e_Local:
2916  SetLocal().SetStrOrId(acc);
2917  break;
2918 
2919  // numeric IDs
2920  case e_Gibbsq:
2921  case e_Gibbmt:
2922  case e_Giim:
2923  case e_Gi:
2924 #ifdef NCBI_INT8_GI
// Try the wide GI parse first; a failure or a non-positive value falls
// through to the int-based path below.
2925  if ( the_type == e_Gi ) {
2926  try {
2927  TGi gi = NStr::StringToNumeric<TGi>(acc);
2928  if ( gi > ZERO_GI ) {
2929  SetGi(gi);
2930  return *this;
2931  }
2932  }
2933  catch ( CException& /*ignored*/ ) {
2934  // will be processed by the code below
2935  }
2936  }
2937 #endif
2938  if ( (the_id = NStr::StringToNonNegativeInt (acc)) >= 0 ) {
2939  return Set(the_type, the_id);
2940  } else {
2941  NCBI_THROW(CSeqIdException, eFormat,
2942  "Negative, excessively large, or non-numeric "
2943  + SelectionName(the_type)
2944  + " ID " + string(acc));
2945  }
2946  break;
2947 
2948  // text IDs
// PIR and PRF pass allow_dot = false to CTextseq_id::Set below.
2949  case e_Genbank: tsid = &SetGenbank(); break;
2950  case e_Embl: tsid = &SetEmbl(); break;
2951  case e_Pir: tsid = &SetPir(); allow_dot = false; break;
2952  case e_Swissprot: tsid = &SetSwissprot(); break;
2953  case e_Other: tsid = &SetOther(); break;
2954  case e_Ddbj: tsid = &SetDdbj(); break;
2955  case e_Prf: tsid = &SetPrf(); allow_dot = false; break;
2956  case e_Tpg: tsid = &SetTpg(); break;
2957  case e_Tpe: tsid = &SetTpe(); break;
2958  case e_Tpd: tsid = &SetTpd(); break;
2959  case e_Gpipe: tsid = &SetGpipe(); break;
2960  case e_Named_annot_track: tsid = &SetNamed_annot_track(); break;
2961 
2962  case e_Patent:
2963  {
2964  CTempString name =
2966  CTempString release =
2968  CPatent_seq_id& pat = SetPatent();
2969  CId_pat& id_pat = pat.SetCit();
2970  CId_pat::C_Id& id_pat_id = id_pat.SetId();
2971  id_pat.SetCountry(acc);
2972 
// Release "pgp" marks an application number; anything else is stored as a
// patent number.  `version` carries the sequence id within the patent.
2973  if (NStr::EqualNocase(release, "pgp")) {
2974  id_pat_id.SetApp_number(name);
2975  } else {
2976  id_pat_id.SetNumber(name);
2977  }
2978  id_pat.ResetDoc_type();
2979  pat.SetSeqid(version);
2980  break;
2981  }
2982 
2983  case e_General:
2984  {
2987  CDbtag& dbt = SetGeneral();
2988  dbt.SetDb(acc);
2989  dbt.SetTag().SetStrOrId(name);
2990  break;
2991  }
2992 
2993  case e_Pdb:
2994  {
2997  CPDB_seq_id& pdb = SetPdb();
2998  pdb.SetMol().Set(acc);
2999 
3000  // Consult name_in in addition to name as whitespace
3001  // stripping can lose relevant information here.
3002  if (name_in.empty()) {
3003  pdb.ResetChain();
3004  } else if (name.empty()) {
3005  pdb.SetChain(' ');
3006  name = " ";
3007  } else if (name.size() == 1) {
3008  pdb.SetChain(static_cast<unsigned char>(name[0]));
3009  } else {
3010  pdb.ResetChain();
3011  ERR_POST_X(16,
3012  Info << "Necessarily using backwards-incompatible"
3013  " representation for chain " << string(name)
3014  << " of PDB molecule " << acc << '.');
3015  }
3016  if (name.empty()) {
3017  pdb.ResetChain_id();
3018  } else {
3019  pdb.SetChain_id(name);
3020  }
3021  pdb.ResetRel();
3022  break;
3023  }
3024 
3025  default:
3026  NCBI_THROW(CSeqIdException, eFormat,
3027  "Unsupported Seq-id type " + SelectionName(the_type));
3028  }
3029 
3030  if (tsid) {
3031  // CTextseq_id::Set will take care of truncating any spaces.
3032  tsid->Set(acc, name_in, version, release_in, allow_dot);
3033  }
3034 
3035  return *this;
3036 }
3037 
3038 
3039 int CSeq_id::BaseTextScore(void) const
3040 {
3041  switch (Which()) {
3042  // Accession and accession-like ids - only one can be present in a bioseq's list of ids,
3043  // the order is not important.
3044  case e_Other:
3045  case e_Swissprot:
3046  case e_Pir:
3047  case e_Pdb:
3048  case e_Genbank:
3049  case e_Embl:
3050  case e_Ddbj:
3051  case e_Tpg:
3052  case e_Tpe:
3053  case e_Tpd: return 10;
3054 
3055  // Second group of mutually exclusive ids, any order can be used.
3056  case e_Gpipe:
3057  case e_Named_annot_track:
3058  case e_Prf: return 20;
3059  case e_Patent: return 50;
3060 
3061  // "local" < "general" < "gi"
3062  case e_Local: return 50;
3063  case e_General: return 60;
3064  case e_Gi: return PreferAccessionOverGi() ? kMaxScore + 1 : 70;
3065 
3066  // All other ids rank just above "not-set" except the obsolete ones listed below.
3067  case e_not_set: return 100;
3068 
3069  // Obsolete ids, lowest rank, any order is OK.
3070  case e_Giim:
3071  case e_Gibbmt:
3072  case e_Gibbsq: return 1000;
3073 
3074  // All other ids should go just above "not-set".
3075  default: return 90;
3076  }
3077 }
3078 
3079 
// Score table keyed on the selected choice.  GI returns kMaxScore + 1 when
// PreferAccessionOverGi() is true, 51 otherwise; choices not listed return 60.
// NOTE(review): the signature (listing line 3080) is absent from this extract,
// so the function's name and intended ranking use cannot be confirmed here.
3081 {
3082  switch (Which()) {
3083  case e_not_set: return 83;
3084  case e_General: case e_Local: return 80;
3085  case e_Gibbsq: case e_Gibbmt: case e_Giim: return 70;
3086  case e_Named_annot_track: return 69;
3087  case e_Gpipe: return 68;
3088  case e_Patent: return 67;
3089  case e_Other: return 65;
3090  case e_Gi: return PreferAccessionOverGi() ? kMaxScore + 1 : 51;
3091  default: return 60;
3092  }
3093 }
3094 
3095 
// Score table keyed on the selected choice; the in-body comments indicate it
// is meant for nucleotide sequences.  General ids from the TMSMART, BankIt or
// NCBIFILE databases score 240, other general ids 100.  GI returns
// kMaxScore + 1 when PreferAccessionOverGi() is true.
// NOTE(review): the signature (listing line 3096) is absent from this extract.
3097 {
3098  switch (Which()) {
3099  // these few are bogus, at least for nucleotide sequences
3100  case e_not_set: case e_Giim:
3101  case e_Pir: case e_Swissprot: case e_Prf: return 255;
3102  case e_Local: return 230;
3103  case e_Gi: return PreferAccessionOverGi() ? kMaxScore + 1 : 120;
3104  case e_General:
3105  {
3106  const string& db = GetGeneral().GetDb();
3107  if (db.compare("TMSMART") == 0 ||
3108  db.compare("BankIt") == 0 ||
3109  db.compare("NCBIFILE") == 0 )
3110  return 240;
3111  else
3112  return 100;
3113  }
3114  case e_Patent: return 90;
3115  case e_Pdb: return 80;
3116 // see SQD-4175 ticket for priorities
3117  case e_Gibbsq: return 72;
3118  case e_Gibbmt: return 71;
3119  case e_Genbank: return 70;
3120  case e_Other: return 15;
3121  default: /* [third party] GB/EMBL/DDBJ */ return 20;
3122  }
3123 }
3124 
3125 
// Score table keyed on the selected choice.  General ids from the TMSMART,
// BankIt or NCBIFILE databases score 240, other general ids 90.  GI returns
// kMaxScore + 1 when PreferAccessionOverGi() is true.
// NOTE(review): the signature (listing line 3126) is absent from this extract;
// presumably this is the protein counterpart of the nucleotide table -- confirm.
3127 {
3128  switch (Which()) {
3129  case e_not_set: case e_Giim: return 255;
3130  case e_Local: return 230;
3131  case e_Gi: return PreferAccessionOverGi() ? kMaxScore + 1 : 120;
3132  case e_General:
3133  {
3134  const string& db = GetGeneral().GetDb();
3135  if (db.compare("TMSMART") == 0 ||
3136  db.compare("BankIt") == 0 ||
3137  db.compare("NCBIFILE") == 0)
3138  return 240;
3139  else
3140  return 90;
3141  }
3142  case e_Patent: return 80;
3143  case e_Prf: return 70;
3144  case e_Pdb: return 50;
3145 // see SQD-4175 ticket for priorities
3146  case e_Gibbsq: return 42;
3147  case e_Gibbmt: return 41;
3148  case e_Genbank: return 40;
3149  case e_Pir: return 30;
3150  case e_Swissprot: return 20;
3151  case e_Other: return 15;
3152  default: return 60; // [third party] GB/EMBL/DDBJ
3153  }
3154 }
3155 
3156 
// Score table that gives every listed choice its own value, from 10 (e_Other)
// up to 253 (e_Gibbsq); unlisted choices return 255.  General ids from the
// TMSMART, BankIt or NCBIFILE databases score 180, other general ids 170.
// GI returns kMaxScore + 1 when PreferAccessionOverGi() is true.
// NOTE(review): the signature (listing line 3157) is absent from this extract.
3158 {
3159  switch (Which()) {
3160  case e_Other: return 10;
3161  case e_Swissprot: return 20;
3162  case e_Pir: return 30;
3163  case e_Pdb: return 40;
3164  case e_Genbank: return 50;
3165  case e_Embl: return 60;
3166  case e_Ddbj: return 70;
3167  case e_Tpg: return 80;
3168  case e_Tpe: return 90;
3169  case e_Tpd: return 100;
3170  case e_Gpipe: return 120;
3171  case e_Named_annot_track: return 130;
3172  case e_Prf: return 140;
3173  case e_Patent: return 150;
3174  case e_Gi: return PreferAccessionOverGi() ? kMaxScore + 1 : 160;
3175  case e_General:
3176  {
3177  const string& db = GetGeneral().GetDb();
3178  if (db.compare("TMSMART") == 0 ||
3179  db.compare("BankIt") == 0 ||
3180  db.compare("NCBIFILE") == 0)
3181  return 180;
3182  else
3183  return 170;
3184  }
3185  case e_Local: return 190;
3186  case e_not_set: return 250;
3187  case e_Giim: return 251;
3188  case e_Gibbmt: return 252;
3189  case e_Gibbsq: return 253;
3190  default: return 255;
3191  }
3192 }
3193 
3194 
3195 int CSeq_id::AdjustScore(int base_score, TAdjustScoreFlags flags) const
3196 {
3197  int score = base_score * 10;
3198  if ( IsGeneral() ) {
3199  const string& db = GetGeneral().GetDb();
3200  if ( db == "TRACE" ) {
3201  // prefer "ti" over "TRACE"
3202  score += 5;
3203  }
3204  }
3205  else if ( const CTextseq_id* text_id = GetTextseq_Id() ) {
3206  if ( !text_id->IsSetVersion() ) {
3207  score += 4;
3208  }
3209  if ( !text_id->IsSetAccession() ) {
3210  if ((flags & fRequireAccessions) == 0) {
3211  score += 3; // still penalize somewhat
3212  } else {
3213  score = kMax_Int;
3214  }
3215  }
3216  if ( !text_id->IsSetName() ) {
3217  score += 2;
3218  }
3219  }
3220  return score;
3221 }
3222 
// Unconditionally returns false.
// NOTE(review): the signature (listing line 3223) is absent from this extract,
// so the function's name and purpose cannot be determined here.
3224 {
3225  return false;
3226 }
3227 
// Unconditionally returns false.
// NOTE(review): the signature (listing line 3228) is absent from this extract,
// so the function's name and purpose cannot be determined here.
3229 {
3230  return false;
3231 }
3232 
3233 
// Selects the text-id variant `choice` on `match` and returns a pointer to
// it; returns 0 for any choice that is not handled below.
// NOTE(review): the signature (listing line 3234) and one case label (3259,
// presumably e_Named_annot_track given the statement that follows it) are
// absent from this extract.
3235 {
3236  switch ( choice ) {
3237  case CSeq_id::e_Genbank:
3238  return &match.SetGenbank();
3239  case CSeq_id::e_Embl:
3240  return &match.SetEmbl();
3241  case CSeq_id::e_Pir:
3242  return &match.SetPir();
3243  case CSeq_id::e_Swissprot:
3244  return &match.SetSwissprot();
3245  case CSeq_id::e_Other:
3246  return &match.SetOther();
3247  case CSeq_id::e_Ddbj:
3248  return &match.SetDdbj();
3249  case CSeq_id::e_Prf:
3250  return &match.SetPrf();
3251  case CSeq_id::e_Tpg:
3252  return &match.SetTpg();
3253  case CSeq_id::e_Tpe:
3254  return &match.SetTpe();
3255  case CSeq_id::e_Tpd:
3256  return &match.SetTpd();
3257  case CSeq_id::e_Gpipe:
3258  return &match.SetGpipe();
3260  return &match.SetNamed_annot_track();
3261  default:
3262  break;
3263  }
3264  return 0;
3265 }
3266 
3267 
// Builds reduced variants of this text Seq-id (accession only,
// accession.version, name only, name + release, ...) in a scratch `match`
// object, depending on which of accession/version/name/release are set.
// Does nothing when this id is not a text id.
// NOTE(review): the signature (listing line 3268), the definition of `ti`
// (3283), every statement that records a variant, and several `else` lines
// are absent from this extract; the remaining braces therefore do not show
// the full control flow.  Consult the complete file before changing this.
3269 {
3270  const CTextseq_id* orig = GetTextseq_Id();
3271  if ( !orig ) return;
3272 
// Snapshot which fields are set and their values (empty/0 when unset).
3273  bool A = orig->IsSetAccession();
3274  CTextseq_id::TAccession av = A ? orig->GetAccession() : kEmptyStr;
3275  bool v = orig->IsSetVersion();
3276  CTextseq_id::TVersion vv = v ? orig->GetVersion() : 0;
3277  bool N = orig->IsSetName();
3278  CTextseq_id::TName nv = N ? orig->GetName() : kEmptyStr;
3279  bool r = orig->IsSetRelease();
3280  CTextseq_id::TRelease rv = r ? orig->GetRelease() : kEmptyStr;
3281 
3282  CSeq_id match;
3284 
3285  if (A && (v || N || r)) {
3286  // Accession only
3287  ti.SetAccession(av);
3289  if (v && (N || r)) {
3290  // A.v
3291  ti.SetVersion(vv);
3293  }
3294  if ( N ) {
3295  // Name only
3296  ti.Reset();
3297  ti.SetName(nv);
3299  if (v || r) {
3300  if ( r ) {
3301  // N.r
3302  ti.SetRelease(rv);
3304  ti.ResetRelease();
3305  }
3306  // A + N
3307  ti.SetAccession(av);
3309  if (v && r) {
3310  // A.v + N
3311  ti.SetVersion(vv);
3313  // A + N.r
3314  ti.ResetVersion();
3315  ti.SetRelease(rv);
3317  }
3318  }
3319  }
3320  }
3321  else if (N && (v || r)) {
3322  // N only
3323  ti.Reset();
3324  ti.SetName(nv);
3326  if (v && r) {
3327  // N.r
3328  ti.SetRelease(rv);
3330  }
3331  }
3332 }
3333 
3334 
// Collects less specific variants of this Seq-id into `matches`, by choice:
// text ids delegate to GetMatchingTextseqIds; a PDB id with 'rel' set yields
// a copy with 'rel' cleared; general and local ids use SetAsMatchingTo on the
// tag/object id.  The remaining choices have no variants.
// NOTE(review): the signature (listing line 3335), one case label (3350) and
// the statements that store each `match` (3361, 3369, 3377) are absent from
// this extract.
3336 {
3337  switch ( Which() ) {
3338  // CTextseq_id
3339  case CSeq_id::e_Genbank:
3340  case CSeq_id::e_Embl:
3341  case CSeq_id::e_Pir:
3342  case CSeq_id::e_Swissprot:
3343  case CSeq_id::e_Other:
3344  case CSeq_id::e_Ddbj:
3345  case CSeq_id::e_Prf:
3346  case CSeq_id::e_Tpg:
3347  case CSeq_id::e_Tpe:
3348  case CSeq_id::e_Tpd:
3349  case CSeq_id::e_Gpipe:
3351  GetMatchingTextseqIds(matches);
3352  break;
3353 
3354  // CPDB_seq_id
3355  case CSeq_id::e_Pdb:
3356  // 'rel' is optional
3357  if ( GetPdb().IsSetRel() ) {
3358  CSeq_id match;
3359  match.Assign(*this);
3360  match.SetPdb().ResetRel();
3362  }
3363  break;
3364 
3365  case CSeq_id::e_General: // CDbtag
3366  if ( GetGeneral().IsSetTag() ) {
3367  CSeq_id match;
3368  if ( match.SetGeneral().SetAsMatchingTo(GetGeneral()) ) {
3370  }
3371  }
3372  break;
3373  case CSeq_id::e_Local: // CObject_id
3374  {
3375  CSeq_id match;
3376  if ( match.SetLocal().SetAsMatchingTo(GetLocal()) ) {
3378  }
3379  break;
3380  }
3381  // Other types have no matching versions.
3382  case CSeq_id::e_not_set:
3383  case CSeq_id::e_Gibbsq: // int
3384  case CSeq_id::e_Gibbmt: // int
3385  case CSeq_id::e_Giim: // CGiimport_id
3386  case CSeq_id::e_Patent: // CPatent_seq_id
3387  case CSeq_id::e_Gi: // TGi
3388  return;
3389  }
3390 }
3391 
3392 
// Boolean configuration parameter SeqId/PreferAccessionOverGi: default false,
// flag eParam_NoThread, last macro argument SEQ_ID_PREFER_ACCESSION_OVER_GI.
3393 NCBI_PARAM_DECL(bool, SeqId, PreferAccessionOverGi);
3394 NCBI_PARAM_DEF_EX(bool, SeqId, PreferAccessionOverGi, false, eParam_NoThread,
3395  SEQ_ID_PREFER_ACCESSION_OVER_GI);
3396 typedef NCBI_PARAM_TYPE(SeqId, PreferAccessionOverGi) TPreferAccessionOverGi;
3397 
// Boolean configuration parameter SeqId/AvoidGi: default false, flag
// eParam_NoThread, last macro argument SEQ_ID_AVOID_GI.
3398 NCBI_PARAM_DECL(bool, SeqId, AvoidGi);
3399 NCBI_PARAM_DEF_EX(bool, SeqId, AvoidGi, false, eParam_NoThread,
3400  SEQ_ID_AVOID_GI);
3401 typedef NCBI_PARAM_TYPE(SeqId, AvoidGi) TAvoidGi;
3402 
3403 
// Body of the static method CSeq_id::PreferAccessionOverGi(void); the
// signature line (listing line 3404) is absent from this listing.
// Returns true when the SeqId/PreferAccessionOverGi parameter default is
// true, or when AvoidGi() is true (avoiding GIs implies preferring
// accessions).
3405 {
3406  return TPreferAccessionOverGi::GetDefault() || AvoidGi();
3407 }
3408 
3409 
// Body of the static method CSeq_id::AvoidGi(void); the signature line
// (listing line 3410) is absent from this listing.
// Returns the current default of the SeqId/AvoidGi parameter.
3411 {
3412  return TAvoidGi::GetDefault();
3413 }
3414 
3415 
// Compose the OSLT string for this Seq-id.
//
// secondary_id_list  Optional output list. When non-null and a non-empty
//                    secondary id was derived, that id is upper-cased and
//                    appended. For several id types (Patent, General, Gi,
//                    Local) the secondary id is computed only when this
//                    pointer is non-null.
// parse_flags        Bit flags; fAllowLocalId enables output for local ids,
//                    fGpipeAddSecondary adds an "ACCESSION.VERSION"
//                    secondary id for GPIPE ids.
//
// Returns the primary id string, which may be empty. It is upper-cased
// unless the branch set `mixed_case` (PDB only: the mol part is upper-cased
// explicitly and the chain keeps its case).
3416  string CSeq_id::ComposeOSLT(list<string>* secondary_id_list,
3417  TComposeOSLTFlags parse_flags) const
3418 {
3419  string primary_id;
3420  string secondary_id;
3421  E_Choice seqid_type = Which();
3422  bool mixed_case = false;
3423 
3424  switch (seqid_type) {
3425  // CXX-11062 : gibbsq and gibbmt ids are sometimes primary, sometimes
3426  // secondary. Since it cannot be determined here which of the two is the case,
3427  // they are returned in both fields.
3428  // Use same logic for giim, but in fact there are no records in ID with this
3429  // Seq-id type at all.
3430  case e_Giim:
3431  primary_id = NStr::IntToString(GetGiim().GetId());
3432  secondary_id = primary_id;
3433  break;
3434  case e_Gibbsq:
3435  primary_id = NStr::IntToString(GetGibbsq());
3436  secondary_id = primary_id;
3437  break;
3438  case e_Gibbmt:
3439  primary_id = NStr::IntToString(GetGibbmt());
3440  secondary_id = primary_id;
3441  break;
3442  case e_Pir:
3443  case e_Prf:
3444  {
3445  // This is a Textseq-id, however primary id is normally stored in the
3446  // name field.
3447  // For PIR, if name is empty, id is allowed to be placed in the accession field;
3448  // For PRF only name is allowed!
3449  const CTextseq_id* tsid = GetTextseq_Id();
3450  if (tsid->CanGetName())
3451  primary_id = tsid->GetName();
3452  else if (seqid_type == e_Pir && tsid->CanGetAccession())
3453  primary_id = tsid->GetAccession();
3454  break;
3455  }
// NOTE(review): in the patent branch below, listing line 3464 (the last
// operand of the concatenation, after the second "|") is absent from this
// listing; presumably the patent sequence number - confirm against the
// repository.
3456  case e_Patent:
3457  if (secondary_id_list) {
3458  // All patents have GenBank Seq-ids, so id string derived from a patent
3459  // seqid is always secondary
3460  const CId_pat& pat = GetPatent().GetCit();
3461  secondary_id = pat.GetCountry() + "|" +
3462  (pat.GetId().IsNumber() ?
3463  pat.GetId().GetNumber() : pat.GetId().GetApp_number()) + "|" +
3465  }
3466  break;
3467  case e_Pdb:
3468  {
3469  const CPDB_seq_id& pdb = GetPdb();
3470  primary_id = pdb.GetMol().Get();
3471  // ID-5995 : Use FASTA-style "mol|chain" format as OSLT - with upper case
3472  // mol but mixed case chain values. This is how they are stored in Cassandra.
3473  NStr::ToUpper(primary_id);
// Prefer the string chain_id; otherwise fall back to the single-character
// chain unless it is a blank.
3474  if (pdb.IsSetChain_id()) {
3475  primary_id += "|" + pdb.GetChain_id();
3476  } else if (pdb.IsSetChain() && pdb.GetChain() != ' ') {
3477  primary_id += "|" + string(1, (char)pdb.GetChain());
3478  }
// Suppress the final upper-casing so that the chain keeps its case.
3479  mixed_case = true;
3480  break;
3481  }
3482  case e_General:
3483  {
3484  // General ids are always secondary!
3485  if (secondary_id_list) {
3486  const CObject_id& dbtag = GetGeneral().GetTag();
3487  string suffix =
3488  (dbtag.IsId() ? NStr::IntToString(dbtag.GetId()) : dbtag.GetStr());
3489  if (!suffix.empty())
3490  secondary_id = GetGeneral().GetDb() + "|" + suffix;
3491  }
3492  break;
3493  }
3494  case e_Gi:
3495  // GIs are always secondary
3496  if (secondary_id_list) {
3497  secondary_id = NStr::NumericToString(GetGi());
3498  }
3499  break;
3500  case CSeq_id::e_Local:
3501  {
// Local ids are reported (as secondary) only when the caller opted in
// with fAllowLocalId.
3502  if ((parse_flags & fAllowLocalId) != 0 && secondary_id_list) {
3503  const CObject_id& oid = GetLocal();
3504  if (oid.IsId()) {
3505  secondary_id = NStr::IntToString(oid.GetId());
3506  } else if (oid.IsStr()) {
3507  secondary_id = oid.GetStr();
3508  }
3509  }
3510  break;
3511  }
3512  default:
3513  {
3514  // In the logic below, any Textseq-id is treated as primary. However a
3515  // Bioseq object may contain multiple Textseq-ids in its list of Seq-ids,
3516  // e.g. when RefSeq takes over a preexisting GPIPE record.
3517  const CTextseq_id* tsid = GetTextseq_Id();
3518  if (tsid) {
3519  if (tsid->CanGetAccession())
3520  primary_id = tsid->GetAccession();
3521  if ( secondary_id_list ) {
// GPIPE with fGpipeAddSecondary: secondary id is "ACCESSION.VERSION",
// using version 1 when no version is set. Otherwise a non-empty name
// becomes the secondary id.
3522  if (seqid_type == e_Gpipe
3523  && (parse_flags & fGpipeAddSecondary) != 0
3524  && !primary_id.empty()) {
3525  if ( tsid->IsSetVersion() )
3526  secondary_id = primary_id + "." + to_string(tsid->GetVersion());
3527  else
3528  secondary_id = primary_id + ".1";
3529  }
3530  else if (tsid->CanGetName() && !tsid->GetName().empty()) {
3531  secondary_id = tsid->GetName();
3532  }
3533  }
3534  }
3535  break;
3536  }
3537  }
3538 
// Normalize case: the primary id unless flagged mixed-case, and the
// secondary id always, before it is appended to the caller's list.
3539  if (!mixed_case)
3540  NStr::ToUpper(primary_id);
3541  if (secondary_id_list && !secondary_id.empty()) {
3542  NStr::ToUpper(secondary_id);
3543  secondary_id_list->emplace_back(secondary_id);
3544  }
3545  return primary_id;
3546 }
3547 
3548 
// Body of the static method
// CSeq_id::GetSNPScaleLimit_Name(ESNPScaleLimit value); the signature line
// (listing line 3549) is absent from this listing.
// Maps an SNP scale limit enumerator to its lower-case text name; any value
// without a dedicated case (including the default limit) yields "".
3550 {
3551  switch (value) {
3552  case CSeq_id::eSNPScaleLimit_Unit: return "unit";
3553  case CSeq_id::eSNPScaleLimit_Contig: return "contig";
3554  case CSeq_id::eSNPScaleLimit_Supercontig: return "supercontig";
3555  case CSeq_id::eSNPScaleLimit_Chromosome: return "chromosome";
3556  default: return "";
3557  }
3558 }
3559 
3560 
// Body of the static method
// CSeq_id::GetSNPScaleLimit_Value(const string& name); the signature line
// (listing line 3561) is absent from this listing.
// Inverse of the name mapping: an exact (case-sensitive) match of "unit",
// "contig", "supercontig" or "chromosome" returns the corresponding
// enumerator; any other string returns eSNPScaleLimit_Default.
3562 {
3563  if (name == "unit") return eSNPScaleLimit_Unit;
3564  if (name == "contig") return eSNPScaleLimit_Contig;
3565  if (name == "supercontig") return eSNPScaleLimit_Supercontig;
3566  if (name == "chromosome") return eSNPScaleLimit_Chromosome;
3567  return eSNPScaleLimit_Default;
3568 }
3569 
3570 
// Body of CSeq_id::IsAllowedSNPScaleLimit(ESNPScaleLimit scale_limit) const;
// the signature line (listing line 3571) is absent from this listing.
//
// Returns false only when this id is a versioned Textseq-id accession that
// IdentifyAccession() classifies as type e_Other and whose division demands
// a minimum limit greater than `scale_limit`. Every other case returns true:
// default limit, GI ids, ids without both accession and version, and
// accessions of any other type.
3572 {
3573  if (scale_limit == eSNPScaleLimit_Default || IsGi()) return true;
3574  auto text_id = GetTextseq_Id();
3575  if (!text_id || !text_id->IsSetAccession() || !text_id->IsSetVersion()) return true;
3576  EAccessionInfo acc_info = IdentifyAccession();
3577  if (GetAccType(acc_info) == e_Other) {
// Minimum limit required for this accession's division; divisions not
// listed below keep the least restrictive value (unit).
3578  ESNPScaleLimit min_limit = eSNPScaleLimit_Unit;
3579  switch (acc_info & eAcc_division_mask) {
3580  case eAcc_chromosome: // AC_ / NC_
3581  min_limit = eSNPScaleLimit_Chromosome;
3582  break;
3583  case eAcc_wgs_intermed: // NW_
3584  min_limit = eSNPScaleLimit_Supercontig;
3585  break;
3586  case eAcc_con: // NT_ / NZ_?(?)
3587  min_limit = eSNPScaleLimit_Contig;
3588  break;
3589  default:
3590  break;
3591  }
// Relies on the ESNPScaleLimit enumerators being ordered from finest to
// coarsest scale - NOTE(review): confirm against the enum declaration.
3592  if (scale_limit < min_limit) return false;
3593  }
3594  return true;
3595 }
3596 
3597 
// Constructor SSeqIdRange(const CTempString& s, TFlags flags = 0); the
// signature line (listing line 3598) is absent from this listing.
//
// Parses an accession or accession range of the form
//   PREFIX DIGITS [ '-' [PREFIX] DIGITS ]
// into prefix / start / stop / digits. PREFIX consists of letters, plus
// underscores when fAllowUnderscores is set in `flags`.
// Throws CSeqIdException (eFormat) when the character after the first
// number is not a hyphen, when the second prefix is non-empty and differs
// from the first, or when the tail is not exactly `digits` characters long.
3599  : start(0), stop(0), digits(0), acc_info(CSeq_id::eAcc_unknown)
3600 {
3601  size_t pos = 0, n = s.size();
// Leading alphabetic (optionally underscore-containing) prefix.
3602  while (pos < n
3603  && (isalpha((unsigned char) s[pos])
3604  || (((flags & fAllowUnderscores) != 0) && s[pos] == '_'))) {
3605  prefix += s[pos++];
3606  }
// Numeric part of the first accession; `digits` records its width.
3607  while (pos < n && isdigit((unsigned char) s[pos])) {
3608  start = start * 10 + s[pos++] - '0';
3609  ++digits;
3610  }
// A lone accession is a range containing a single element.
3611  if (pos == n) {
3612  stop = start;
3613  return;
3614  } else if (s[pos++] != '-') {
3615  NCBI_THROW(CSeqIdException, eFormat,
3616  "Expected hyphen in range " + string(s));
3617  }
3618 
// Optional repeated prefix on the upper bound; it must match the first.
3619  {{
3620  string pfx2;
3621  while (pos < n
3622  && (isalpha((unsigned char) s[pos])
3623  || (((flags & fAllowUnderscores) != 0) && s[pos] == '_'))) {
3624  pfx2 += s[pos++];
3625  }
3626  if ( !pfx2.empty() && pfx2 != prefix) {
3627  NCBI_THROW(CSeqIdException, eFormat,
3628  "Mismatched prefixes in range " + string(s));
3629  }
3630  }}
// The remaining tail must have the same width as the first number.
3631  if (pos + digits != n) {
3632  NCBI_THROW(CSeqIdException, eFormat,
3633  "Mismatched digit counts in range " + string(s));
3634  }
// NOTE(review): this loop stops at the first non-digit and nothing
// afterwards checks pos == n, so a tail of the right length that contains
// a non-digit is accepted with a truncated `stop` - confirm whether that
// is intended.
3635  while (pos < n && isdigit((unsigned char) s[pos])) {
3636  stop = stop * 10 + s[pos++] - '0';
3637  }
3638 }
3639 
3640 
// Body of a GetID(void) const method returning CRef<CSeq_id> (per the
// cross-reference index, defined at listing line 3641; presumably a member
// of SSeqIdRange::const_iterator - confirm). The signature line and listing
// lines 3645, 3647-3648, 3654, 3658 and 3664 are absent from this listing,
// so the declaration of `type`, the value of `flags` and the bodies of the
// two prefix-size checks are not visible here.
//
// Visible behavior: builds a new CSeq_id from the dereferenced iterator
// (**this), passing `type` explicitly unless it is CSeq_id::e_not_set.
3642 {
3643  CRef<CSeq_id> ret;
3644  static const CSeq_id::TParseFlags flags
3646 
// Special handling for multi-element ranges with 5-digit numbers, keyed on
// the first prefix letter and the prefix length.
// NOTE(review): the statements inside both `if` bodies are missing from
// this listing - confirm their effect against the repository.
3649  if (m_Range->size() > 1 && m_Range->digits == 5) {
3650  // account for possible non-uniformity
3651  switch (m_Range->prefix[0]) {
3652  case 'C': case 'D': case 'c': case 'd':
3653  if (m_Range->prefix.size() == 3) {
3655  }
3656  case 'N': case 'n':
3657  if (m_Range->prefix.size() == 1) {
3659  }
3660  }
3661  }
3662  }
3663 
3665  if (type == CSeq_id::e_not_set) {
3666  ret = new CSeq_id(**this);
3667  } else {
3668  ret = new CSeq_id(type, **this);
3669  }
3670 
3671  return ret;
3672 }
3673 
3674 
// Body of x_SetAccession(void) const, returning const string&; the
// signature line (listing line 3675) is absent from this listing.
// Formats the current accession as the range prefix followed by m_Number,
// zero-padded to m_Range->digits characters, stores it in m_Accession and
// returns a reference to that member.
// NOTE(review): assigning m_Accession from a const method implies the
// member is declared mutable - confirm in Seq_id.hpp.
3676 {
3677  CNcbiOstrstream oss;
3678  oss << m_Range->prefix << setw(m_Range->digits) << setfill('0') << m_Number;
3679  m_Accession = CNcbiOstrstreamToString(oss);
3680  return m_Accession;
3681 }
3682 
3683 
3684 END_objects_SCOPE // namespace ncbi::objects::
3686 
3687 #undef NCBI_USE_ERRCODE_X
User-defined methods of the data storage class.
NCBI_PARAM_DECL(bool, SeqId, PreferAccessionOverGi)
static const TChoiceMapEntry sc_ChoiceArray[]
Definition: Seq_id.cpp:526
bool CanCmpAcc(CSeq_id::E_Choice choice)
Definition: Seq_id.cpp:392
static void x_GetLabel_Content(const CSeq_id &id, string *label, CSeq_id::TLabelFlags flags, int *version)
Definition: Seq_id.cpp:1927
static const char *const s_TextId[CSeq_id::e_MaxChoice+1]
Definition: Seq_id.cpp:572
static const char * sc_SupportedRawDbtags[]
Definition: Seq_id.cpp:96
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(CStaticArraySet< string >, kSupportedRawDbtags, sc_SupportedRawDbtags)
typedef NCBI_PARAM_TYPE(SeqId, PreferAccessionOverGi) TPreferAccessionOverGi
static CSeq_id::E_Choice s_CheckForFastaTag(const CTempString &s)
Definition: Seq_id.cpp:632
CStaticPairArrayMap< CTempString, CSeq_id::E_Choice, PNocase_Generic< CTempString > > TChoiceMap
Definition: Seq_id.cpp:568
static const unsigned int kMaxSmallSpecialDigits
Definition: Seq_id.cpp:872
void x_Assign(CObject_id &dst, const CObject_id &src)
Definition: Seq_id.cpp:203
static const TAccInfoMapEntry sc_AccInfoArray[]
Definition: Seq_id.cpp:647
CStaticPairArrayMap< CTempString, CSeq_id::EAccessionInfo, PNocase_Generic< CTempString > > TAccInfoMap
Definition: Seq_id.cpp:863
static CSafeStatic< CRef< SAccGuide > > s_Guide(s_CreateGuide, NULL)
static const bm::bvector ::size_type kBVSizes[kMaxSmallSpecialDigits+1]
Definition: Seq_id.cpp:873
static void x_GetLabel_Type(const CSeq_id &id, string *label, CSeq_id::TLabelFlags flags)
Definition: Seq_id.cpp:1894
SStaticPair< const char *, CSeq_id::EAccessionInfo > TAccInfoMapEntry
Definition: Seq_id.cpp:645
static bool s_HasFastaTag(const CTempString &s)
Definition: Seq_id.cpp:618
static CRef< SAccGuide > * s_CreateGuide(void)
Definition: Seq_id.cpp:1627
CTextseq_id * s_GetTextseq_id(const CSeq_id::E_Choice &choice, CSeq_id &match)
Definition: Seq_id.cpp:3234
NCBI_PARAM_DEF_EX(bool, SeqId, PreferAccessionOverGi, false, eParam_NoThread, SEQ_ID_PREFER_ACCESSION_OVER_GI)
static void s_SplitVersion(const CTempString &acc_in, CTempString &acc, int &ver)
Definition: Seq_id.cpp:151
static const SAccGuide::TAccInfo kUnrecognized
Definition: Seq_id.cpp:972
SStaticPair< const char *, CSeq_id::E_Choice > TChoiceMapEntry
Definition: Seq_id.cpp:524
static const char kDigits[]
Definition: Seq_id.cpp:866
Definition: Date.hpp:53
Definition: Dbtag.hpp:53
EDbtagType GetType(void) const
Definition: Dbtag.cpp:289
EDbtagType
Definition: Dbtag.hpp:58
@ eDbtagType_GI
Definition: Dbtag.hpp:99
@ eDbtagType_DDBJ
Definition: Dbtag.hpp:86
@ eDbtagType_bad
Definition: Dbtag.hpp:59
@ eDbtagType_EMBL
Definition: Dbtag.hpp:88
int Compare(const CDbtag &dbt2) const
Definition: Dbtag.cpp:176
CGiimport_id –.
Definition: Giimport_id.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
void SetStrOrId(CTempString str)
Definition: Object_id.cpp:187
ostream & AsString(ostream &s) const
Definition: Object_id.cpp:202
int Compare(const CObject_id &oid2) const
Definition: Object_id.cpp:145
bool Match(const CObject_id &oid2) const
Definition: Object_id.cpp:61
CObject –.
Definition: ncbiobj.hpp:180
int Compare(const CPDB_seq_id &psip2) const
Definition: PDB_seq_id.cpp:81
bool Match(const CPDB_seq_id &psip2) const
Definition: PDB_seq_id.cpp:53
ostream & AsFastaString(ostream &s) const
Definition: PDB_seq_id.cpp:94
ostream & AsFastaString(ostream &s) const
int Compare(const CPatent_seq_id &psip2) const
bool Match(const CPatent_seq_id &psip2) const
CSafeStatic<>::
CSeqIdException –.
Definition: Seq_id.hpp:1001
Base class for all serializable objects.
Definition: serialbase.hpp:150
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
ostream & AsFastaString(ostream &s, bool allow_version=true) const
Format the contents FASTA string style.
Definition: Textseq_id.cpp:229
CTextseq_id & Set(const CTempString &acc_in, const CTempString &name_in=kEmptyStr, int version=0, const CTempString &release_in=kEmptyStr, bool allow_dot_version=true)
Set all fields with a single call.
Definition: Textseq_id.cpp:59
int Compare(const CTextseq_id &tsip2) const
Definition: Textseq_id.cpp:178
bool Match(const CTextseq_id &tsip2) const
Comparison functions.
Definition: Textseq_id.cpp:143
CTime –.
Definition: ncbitime.hpp:296
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
bvector_size_type size_type
Definition: bm.h:121
const_iterator end() const
Definition: map.hpp:152
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:294
const_iterator end() const
Definition: map.hpp:292
const_iterator begin() const
Definition: map.hpp:291
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
static FILE * f
Definition: readconf.c:23
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
Int8 TIntId
Definition: ncbimisc.hpp:999
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
#define INT_ID_TO(T, id)
Convert gi-compatible int to/from other types.
Definition: ncbimisc.hpp:1120
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST_X(err_subcode, message)
Error posting with default error code and given error subcode.
Definition: ncbidiag.hpp:550
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define STD_CATCH_ALL_X(err_subcode, message)
Standard handling of "exception"-derived exceptions; catches non-standard exceptions and generates "u...
Definition: ncbiexpt.hpp:608
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
ESerialRecursionMode
How to assign and compare child sub-objects of serial objects.
Definition: serialdef.hpp:191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
const TPrim & Get(void) const
Definition: serialbase.hpp:347
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
Definition: line_reader.cpp:49
void x_WriteContentAsFasta(ostream &out) const
Definition: Seq_id.cpp:2186
CSeq_id(void)
See also CSeq_id related functions in "util/sequence.hpp":
Definition: Seq_id.cpp:139
string ComposeOSLT(list< string > *secondary_ids=nullptr, TComposeOSLTFlags parse_flags=0) const
JIRA ID-5188 : Compose OSLT string for the primary id, as well as OSLT strings for the secondary ids,...
Definition: Seq_id.cpp:3416
int TErrorFlags
Definition: Seq_id.hpp:113
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
static bool IsValid(const CBioseq::TId &ids, TParseFlags flags=fParse_Default)
Definition: Seq_id.cpp:3223
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
int BaseBlastScore(void) const
Definition: Seq_id.cpp:3157
int BaseFastaAAScore(void) const
Definition: Seq_id.cpp:3126
static const size_t kMaxLocalIDLength
ID length restrictions.
Definition: Seq_id.hpp:841
static SIZE_TYPE ParseIDs(CBioseq::TId &ids, const CTempString &s, TParseFlags flags=fParse_Default)
Parse a string representing one or more Seq-ids, appending the results to IDS.
Definition: Seq_id.cpp:2613
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
E_Choice x_Init(list< CTempString > &fasta_pieces, E_Choice type, ETypeVariant tv)
Definition: Seq_id.cpp:2698
int TLabelFlags
Definition: Seq_id.hpp:625
int TComposeOSLTFlags
Definition: Seq_id.hpp:823
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
static ESNPScaleLimit GetSNPScaleLimit_Value(const string &name)
Definition: Seq_id.cpp:3561
int CompareOrdered(const CSeq_id &sid2) const
Definition: Seq_id.cpp:486
const SSeqIdRange * m_Range
Definition: Seq_id.hpp:972
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2603
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:562
E_SIC
Compare return values.
Definition: Seq_id.hpp:579
EFastaAsTypeAndContent
Tag for method variants that would otherwise be ambiguous.
Definition: Seq_id.hpp:116
static ETypeVariant x_IdentifyTypeVariant(E_Choice type, const CTempString &str)
Definition: Seq_id.cpp:2439
virtual void WriteAsFasta(ostream &out) const
Implement serializable interface.
Definition: Seq_id.cpp:2164
int BaseBestRankScore(void) const
Definition: Seq_id.cpp:3080
int TFlags
binary OR of EFlags
Definition: Seq_id.hpp:899
CSeq_id & Set(const CTempString &the_id, TParseFlags flags=fParse_AnyRaw)
Reassign based on flat specifications; arguments interpreted as with constructors.
Definition: Seq_id.cpp:2457
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
SSeqIdRange(const CTempString &s, TFlags flags=0)
Definition: Seq_id.cpp:3598
int digits
Definition: Seq_id.hpp:989
static E_Choice WhichInverseSeqId(const CTempString &SeqIdCode)
Converts a string to a choice, no need to require a member.
Definition: Seq_id.cpp:599
static const char * GetSNPScaleLimit_Name(ESNPScaleLimit value)
Definition: Seq_id.cpp:3549
static bool IsValidLocalID(const CTempString &s)
Perform rudimentary validation on potential local IDs, whose contents should be pure ASCII and limite...
Definition: Seq_id.cpp:2576
static string GetStringDescr(const CBioseq &bioseq, EStringFormat fmt)
Definition: Seq_id.cpp:2292
static void LoadAccessionGuide(const string &filename)
Definition: Seq_id.cpp:1882
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
int AdjustScore(int base_score, TAdjustScoreFlags flags=TAdjustScoreFlags()) const
Definition: Seq_id.cpp:3195
EStringFormat
Get a string representation of the sequence IDs of a given bioseq.
Definition: Seq_id.hpp:661
CSeq_id::EAccessionInfo acc_info
Definition: Seq_id.hpp:990
virtual const char * GetErrCodeString(void) const override
Translate from the error code value to its string representation.
Definition: Seq_id.cpp:128
bool IsAllowedSNPScaleLimit(ESNPScaleLimit scale_limit) const
Definition: Seq_id.cpp:3571
static EAccessionInfo x_IdentifyAccession(const CTempString &main_acc, TParseFlags flags, bool has_version)
Definition: Seq_id.cpp:1666
string prefix
Definition: Seq_id.hpp:986
int TParseFlags
Definition: Seq_id.hpp:104
CRef< CSeq_id > GetID(void) const
Definition: Seq_id.cpp:3641
void GetMatchingTextseqIds(TSeqIdHandles &matches) const
Collect partially matching textseq-ids.
Definition: Seq_id.cpp:3268
ELabelType
return the label for a given string
Definition: Seq_id.hpp:603
const string & x_SetAccession(void) const
Definition: Seq_id.cpp:3675
void GetMatchingIds(TSeqIdHandles &matches) const
Collect partially matching seq-ids: no-version, no-name etc.
Definition: Seq_id.cpp:3335
static int FastaNARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:780
int BaseTextScore(void) const
Definition: Seq_id.cpp:3039
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
static bool PreferAccessionOverGi(void)
Check if the option to prefer accession.version over GI is enabled (SeqId/PreferAccessionOverGi or SE...
Definition: Seq_id.cpp:3404
static bool AvoidGi(void)
Check if the option to avoid GI ids is enabled (SeqId/AvoidGi or SEQ_ID_AVOID_GI).
Definition: Seq_id.cpp:3410
virtual ~CSeq_id(void)
Destructor.
Definition: Seq_id.cpp:145
static TErrorFlags CheckLocalID(const CTempString &s)
Perform rudimentary validation on potential local IDs, whose contents should not exceed fifty charact...
Definition: Seq_id.cpp:2583
ETypeVariant
Definition: Seq_id.hpp:860
int BaseFastaNAScore(void) const
Definition: Seq_id.cpp:3096
static const char * WhichFastaTag(E_Choice choice)
Converts a choice to a FASTA tag, with no trailing vertical bar.
Definition: Seq_id.cpp:609
E_SIC Compare(const CSeq_id &sid2) const
Compare() - more general.
Definition: Seq_id.cpp:411
ESNPScaleLimit
SNP annotation scale limits.
Definition: Seq_id.hpp:847
size_t size(void) const
Definition: Seq_id.hpp:983
static int FastaAARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:778
@ fLabel_Trimmed
Trim trailing FASTA delimiters.
Definition: Seq_id.hpp:619
@ fLabel_UpperCase
Upper case label, with special encoding for PDB chain-ids.
Definition: Seq_id.hpp:620
@ fLabel_Version
Show the version.
Definition: Seq_id.hpp:615
@ fLabel_GeneralDbIsContent
For type general, use the database name as the tag and the (text or numeric) key as the content.
Definition: Seq_id.hpp:618
@ eUnknownType
Unrecognized Seq-id type.
Definition: Seq_id.hpp:1005
@ eFormat
Contents not parsable as expected.
Definition: Seq_id.hpp:1006
@ eAcc_gb_other_nuc
Definition: Seq_id.hpp:346
@ eAcc_ddbj_est
Definition: Seq_id.hpp:453
@ eAcc_ddbj_tsavm_nuc
Definition: Seq_id.hpp:463
@ eAcc_gb_embl_ddbj
Definition: Seq_id.hpp:333
@ eAcc_wgs
Definition: Seq_id.hpp:290
@ eAcc_gb_optical_map
Definition: Seq_id.hpp:379
@ eAcc_gb_tpa_chromosome
Definition: Seq_id.hpp:499
@ eAcc_gb_mga
Definition: Seq_id.hpp:378
@ eAcc_ddbj_wgs_prot
Definition: Seq_id.hpp:470
@ eAcc_refseq_wgsv_nuc
Definition: Seq_id.hpp:425
@ eAcc_gb_tsavm_prot
Definition: Seq_id.hpp:362
@ eAcc_ddbj_tpa_wgs_nuc
Definition: Seq_id.hpp:533
@ eAcc_gb_cdna
Definition: Seq_id.hpp:351
@ eAcc_refseq_wgsv_intermed
Definition: Seq_id.hpp:434
@ eAcc_ddbj_tpa_tsa_prot
Definition: Seq_id.hpp:525
@ eAcc_embl_tpa_prot
Definition: Seq_id.hpp:503
@ eAcc_embl_gss
Definition: Seq_id.hpp:396
@ eAcc_refseq_unique_prot
Definition: Seq_id.hpp:417
@ eAcc_flag_mask
Definition: Seq_id.hpp:260
@ eAcc_ddbj_wgs_nuc
Definition: Seq_id.hpp:469
@ eAcc_ddbj_tpa_wgsvm_prot
Definition: Seq_id.hpp:540
@ eAcc_ddbj_wgsv_nuc
Definition: Seq_id.hpp:473
@ eAcc_embl_tsam_nuc
Definition: Seq_id.hpp:390
@ eAcc_embl_tsav_prot
Definition: Seq_id.hpp:393
@ eAcc_gb_tpa_wgsv_nuc
Definition: Seq_id.hpp:495
@ eAcc_embl_wgsv_prot
Definition: Seq_id.hpp:405
@ eAcc_type_mask
Definition: Seq_id.hpp:247
@ eAcc_ddbj_tsam_nuc
Definition: Seq_id.hpp:459
@ eAcc_embl_wgs_nuc
Definition: Seq_id.hpp:400
@ eAcc_gb_tpa_other
Definition: Seq_id.hpp:486
@ eAcc_gb_wgsm_nuc
Definition: Seq_id.hpp:370
@ eAcc_ddbj_tpa_targeted_nuc
Definition: Seq_id.hpp:542
@ eAcc_embl_tsa_prot
Definition: Seq_id.hpp:389
@ eAcc_embl_wgsvm_prot
Definition: Seq_id.hpp:407
@ eAcc_ddbj_tpa_targetedvm_nuc
Definition: Seq_id.hpp:545
@ eAcc_gb_gss
Definition: Seq_id.hpp:364
@ eAcc_patent
Definition: Seq_id.hpp:412
@ eAcc_gpipe_prot
Definition: Seq_id.hpp:549
@ eAcc_gb_tpa_con
Definition: Seq_id.hpp:490
@ eAcc_gb_con
Definition: Seq_id.hpp:367
@ eAcc_embl_tpa_tsa_nuc
Definition: Seq_id.hpp:504
@ eAcc_embl_tpa_wgsvm_nuc
Definition: Seq_id.hpp:518
@ eAcc_refseq_contig_ncbo
Definition: Seq_id.hpp:442
@ eAcc_embl_patent
Definition: Seq_id.hpp:387
@ fAcc_prot
Definition: Seq_id.hpp:252
@ eAcc_gpipe_ncrna
Definition: Seq_id.hpp:554
@ eAcc_refseq_wgsvm_nuc
Definition: Seq_id.hpp:427
@ eAcc_ddbj_tsam_prot
Definition: Seq_id.hpp:460
@ eAcc_ddbj_tpa_tsam_prot
Definition: Seq_id.hpp:527
@ eAcc_embl_htgs
Definition: Seq_id.hpp:398
@ eAcc_gb_tpa_wgsm_nuc
Definition: Seq_id.hpp:493
@ eAcc_embl_wgsm_nuc
Definition: Seq_id.hpp:402
@ eAcc_gibbmt
Definition: Seq_id.hpp:339
@ eAcc_gpipe_unreserved
Definition: Seq_id.hpp:555
@ eAcc_gb_tpa_wgsvm_prot
Definition: Seq_id.hpp:498
@ eAcc_maybe_embl
Definition: Seq_id.hpp:328
@ eAcc_ddbj_tpa_tsav_prot
Definition: Seq_id.hpp:529
@ eAcc_refseq_contig
Definition: Seq_id.hpp:420
@ eAcc_refseq_mrna_predicted
Definition: Seq_id.hpp:439
@ eAcc_ddbj_wgsm_prot
Definition: Seq_id.hpp:472
@ eAcc_gb_backbone
Definition: Seq_id.hpp:354
@ eAcc_gb_tsav_prot
Definition: Seq_id.hpp:360
@ eAcc_embl_other_nuc
Definition: Seq_id.hpp:384
@ eAcc_gb_wgs_prot
Definition: Seq_id.hpp:369
@ eAcc_embl_con
Definition: Seq_id.hpp:399
@ eAcc_ddbj_gss
Definition: Seq_id.hpp:465
@ eAcc_ddbj_dirsub
Definition: Seq_id.hpp:454
@ eAcc_ddbj_con
Definition: Seq_id.hpp:468
@ eAcc_ddbj_tpa_other
Definition: Seq_id.hpp:521
@ eAcc_general
Definition: Seq_id.hpp:444
@ eAcc_chromosome
Definition: Seq_id.hpp:292
@ eAcc_gpipe_chromosome
Definition: Seq_id.hpp:552
@ eAcc_refseq_unreserved
Definition: Seq_id.hpp:418
@ eAcc_refseq_wgsm_nuc
Definition: Seq_id.hpp:423
@ eAcc_gb_tpa_nuc
Definition: Seq_id.hpp:487
@ eAcc_ddbj_wgsvm_prot
Definition: Seq_id.hpp:476
@ eAcc_gpipe_genomic
Definition: Seq_id.hpp:553
@ fAcc_fallback
Definition: Seq_id.hpp:258
@ eAcc_embl_tpa_wgsvm_prot
Definition: Seq_id.hpp:519
@ eAcc_targeted
Definition: Seq_id.hpp:298
@ eAcc_embl_wgsv_nuc
Definition: Seq_id.hpp:404
@ eAcc_embl_ddbj
Definition: Seq_id.hpp:332
@ eAcc_refseq_mrna
Definition: Seq_id.hpp:415
@ eAcc_ddbj_tpa_wgsm_prot
Definition: Seq_id.hpp:536
@ eAcc_ambiguous_nuc
Definition: Seq_id.hpp:326
@ eAcc_maybe_gb
Definition: Seq_id.hpp:327
@ eAcc_embl_tpa_tsav_nuc
Definition: Seq_id.hpp:508
@ eAcc_ddbj_other_nuc
Definition: Seq_id.hpp:452
@ eAcc_gb_tpa_prot
Definition: Seq_id.hpp:488
@ eAcc_refseq_wgsv_prot
Definition: Seq_id.hpp:426
@ eAcc_ddbj_tpa_wgsv_nuc
Definition: Seq_id.hpp:537
@ eAcc_embl_mga
Definition: Seq_id.hpp:408
@ eAcc_gb_tpa_wgs_nuc
Definition: Seq_id.hpp:491
@ eAcc_gb_wgsvm_prot
Definition: Seq_id.hpp:375
@ eAcc_gb_tpa_wgsv_prot
Definition: Seq_id.hpp:496
@ eAcc_embl_tpa_wgs_nuc
Definition: Seq_id.hpp:512
@ eAcc_gb_embl
Definition: Seq_id.hpp:330
@ eAcc_refseq_wgsm_prot
Definition: Seq_id.hpp:424
@ eAcc_named_annot_track
Definition: Seq_id.hpp:558
@ eAcc_embl_tpa_tsam_nuc
Definition: Seq_id.hpp:506
@ eAcc_pdb
Definition: Seq_id.hpp:484
@ eAcc_ddbj_tpa_prot
Definition: Seq_id.hpp:523
@ eAcc_maybe_ddbj
Definition: Seq_id.hpp:329
@ eAcc_embl_tpa_wgsv_nuc
Definition: Seq_id.hpp:516
@ eAcc_ddbj_tpa_wgs_prot
Definition: Seq_id.hpp:534
@ eAcc_gb_targeted_nuc
Definition: Seq_id.hpp:380
@ eAcc_embl_prot
Definition: Seq_id.hpp:383
@ eAcc_gpipe_mrna
Definition: Seq_id.hpp:551
@ eAcc_refseq_prot
Definition: Seq_id.hpp:414
@ eAcc_ddbj_htgs
Definition: Seq_id.hpp:467
@ eAcc_embl_tpa_tsa_prot
Definition: Seq_id.hpp:505
@ eAcc_unreserved_nuc
Definition: Seq_id.hpp:334
@ eAcc_gb_tsa_nuc
Definition: Seq_id.hpp:355
@ eAcc_refseq_wgs_nuc
Definition: Seq_id.hpp:421
@ eAcc_refseq_wgs_prot
Definition: Seq_id.hpp:422
@ eAcc_embl_tsav_nuc
Definition: Seq_id.hpp:392
@ eAcc_ddbj_tpa_tsavm_nuc
Definition: Seq_id.hpp:530
@ eAcc_embl_wgs_prot
Definition: Seq_id.hpp:401
@ eAcc_ddbj_patent
Definition: Seq_id.hpp:455
@ eAcc_gb_dirsub
Definition: Seq_id.hpp:348
@ eAcc_ddbj_tpa_tsav_nuc
Definition: Seq_id.hpp:528
@ eAcc_embl_tpa_wgsm_nuc
Definition: Seq_id.hpp:514
@ eAcc_ddbj_tsavm_prot
Definition: Seq_id.hpp:464
@ eAcc_refseq_ncrna
Definition: Seq_id.hpp:416
@ eAcc_embl_tpa_wgsm_prot
Definition: Seq_id.hpp:515
@ eAcc_refseq_chromosome
Definition: Seq_id.hpp:429
@ eAcc_wgs_intermed
Definition: Seq_id.hpp:294
@ eAcc_ddbj_tpa_targetedm_nuc
Definition: Seq_id.hpp:543
@ eAcc_ddbj_mga
Definition: Seq_id.hpp:477
@ eAcc_embl_wgsvm_nuc
Definition: Seq_id.hpp:406
@ eAcc_gb_tpa_wgsvm_nuc
Definition: Seq_id.hpp:497
@ eAcc_ddbj_other
Definition: Seq_id.hpp:450
@ eAcc_gb_tpa_wgs_prot
Definition: Seq_id.hpp:492
@ eAcc_gb_ddbj
Definition: Seq_id.hpp:331
@ eAcc_gpipe_scaffold
Definition: Seq_id.hpp:550
@ eAcc_ddbj_mrna
Definition: Seq_id.hpp:456
@ eAcc_general_nuc
Definition: Seq_id.hpp:445
@ eAcc_ddbj_tpa_tsam_nuc
Definition: Seq_id.hpp:526
@ eAcc_gb_tsa_prot
Definition: Seq_id.hpp:356
@ eAcc_ddbj_tpa_targetedv_nuc
Definition: Seq_id.hpp:544
@ eAcc_ddbj_tpa_tsavm_prot
Definition: Seq_id.hpp:531
@ eAcc_refseq_genomic
Definition: Seq_id.hpp:430
@ eAcc_gb_wgsv_prot
Definition: Seq_id.hpp:373
@ eAcc_ddbj_targetedvm_nuc
Definition: Seq_id.hpp:481
@ eAcc_gsdb_dirsub
Definition: Seq_id.hpp:352
@ eAcc_ddbj_tpa_nuc
Definition: Seq_id.hpp:522
@ eAcc_gb_tpa_wgsm_prot
Definition: Seq_id.hpp:494
@ eAcc_refseq_prot_predicted
Definition: Seq_id.hpp:438
@ eAcc_embl_tpa_tsam_prot
Definition: Seq_id.hpp:507
@ eAcc_gb_genome
Definition: Seq_id.hpp:365
@ eAcc_gi
Definition: Seq_id.hpp:448
@ eAcc_gb_tsam_prot
Definition: Seq_id.hpp:358
@ eAcc_embl_tsavm_nuc
Definition: Seq_id.hpp:394
@ eAcc_gpipe_other_nuc
Definition: Seq_id.hpp:548
@ eAcc_gb_prot
Definition: Seq_id.hpp:345
@ eAcc_ddbj_wgsm_nuc
Definition: Seq_id.hpp:471
@ eAcc_ddbj_tpa_tsa_nuc
Definition: Seq_id.hpp:524
@ eAcc_embl_tpa_nuc
Definition: Seq_id.hpp:502
@ eAcc_local
Definition: Seq_id.hpp:337
@ eAcc_gb_htgs
Definition: Seq_id.hpp:366
@ eAcc_gb_patent_prot
Definition: Seq_id.hpp:350
@ eAcc_prf
Definition: Seq_id.hpp:483
@ eAcc_embl_tsa_nuc
Definition: Seq_id.hpp:388
@ eAcc_giim
Definition: Seq_id.hpp:340
@ eAcc_gb_wgs_nuc
Definition: Seq_id.hpp:368
@ eAcc_gb_tpa_segset
Definition: Seq_id.hpp:489
@ eAcc_gb_chromosome
Definition: Seq_id.hpp:376
@ eAcc_con
Definition: Seq_id.hpp:289
@ eAcc_embl_tpa_tsav_prot
Definition: Seq_id.hpp:509
@ eAcc_gb_segset
Definition: Seq_id.hpp:363
@ eAcc_embl_wgsm_prot
Definition: Seq_id.hpp:403
@ eAcc_embl_other
Definition: Seq_id.hpp:382
@ eAcc_ddbj_targetedv_nuc
Definition: Seq_id.hpp:480
@ eAcc_embl_tpa_tsavm_prot
Definition: Seq_id.hpp:511
@ eAcc_ddbj_prot
Definition: Seq_id.hpp:451
@ eAcc_embl_tsavm_prot
Definition: Seq_id.hpp:395
@ eAcc_ddbj_targeted_nuc
Definition: Seq_id.hpp:478
@ eAcc_refseq_wgsm_intermed
Definition: Seq_id.hpp:432
@ eAcc_gb_tsavm_nuc
Definition: Seq_id.hpp:361
@ eAcc_gibbsq
Definition: Seq_id.hpp:338
@ eAcc_gb_patent
Definition: Seq_id.hpp:349
@ eAcc_gb_wgsv_nuc
Definition: Seq_id.hpp:372
@ eAcc_ddbj_tsa_nuc
Definition: Seq_id.hpp:457
@ eAcc_gb_tsam_nuc
Definition: Seq_id.hpp:357
@ fAcc_specials
Definition: Seq_id.hpp:255
@ eAcc_ddbj_tpa_chromosome
Definition: Seq_id.hpp:541
@ eAcc_tsa
Definition: Seq_id.hpp:273
@ eAcc_general_prot
Definition: Seq_id.hpp:446
@ eAcc_embl_tsam_prot
Definition: Seq_id.hpp:391
@ eAcc_gb_wgsm_prot
Definition: Seq_id.hpp:371
@ eAcc_unknown
Definition: Seq_id.hpp:322
@ eAcc_embl_genome
Definition: Seq_id.hpp:397
@ eAcc_embl_est
Definition: Seq_id.hpp:385
@ eAcc_ddbj_tpa_wgsvm_nuc
Definition: Seq_id.hpp:539
@ eAcc_unreserved_prot
Definition: Seq_id.hpp:335
@ eAcc_embl_tpa_wgsv_prot
Definition: Seq_id.hpp:517
@ eAcc_refseq_chromosome_ncbo
Definition: Seq_id.hpp:441
@ eAcc_pir
Definition: Seq_id.hpp:410
@ eAcc_gb_est
Definition: Seq_id.hpp:347
@ eAcc_gb_sts
Definition: Seq_id.hpp:377
@ eAcc_swissprot
Definition: Seq_id.hpp:411
@ eAcc_ddbj_wgsvm_nuc
Definition: Seq_id.hpp:475
@ eAcc_ddbj_tpa_con
Definition: Seq_id.hpp:532
@ eAcc_embl_tpa_other
Definition: Seq_id.hpp:501
@ eAcc_other
Definition: Seq_id.hpp:264
@ eAcc_ddbj_wgsv_prot
Definition: Seq_id.hpp:474
@ fAcc_master
Definition: Seq_id.hpp:256
@ eAcc_refseq_ncrna_predicted
Definition: Seq_id.hpp:440
@ eAcc_refseq_wgsvm_intermed
Definition: Seq_id.hpp:436
@ eAcc_gb_gsdb
Definition: Seq_id.hpp:353
@ eAcc_gb_wgsvm_nuc
Definition: Seq_id.hpp:374
@ eAcc_embl_dirsub
Definition: Seq_id.hpp:386
@ eAcc_gb_tsav_nuc
Definition: Seq_id.hpp:359
@ eAcc_ddbj_genome
Definition: Seq_id.hpp:466
@ eAcc_ddbj_tsav_prot
Definition: Seq_id.hpp:462
@ eAcc_embl_tpa_tsavm_nuc
Definition: Seq_id.hpp:510
@ eAcc_division_mask
Definition: Seq_id.hpp:299
@ eAcc_embl_tpa_wgs_prot
Definition: Seq_id.hpp:513
@ eAcc_ddbj_tsa_prot
Definition: Seq_id.hpp:458
@ eAcc_refseq_wgs_intermed
Definition: Seq_id.hpp:431
@ eAcc_refseq_genome
Definition: Seq_id.hpp:419
@ eAcc_ddbj_targetedm_nuc
Definition: Seq_id.hpp:479
@ eAcc_ddbj_tpa_wgsv_prot
Definition: Seq_id.hpp:538
@ eAcc_ddbj_tsav_nuc
Definition: Seq_id.hpp:461
@ eAcc_ddbj_tpa_wgsm_nuc
Definition: Seq_id.hpp:535
@ eAcc_gb_other
Definition: Seq_id.hpp:344
@ eAcc_refseq_wgsvm_prot
Definition: Seq_id.hpp:428
@ e_NO
SeqIds compared, but are different.
Definition: Seq_id.hpp:582
@ e_DIFF
different SeqId types -- can't compare
Definition: Seq_id.hpp:581
@ e_error
some problem
Definition: Seq_id.hpp:580
@ e_YES
SeqIds compared, are equivalent.
Definition: Seq_id.hpp:583
@ fAllowLocalId
Definition: Seq_id.hpp:820
@ fGpipeAddSecondary
Add "ACC.VER(=1)" for a 2ndary id.
Definition: Seq_id.hpp:821
@ fParse_NoFASTA
Don't bother checking for a tag.
Definition: Seq_id.hpp:91
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
Definition: Seq_id.hpp:80
@ fParse_RawGI
Treat raw numbers as GIs, not local IDs.
Definition: Seq_id.hpp:82
@ fParse_AnyRaw
Definition: Seq_id.hpp:83
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
@ fParse_FallbackOK
For IdentifyAccession, don't warn about falling back to a different specific type because broad ident...
Definition: Seq_id.hpp:96
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
Definition: Seq_id.hpp:87
@ kMaxScore
Definition: Seq_id.hpp:733
@ eFormat_BestWithoutVersion
Definition: Seq_id.hpp:664
@ eFormat_FastA
Definition: Seq_id.hpp:662
@ eFormat_BestWithVersion
Definition: Seq_id.hpp:665
@ eFormat_ForceGI
Definition: Seq_id.hpp:663
@ eFastaContent
Like eFasta, but without any tag.
Definition: Seq_id.hpp:608
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
@ eDefault
default is to show type + content
Definition: Seq_id.hpp:611
@ eBoth
Type and content, delimited by a vertical bar.
Definition: Seq_id.hpp:606
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
Definition: Seq_id.hpp:607
@ eType
FASTA-style type, or database in GeneralDbIsContent mode.
Definition: Seq_id.hpp:604
@ fAllowUnderscores
Allow prefixes to contain underscores.
Definition: Seq_id.hpp:897
@ eTV_pgp
Definition: Seq_id.hpp:863
@ eTV_tr
Definition: Seq_id.hpp:862
@ eTV_plain
Definition: Seq_id.hpp:861
@ fEmptyId
Definition: Seq_id.hpp:109
@ fNoError
Definition: Seq_id.hpp:108
@ fExceedsMaxLength
Definition: Seq_id.hpp:111
@ fInvalidChar
Definition: Seq_id.hpp:110
@ fRequireAccessions
Definition: Seq_id.hpp:737
@ eSNPScaleLimit_Supercontig
Definition: Seq_id.hpp:851
@ eSNPScaleLimit_Unit
Definition: Seq_id.hpp:849
@ eSNPScaleLimit_Default
Definition: Seq_id.hpp:848
@ eSNPScaleLimit_Contig
Definition: Seq_id.hpp:850
@ eSNPScaleLimit_Chromosome
Definition: Seq_id.hpp:852
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – i.e., it points to an object and has a non-null value.
Definition: ncbiobj.hpp:726
@ eParam_NoThread
Do not use per-thread values.
Definition: ncbi_param.hpp:418
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
Definition: tempstr.hpp:306
static CTempString TruncateSpaces_Unsafe(const CTempString str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
Definition: ncbistr.cpp:3182
static int StringToNonNegativeInt(const CTempString str, TStringToNumFlags flags=0)
Convert string to non-negative integer value.
Definition: ncbistr.cpp:457
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static bool MatchesMask(CTempString str, CTempString mask, ECase use_case=eCase)
Match "str" against the "mask".
Definition: ncbistr.cpp:389
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
CTempString & assign(const char *src_str, size_type len)
Assign new values to the content of the string.
Definition: tempstr.hpp:733
static TNumeric StringToNumeric(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to a numeric value.
Definition: ncbistr.hpp:330
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2699
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
void erase(size_type pos=0)
Truncate the string at some specified position. Note: basic_string<> supports additional erase() optio...
Definition: tempstr.hpp:514
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5103
void clear(void)
Clears the string.
Definition: tempstr.hpp:351
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3545
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
size_type find_first_not_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character not in the matching string within the current string,...
Definition: tempstr.hpp:553
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
size_type find_first_of(const CTempString match, size_type pos=0) const
Find the first occurrence of any character in the matching string within the current string,...
Definition: tempstr.hpp:538
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
const char *const kEmptyCStr
Empty "C" string (points to a '\0').
Definition: ncbistr.cpp:68
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
@ fConvErr_NoThrow
Do no