NCBI C++ ToolKit
Seq_id.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: Seq_id.hpp 102244 2024-04-10 18:51:00Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: .......
27  *
28  * File Description:
29  * .......
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using specifications from the ASN data definition file
34  * 'seqloc.asn'.
35  */
36 
37 #ifndef OBJECTS_SEQLOC_SEQ_ID_HPP
38 #define OBJECTS_SEQLOC_SEQ_ID_HPP
39 
40 
41 // generated includes
43 #include <corelib/ncbi_limits.hpp>
44 #include <serial/serializable.hpp>
45 
46 #include <objects/seq/Bioseq.hpp>
48 
49 #include <set>
50 
51 // generated classes
52 
54 
55 class ILineReader;
56 
57 BEGIN_objects_SCOPE // namespace ncbi::objects::
58 
59 /** @addtogroup OBJECTS_Seqid
60  *
61  * @{
62  */
63 
64 
65 class CBioseq;
66 class CSeq_id_Handle;
67 
68 
70  public CSerializable
71 {
73 
74 public:
75  enum EParseFlags { // Bit flags governing how ID strings are parsed.
76  /// Warn rather than throwing an exception when a FASTA-style ID set
77  /// contains unparsable portions, either from unsupported syntax or
78  /// because it is only possible to accept a single ID anyway (in the
79  /// string-based constructor and corresponding Set() variant).
80  fParse_PartialOK = 0x01,
81  fParse_RawText = 0x02, ///< Try to ID raw non-numeric accessions
82  fParse_RawGI = 0x04, ///< Treat raw numbers as GIs, not local IDs
83  fParse_AnyRaw = fParse_RawText | fParse_RawGI, ///< Both raw-input flags
84  /// Treat otherwise unidentified strings as raw accessions,
85  /// provided that they pass rudimentary validation. Also,
86  /// accept PDB accessions with chains but no delimiters.
87  fParse_ValidLocal = 0x08,
88  /// Treat otherwise unidentified strings as local accessions as long
89  /// as they don't resemble FASTA-style IDs (or ID sets).
90  fParse_AnyLocal = 0x18, // i.e. 0x10 plus the fParse_ValidLocal bit (0x08)
91  fParse_NoFASTA = 0x20, ///< Don't bother checking for a tag
92  /// For IdentifyAccession, don't warn about falling back to a
93  /// different specific type because broad identification is
94  /// sufficient. (Automatically on when calling IdentifyAccession
95  /// internally.)
96  fParse_FallbackOK = 0x40,
97 
98  /// By default in ParseIDs and IsValid, allow raw parsable
99  /// non-numeric accessions and plausible local accessions.
100  /// (The string-based constructor and Set method have a
101  /// stricter default: fParse_AnyRaw.)
102  fParse_Default = fParse_RawText | fParse_ValidLocal
103  };
104  typedef int TParseFlags; // bitwise OR of EParseFlags values
105 
106 
107  enum EErrorFlags { // Distinct bits, each naming one kind of problem with an ID.
108  fNoError = 0,
109  fEmptyId = 1, // Id may consist of empty string(s)
110  fInvalidChar = 1 << 1,
111  fExceedsMaxLength = 1 << 2
112  };
113  using TErrorFlags = int; // return type of CheckLocalID(); presumably an OR of EErrorFlags -- TODO confirm
114 
115  /// Tag for method variants that would otherwise be ambiguous.
117  eFasta_AsTypeAndContent
118  };
119 
120  ///
121  /// See also CSeq_id related functions in "util/sequence.hpp":
122  ///
123  /// TSeqPos GetLength(const CSeq_id&, CScope*);
124  /// bool IsSameBioseq(const CSeq_id&, const CSeq_id&, CScope*);
125  ///
126 
127  /// Default constructor (takes no arguments).
128  CSeq_id(void);
129 
130  /// Construct a Seq-id from a flat string.
131  /// @param the_id
132  /// Input ID, preferably FASTA-style.
133  /// @param flags
134  /// How to interpret anything other than a single FASTA-style ID
135  explicit CSeq_id(const CTempString& the_id,
136  TParseFlags flags = fParse_AnyRaw); // default is stricter than fParse_Default
137 
138  /// Construct a Seq-id from a dbtag.
139  /// @param tag
140  /// Input dbtag.
141  /// @param set_as_general
142  /// Whether to store tags from unrecognized databases as is in
143  /// Seq-ids of type general rather than rejecting them altogether.
144  explicit CSeq_id(const CDbtag& tag, bool set_as_general = true); // stores as general by default
145 
146  /// Construct a numeric Seq-id.
147  /// @param the_type
148  /// Type of Seq-id (normally e_Gi)
149  /// @param int_seq_id
150  /// Numeric value.
151  CSeq_id(E_Choice the_type,
152  TIntId int_seq_id);
153 #ifdef NCBI_STRICT_GI
154  CSeq_id(E_Choice the_type, // extra overload taking a TGi; declared only under NCBI_STRICT_GI
155  TGi gi);
156 #endif // NCBI_STRICT_GI
157 
158  /// Construct a Seq-id from a flat representation.
159  /// @param the_type
160  /// Type of Seq_id to construct
161  /// @param acc_in
162  /// Primary string value -- normally accession, overridden as
163  /// country for patents, database for "general" IDs, molecule ID
164  /// for PDB IDs.
165  /// @param name_in
166  /// Additional string value -- normally name/locus, overridden as
167  /// (application) number for patents, tag for "general" IDs,
168  /// chain ID for PDB.
169  /// @param version
170  /// Numeric value -- normally version number, overridden as sequence
171  /// number for patents.
172  /// @param release_in
173  /// Historically used to convey release identifiers; for patents,
174  /// may be set to "pgp" (case-insensitive) to indicate a
175  /// Pre-Grant Patent [application].
176  CSeq_id(E_Choice the_type,
177  const CTempString& acc_in,
178  const CTempString& name_in = kEmptyStr,
179  int version = 0,
180  const CTempString& release_in = kEmptyStr);
181 
182  /// Construct a Seq-id from a FASTA string with the leading (type)
183  /// component already parsed out.
184  /// @param the_type
185  /// Type of Seq_id to construct
186  /// @param the_content
187  /// FASTA-style content, with embedded vertical bars as appropriate.
188  CSeq_id(EFastaAsTypeAndContent, E_Choice the_type, // unnamed first parameter only selects this overload
189  const CTempString& the_content);
190 
191  /// Reassign based on flat specifications; arguments interpreted
192  /// as with constructors. (Returns a reference to self.)
193 
194  CSeq_id& Set(const CTempString& the_id, TParseFlags flags = fParse_AnyRaw); // from a flat string
195 
196  CSeq_id& Set(const CDbtag& tag, bool set_as_general = true); // from a dbtag
197 
198  CSeq_id& Set(E_Choice the_type, // from a type and a numeric value
199  TIntId int_seq_id);
200 
201  CSeq_id& Set(E_Choice the_type, // from a type and flat string/version components
202  const CTempString& acc_in,
203  const CTempString& name_in = kEmptyStr,
204  int version = 0,
205  const CTempString& release_in = kEmptyStr);
206 
207  CSeq_id& Set(EFastaAsTypeAndContent, E_Choice the_type, // from a type and FASTA-style content
208  const CTempString& the_content);
209 
210  /// Destructor (declared virtual).
211  virtual ~CSeq_id(void);
212 
213  /// Converts a string to a choice; static, so no CSeq_id object is needed.
214  static E_Choice WhichInverseSeqId(const CTempString& SeqIdCode);
215 
216  /// Converts a choice to a FASTA tag, with no trailing vertical bar.
217  static const char* WhichFastaTag(E_Choice choice); // static as well; result is a C string
218 
219  /// For IdentifyAccession (below)
221  // E_Choice values, explicitly pulled in to help avoid warnings
222  // elsewhere about bitwise operations between different enum types.
223  eSeqId_not_set = e_not_set,
224  eSeqId_local = e_Local,
225  eSeqId_gibbsq = e_Gibbsq,
226  eSeqId_gibbmt = e_Gibbmt,
227  eSeqId_giim = e_Giim,
228  eSeqId_genbank = e_Genbank,
229  eSeqId_embl = e_Embl,
230  eSeqId_pir = e_Pir,
231  eSeqId_swissprot = e_Swissprot,
232  eSeqId_patent = e_Patent,
233  eSeqId_other = e_Other,
234  eSeqId_refseq = eSeqId_other,
235  eSeqId_general = e_General,
236  eSeqId_gi = e_Gi,
237  eSeqId_ddbj = e_Ddbj,
238  eSeqId_prf = e_Prf,
239  eSeqId_pdb = e_Pdb,
240  eSeqId_tpg = e_Tpg,
241  eSeqId_tpe = e_Tpe,
242  eSeqId_tpd = e_Tpd,
243  eSeqId_gpipe = e_Gpipe,
244  eSeqId_named_annot_track = e_Named_annot_track,
245 
246  // Mask for Seq_id type; allow 8 bits to be safe
247  eAcc_type_mask = 0xff,
248 
249  // Useful general flags (not inherent in choice of division,
250  // albeit not necessarily applicable to all divisions).
251  fAcc_nuc = 0x80000000,
252  fAcc_prot = 0x40000000,
253  fAcc_seq = 0,
254  fAcc_predicted = 0x20000000, // only for refseq
255  fAcc_specials = 0x10000000, // has special cases; only used internally
256  fAcc_master = 0x08000000,
257  fAcc_ncbo = 0x04000000, // for refseq pathogen detection pipeline
258  fAcc_fallback = 0x02000000, // is a fallback; only used internally
259  fAcc_vdb_only = 0x01000000,
260  eAcc_flag_mask = 0xff000000,
261 
262  // Divisions and categories (multiples of 1 << 8; always
263  // globally unique nowadays, no matter how specialized)
264  eAcc_other = 0 << 8, // no further classification
265  eAcc_est = 1 << 8, // expressed sequence tag (mRNA)
266  eAcc_dirsub = 2 << 8, // direct submission of anything
267  eAcc_div_patent = 3 << 8, // patented sequence
268  eAcc_mrna = 4 << 8, // non-EST mRNA or cDNA
269  eAcc_ncrna = 5 << 8, // non-coding RNA
270  eAcc_gsdb_ds = 6 << 8, // Genome Sequence DB direct submission
271  eAcc_gsdb = 7 << 8, // other GSDB record
272  eAcc_backbone = 8 << 8, // from journal scanning
273  eAcc_tsa = 9 << 8, // transcriptome shotgun assembly
274  eAcc_segset = 10 << 8, // seg-set header; might not be genomic
275  eAcc_gss = 11 << 8, // genome survey sequence (may be mRNA)
276  eAcc_unique = 12 << 8, // sequence data shared across records
277  eAcc_ambig = 112 << 8, // multiply assigned (!)
278  eAcc_ambig_g = 113 << 8, // assigned in GenBank
279  eAcc_ambig_e = 114 << 8, // assigned in EMBL
280  eAcc_ambig_ge = 115 << 8, // assigned in both GenBank and EMBL
281  eAcc_ambig_d = 116 << 8, // assigned in DDBJ
282  eAcc_ambig_gd = 117 << 8, // GenBank and DDBJ
283  eAcc_ambig_ed = 118 << 8, // EMBL and DDBJ
284  eAcc_ambig_ged = 119 << 8, // all three
285  eAcc_unreserved = 127 << 8, // not yet formally assigned
286  fAcc_genomic = 128 << 8, // genomic (flag; any of the below)
287  eAcc_genome = 128 << 8, // complete genome
288  eAcc_htgs = 129 << 8, // high-throughput genome sequence
289  eAcc_con = 130 << 8, // intermediate genomic assembly; contig
290  eAcc_wgs = 131 << 8, // whole-genome shotgun collection
291  // 132 was GSS, which isn't necessarily genomic after all. :-/
292  eAcc_chromosome = 133 << 8, // whole chromosome
293  eAcc_genomic_rgn = 134 << 8, // incomplete genomic region
294  eAcc_wgs_intermed = 135 << 8, // WGS intermediate assembly
295  eAcc_sts = 136 << 8, // sequence tagged site
296  eAcc_mga = 137 << 8, // Mass sequence for Genome Annotation
297  eAcc_optical_map = 138 << 8, // optical map
298  eAcc_targeted = 139 << 8, // targeted genomic project
299  eAcc_division_mask = 0xff00,
300 
301  // Internal values combining a division and a flag for convenience.
302  eAcc_wgs_master = eAcc_wgs | fAcc_master,
303  eAcc_wgs_intermed_master = eAcc_wgs_intermed | fAcc_master,
304  eAcc_tsa_master = eAcc_tsa | fAcc_master,
305  eAcc_targeted_master = eAcc_targeted | fAcc_master,
306  eAcc_wgs_vdb_only = eAcc_wgs | fAcc_vdb_only,
307  eAcc_wgs_intermed_vdb_only = eAcc_wgs_intermed | fAcc_vdb_only,
308  eAcc_tsa_vdb_only = eAcc_tsa | fAcc_vdb_only,
309  eAcc_targeted_vdb_only = eAcc_targeted | fAcc_vdb_only,
310  eAcc_wgs_vdb_master = eAcc_wgs | fAcc_master | fAcc_vdb_only,
311  eAcc_wgs_intermed_vdb_master
312  = eAcc_wgs_intermed | fAcc_master | fAcc_vdb_only,
313  eAcc_tsa_vdb_master = eAcc_tsa | fAcc_master | fAcc_vdb_only,
314  eAcc_targeted_vdb_master = eAcc_targeted | fAcc_master | fAcc_vdb_only,
315 
316  // Internal macro, left defined only temporarily
317 #define NCBI_ACC(type, div, mol) eSeqId_##type | eAcc_##div | fAcc_##mol
318  // Actual return values with EXAMPLE prefixes (to be followed
319  // by digits) or IDs, grouped by Seq-id type. In most cases,
320  // there are other prefixes with the same classification, and
321  // if not there could be in principle.
322  eAcc_unknown = NCBI_ACC(not_set, other, seq),
323  // Most N accessions are GenBank ESTs, but some low-numbered
324  // ones (now only used as secondary accessions) were assigned
325  // haphazardly, and some are therefore ambiguous.
326  eAcc_ambiguous_nuc = NCBI_ACC(not_set, ambig, nuc), // N0-N1
327  eAcc_maybe_gb = NCBI_ACC(not_set, ambig_g, nuc),
328  eAcc_maybe_embl = NCBI_ACC(not_set, ambig_e, nuc),
329  eAcc_maybe_ddbj = NCBI_ACC(not_set, ambig_d, nuc),
330  eAcc_gb_embl = NCBI_ACC(not_set, ambig_ge, nuc), // N00001
331  eAcc_gb_ddbj = NCBI_ACC(not_set, ambig_gd, nuc), // N00006
332  eAcc_embl_ddbj = NCBI_ACC(not_set, ambig_ed, nuc), // N00070
333  eAcc_gb_embl_ddbj = NCBI_ACC(not_set, ambig_ged, nuc), // N00005
334  eAcc_unreserved_nuc = NCBI_ACC(not_set, unreserved, nuc), // XY
335  eAcc_unreserved_prot = NCBI_ACC(not_set, unreserved, prot), // XYZ
336 
337  eAcc_local = NCBI_ACC(local, other, seq),
338  eAcc_gibbsq = NCBI_ACC(gibbsq, other, seq),
339  eAcc_gibbmt = NCBI_ACC(gibbmt, other, seq),
340  eAcc_giim = NCBI_ACC(giim, other, seq),
341 
342  // NB: eAcc_gb_patent and eAcc_gb_segset are only *mostly* nucleotide,
343  // and eAcc_gb_mga is so far unused.
344  eAcc_gb_other = NCBI_ACC(genbank, other, seq),
345  eAcc_gb_prot = NCBI_ACC(genbank, other, prot), // AAA
346  eAcc_gb_other_nuc = NCBI_ACC(genbank, other, nuc), // AS
347  eAcc_gb_est = NCBI_ACC(genbank, est, nuc), // H
348  eAcc_gb_dirsub = NCBI_ACC(genbank, dirsub, nuc), // U
349  eAcc_gb_patent = NCBI_ACC(genbank, div_patent, seq), // I
350  eAcc_gb_patent_prot = NCBI_ACC(genbank, div_patent, prot), // AAE
351  eAcc_gb_cdna = NCBI_ACC(genbank, mrna, nuc), // BC
352  eAcc_gsdb_dirsub = NCBI_ACC(genbank, gsdb_ds, nuc), // J
353  eAcc_gb_gsdb = NCBI_ACC(genbank, gsdb, nuc), // AD
354  eAcc_gb_backbone = NCBI_ACC(genbank, backbone, nuc), // S
355  eAcc_gb_tsa_nuc = NCBI_ACC(genbank, tsa, nuc), // EZ
356  eAcc_gb_tsa_prot = NCBI_ACC(genbank, tsa, prot), // JAA
357  eAcc_gb_tsam_nuc = NCBI_ACC(genbank, tsa_master, nuc),
358  eAcc_gb_tsam_prot = NCBI_ACC(genbank, tsa_master, prot),
359  eAcc_gb_tsav_nuc = NCBI_ACC(genbank, tsa_vdb_only, nuc),
360  eAcc_gb_tsav_prot = NCBI_ACC(genbank, tsa_vdb_only, prot),
361  eAcc_gb_tsavm_nuc = NCBI_ACC(genbank, tsa_vdb_master, nuc),
362  eAcc_gb_tsavm_prot = NCBI_ACC(genbank, tsa_vdb_master, prot),
363  eAcc_gb_segset = NCBI_ACC(genbank, segset, seq), // AH
364  eAcc_gb_gss = NCBI_ACC(genbank, gss, nuc), // B
365  eAcc_gb_genome = NCBI_ACC(genbank, genome, nuc), // AE
366  eAcc_gb_htgs = NCBI_ACC(genbank, htgs, nuc), // AC
367  eAcc_gb_con = NCBI_ACC(genbank, con, nuc), // CH
368  eAcc_gb_wgs_nuc = NCBI_ACC(genbank, wgs, nuc), // AAAA
369  eAcc_gb_wgs_prot = NCBI_ACC(genbank, wgs, prot), // EAA
370  eAcc_gb_wgsm_nuc = NCBI_ACC(genbank, wgs_master, nuc),
371  eAcc_gb_wgsm_prot = NCBI_ACC(genbank, wgs_master, prot),
372  eAcc_gb_wgsv_nuc = NCBI_ACC(genbank, wgs_vdb_only, nuc),
373  eAcc_gb_wgsv_prot = NCBI_ACC(genbank, wgs_vdb_only, prot),
374  eAcc_gb_wgsvm_nuc = NCBI_ACC(genbank, wgs_vdb_master, nuc),
375  eAcc_gb_wgsvm_prot = NCBI_ACC(genbank, wgs_vdb_master, prot),
376  eAcc_gb_chromosome = NCBI_ACC(genbank, chromosome, nuc), // CM
377  eAcc_gb_sts = NCBI_ACC(genbank, sts, nuc), // G
378  eAcc_gb_mga = NCBI_ACC(genbank, mga, nuc),
379  eAcc_gb_optical_map = NCBI_ACC(genbank, optical_map, nuc), // MAP_
380  eAcc_gb_targeted_nuc = NCBI_ACC(genbank, targeted, nuc), // KAAA
381 
382  eAcc_embl_other = NCBI_ACC(embl, other, seq),
383  eAcc_embl_prot = NCBI_ACC(embl, other, prot), // CAA
384  eAcc_embl_other_nuc = NCBI_ACC(embl, other, nuc), // AL
385  eAcc_embl_est = NCBI_ACC(embl, est, nuc), // F
386  eAcc_embl_dirsub = NCBI_ACC(embl, dirsub, nuc), // V
387  eAcc_embl_patent = NCBI_ACC(embl, div_patent, nuc), // A
388  eAcc_embl_tsa_nuc = NCBI_ACC(embl, tsa, nuc), // HAAA
389  eAcc_embl_tsa_prot = NCBI_ACC(embl, tsa, prot), // unused
390  eAcc_embl_tsam_nuc = NCBI_ACC(embl, tsa_master, nuc),
391  eAcc_embl_tsam_prot = NCBI_ACC(embl, tsa_master, prot),
392  eAcc_embl_tsav_nuc = NCBI_ACC(embl, tsa_vdb_only, nuc),
393  eAcc_embl_tsav_prot = NCBI_ACC(embl, tsa_vdb_only, prot),
394  eAcc_embl_tsavm_nuc = NCBI_ACC(embl, tsa_vdb_master, nuc),
395  eAcc_embl_tsavm_prot = NCBI_ACC(embl, tsa_vdb_master, prot),
396  eAcc_embl_gss = NCBI_ACC(embl, gss, nuc), // AJ864682
397  eAcc_embl_genome = NCBI_ACC(embl, genome, nuc), // unused
398  eAcc_embl_htgs = NCBI_ACC(embl, htgs, nuc), // unused
399  eAcc_embl_con = NCBI_ACC(embl, con, nuc), // AN
400  eAcc_embl_wgs_nuc = NCBI_ACC(embl, wgs, nuc), // CAAA
401  eAcc_embl_wgs_prot = NCBI_ACC(embl, wgs, prot), // unused
402  eAcc_embl_wgsm_nuc = NCBI_ACC(embl, wgs_master, nuc),
403  eAcc_embl_wgsm_prot = NCBI_ACC(embl, wgs_master, prot),
404  eAcc_embl_wgsv_nuc = NCBI_ACC(embl, wgs_vdb_only, nuc),
405  eAcc_embl_wgsv_prot = NCBI_ACC(embl, wgs_vdb_only, prot),
406  eAcc_embl_wgsvm_nuc = NCBI_ACC(embl, wgs_vdb_master, nuc),
407  eAcc_embl_wgsvm_prot = NCBI_ACC(embl, wgs_vdb_master, prot),
408  eAcc_embl_mga = NCBI_ACC(embl, mga, nuc), // unused
409 
410  eAcc_pir = NCBI_ACC(pir, other, prot),
411  eAcc_swissprot = NCBI_ACC(swissprot, other, prot), // P
412  eAcc_patent = NCBI_ACC(patent, div_patent, seq),
413 
414  eAcc_refseq_prot = NCBI_ACC(refseq, other, prot),//NP_
415  eAcc_refseq_mrna = NCBI_ACC(refseq, mrna, nuc), //NM_
416  eAcc_refseq_ncrna = NCBI_ACC(refseq, ncrna, nuc), //NR_
417  eAcc_refseq_unique_prot = NCBI_ACC(refseq, unique, prot),//WP_
418  eAcc_refseq_unreserved = NCBI_ACC(refseq, unreserved, seq), //AA_
419  eAcc_refseq_genome = NCBI_ACC(refseq, genome, nuc), //NS_
420  eAcc_refseq_contig = NCBI_ACC(refseq, con, nuc), //NT_
421  eAcc_refseq_wgs_nuc = NCBI_ACC(refseq, wgs, nuc), //NZ_
422  eAcc_refseq_wgs_prot = NCBI_ACC(refseq, wgs, prot),//ZP_
423  eAcc_refseq_wgsm_nuc = NCBI_ACC(refseq, wgs_master, nuc),
424  eAcc_refseq_wgsm_prot = NCBI_ACC(refseq, wgs_master, prot),
425  eAcc_refseq_wgsv_nuc = NCBI_ACC(refseq, wgs_vdb_only, nuc),
426  eAcc_refseq_wgsv_prot = NCBI_ACC(refseq, wgs_vdb_only, prot),
427  eAcc_refseq_wgsvm_nuc = NCBI_ACC(refseq, wgs_vdb_master, nuc),
428  eAcc_refseq_wgsvm_prot = NCBI_ACC(refseq, wgs_vdb_master, prot),
429  eAcc_refseq_chromosome = NCBI_ACC(refseq, chromosome, nuc), //NC_
430  eAcc_refseq_genomic = NCBI_ACC(refseq, genomic_rgn, nuc), //NG_
431  eAcc_refseq_wgs_intermed = NCBI_ACC(refseq, wgs_intermed, nuc), //NW_
432  eAcc_refseq_wgsm_intermed = NCBI_ACC(refseq, wgs_intermed_master,
433  nuc),
434  eAcc_refseq_wgsv_intermed = NCBI_ACC(refseq, wgs_intermed_vdb_only,
435  nuc),
436  eAcc_refseq_wgsvm_intermed = NCBI_ACC(refseq, wgs_intermed_vdb_master,
437  nuc),
438  eAcc_refseq_prot_predicted = eAcc_refseq_prot | fAcc_predicted, //XP_
439  eAcc_refseq_mrna_predicted = eAcc_refseq_mrna | fAcc_predicted, //XM_
440  eAcc_refseq_ncrna_predicted = eAcc_refseq_ncrna | fAcc_predicted, //XR_
441  eAcc_refseq_chromosome_ncbo = eAcc_refseq_chromosome | fAcc_ncbo, //WC_
442  eAcc_refseq_contig_ncbo = eAcc_refseq_contig | fAcc_ncbo, //WT_
443 
444  eAcc_general = NCBI_ACC(general, other, seq),
445  eAcc_general_nuc = NCBI_ACC(general, other, nuc), // TRACE_ASSM
446  eAcc_general_prot = NCBI_ACC(general, other, prot),
447 
448  eAcc_gi = NCBI_ACC(gi, other, seq),
449 
450  eAcc_ddbj_other = NCBI_ACC(ddbj, other, seq),
451  eAcc_ddbj_prot = NCBI_ACC(ddbj, other, prot), // BAA
452  eAcc_ddbj_other_nuc = NCBI_ACC(ddbj, other, nuc),//N00028
453  eAcc_ddbj_est = NCBI_ACC(ddbj, est, nuc), // C
454  eAcc_ddbj_dirsub = NCBI_ACC(ddbj, dirsub, nuc), // D
455  eAcc_ddbj_patent = NCBI_ACC(ddbj, div_patent, nuc), // E
456  eAcc_ddbj_mrna = NCBI_ACC(ddbj, mrna, nuc), // AK
457  eAcc_ddbj_tsa_nuc = NCBI_ACC(ddbj, tsa, nuc), // FX
458  eAcc_ddbj_tsa_prot = NCBI_ACC(ddbj, tsa, prot), // LAA
459  eAcc_ddbj_tsam_nuc = NCBI_ACC(ddbj, tsa_master, nuc),
460  eAcc_ddbj_tsam_prot = NCBI_ACC(ddbj, tsa_master, prot),
461  eAcc_ddbj_tsav_nuc = NCBI_ACC(ddbj, tsa_vdb_only, nuc),
462  eAcc_ddbj_tsav_prot = NCBI_ACC(ddbj, tsa_vdb_only, prot),
463  eAcc_ddbj_tsavm_nuc = NCBI_ACC(ddbj, tsa_vdb_master, nuc),
464  eAcc_ddbj_tsavm_prot = NCBI_ACC(ddbj, tsa_vdb_master, prot),
465  eAcc_ddbj_gss = NCBI_ACC(ddbj, gss, nuc), // AG
466  eAcc_ddbj_genome = NCBI_ACC(ddbj, genome, nuc), // AP
467  eAcc_ddbj_htgs = NCBI_ACC(ddbj, htgs, nuc), // {}
468  eAcc_ddbj_con = NCBI_ACC(ddbj, con, nuc), // BA
469  eAcc_ddbj_wgs_nuc = NCBI_ACC(ddbj, wgs, nuc), // BAAA
470  eAcc_ddbj_wgs_prot = NCBI_ACC(ddbj, wgs, prot), // GAA
471  eAcc_ddbj_wgsm_nuc = NCBI_ACC(ddbj, wgs_master, nuc),
472  eAcc_ddbj_wgsm_prot = NCBI_ACC(ddbj, wgs_master, prot),
473  eAcc_ddbj_wgsv_nuc = NCBI_ACC(ddbj, wgs_vdb_only, nuc),
474  eAcc_ddbj_wgsv_prot = NCBI_ACC(ddbj, wgs_vdb_only, prot),
475  eAcc_ddbj_wgsvm_nuc = NCBI_ACC(ddbj, wgs_vdb_master, nuc),
476  eAcc_ddbj_wgsvm_prot = NCBI_ACC(ddbj, wgs_vdb_master, prot),
477  eAcc_ddbj_mga = NCBI_ACC(ddbj, mga, nuc), //AAAAA
478  eAcc_ddbj_targeted_nuc = NCBI_ACC(ddbj, targeted, nuc), // TAAA
479  eAcc_ddbj_targetedm_nuc = NCBI_ACC(ddbj, targeted_master, nuc),
480  eAcc_ddbj_targetedv_nuc = NCBI_ACC(ddbj, targeted_vdb_only, nuc),
481  eAcc_ddbj_targetedvm_nuc = NCBI_ACC(ddbj, targeted_vdb_master, nuc),
482 
483  eAcc_prf = NCBI_ACC(prf, other, prot),
484  eAcc_pdb = NCBI_ACC(pdb, other, seq), // not necessarily protein!
485 
486  eAcc_gb_tpa_other = NCBI_ACC(tpg, other, seq),
487  eAcc_gb_tpa_nuc = NCBI_ACC(tpg, other, nuc), // BK
488  eAcc_gb_tpa_prot = NCBI_ACC(tpg, other, prot), // DAA
489  eAcc_gb_tpa_segset = NCBI_ACC(tpg, segset, nuc), // BL
490  eAcc_gb_tpa_con = NCBI_ACC(tpg, con, nuc), // GJ
491  eAcc_gb_tpa_wgs_nuc = NCBI_ACC(tpg, wgs, nuc), // DAAA
492  eAcc_gb_tpa_wgs_prot = NCBI_ACC(tpg, wgs, prot),
493  eAcc_gb_tpa_wgsm_nuc = NCBI_ACC(tpg, wgs_master, nuc),
494  eAcc_gb_tpa_wgsm_prot = NCBI_ACC(tpg, wgs_master, prot),
495  eAcc_gb_tpa_wgsv_nuc = NCBI_ACC(tpg, wgs_vdb_only, nuc),
496  eAcc_gb_tpa_wgsv_prot = NCBI_ACC(tpg, wgs_vdb_only, prot), // HAA
497  eAcc_gb_tpa_wgsvm_nuc = NCBI_ACC(tpg, wgs_vdb_master, nuc),
498  eAcc_gb_tpa_wgsvm_prot = NCBI_ACC(tpg, wgs_vdb_master, prot),
499  eAcc_gb_tpa_chromosome = NCBI_ACC(tpg, chromosome, nuc), // GK
500 
501  eAcc_embl_tpa_other = NCBI_ACC(tpe, other, seq),
502  eAcc_embl_tpa_nuc = NCBI_ACC(tpe, other, nuc), // BN
503  eAcc_embl_tpa_prot = NCBI_ACC(tpe, other, prot), // CAD29848
504  eAcc_embl_tpa_tsa_nuc = NCBI_ACC(tpe, tsa, nuc),
505  eAcc_embl_tpa_tsa_prot = NCBI_ACC(tpe, tsa, prot),
506  eAcc_embl_tpa_tsam_nuc = NCBI_ACC(tpe, tsa_master, nuc),
507  eAcc_embl_tpa_tsam_prot = NCBI_ACC(tpe, tsa_master, prot),
508  eAcc_embl_tpa_tsav_nuc = NCBI_ACC(tpe, tsa_vdb_only, nuc),
509  eAcc_embl_tpa_tsav_prot = NCBI_ACC(tpe, tsa_vdb_only, prot),
510  eAcc_embl_tpa_tsavm_nuc = NCBI_ACC(tpe, tsa_vdb_master, nuc),
511  eAcc_embl_tpa_tsavm_prot = NCBI_ACC(tpe, tsa_vdb_master, prot),
512  eAcc_embl_tpa_wgs_nuc = NCBI_ACC(tpe, wgs, nuc), // {}
513  eAcc_embl_tpa_wgs_prot = NCBI_ACC(tpe, wgs, prot), // {}
514  eAcc_embl_tpa_wgsm_nuc = NCBI_ACC(tpe, wgs_master, nuc),
515  eAcc_embl_tpa_wgsm_prot = NCBI_ACC(tpe, wgs_master, prot),
516  eAcc_embl_tpa_wgsv_nuc = NCBI_ACC(tpe, wgs_vdb_only, nuc),
517  eAcc_embl_tpa_wgsv_prot = NCBI_ACC(tpe, wgs_vdb_only, prot),
518  eAcc_embl_tpa_wgsvm_nuc = NCBI_ACC(tpe, wgs_vdb_master, nuc),
519  eAcc_embl_tpa_wgsvm_prot = NCBI_ACC(tpe, wgs_vdb_master, prot),
520 
521  eAcc_ddbj_tpa_other = NCBI_ACC(tpd, other, seq),
522  eAcc_ddbj_tpa_nuc = NCBI_ACC(tpd, other, nuc), //BR
523  eAcc_ddbj_tpa_prot = NCBI_ACC(tpd, other, prot),//FAA
524  eAcc_ddbj_tpa_tsa_nuc = NCBI_ACC(tpd, tsa, nuc), //YAAA
525  eAcc_ddbj_tpa_tsa_prot = NCBI_ACC(tpd, tsa, prot),//{}
526  eAcc_ddbj_tpa_tsam_nuc = NCBI_ACC(tpd, tsa_master, nuc),
527  eAcc_ddbj_tpa_tsam_prot = NCBI_ACC(tpd, tsa_master, prot),
528  eAcc_ddbj_tpa_tsav_nuc = NCBI_ACC(tpd, tsa_vdb_only, nuc),
529  eAcc_ddbj_tpa_tsav_prot = NCBI_ACC(tpd, tsa_vdb_only, prot),
530  eAcc_ddbj_tpa_tsavm_nuc = NCBI_ACC(tpd, tsa_vdb_master, nuc),
531  eAcc_ddbj_tpa_tsavm_prot = NCBI_ACC(tpd, tsa_vdb_master, prot),
532  eAcc_ddbj_tpa_con = NCBI_ACC(tpd, con, nuc), //HT
533  eAcc_ddbj_tpa_wgs_nuc = NCBI_ACC(tpd, wgs, nuc), //EAAA
534  eAcc_ddbj_tpa_wgs_prot = NCBI_ACC(tpd, wgs, prot),//IAA
535  eAcc_ddbj_tpa_wgsm_nuc = NCBI_ACC(tpd, wgs_master, nuc),
536  eAcc_ddbj_tpa_wgsm_prot = NCBI_ACC(tpd, wgs_master, prot),
537  eAcc_ddbj_tpa_wgsv_nuc = NCBI_ACC(tpd, wgs_vdb_only, nuc),
538  eAcc_ddbj_tpa_wgsv_prot = NCBI_ACC(tpd, wgs_vdb_only, prot),
539  eAcc_ddbj_tpa_wgsvm_nuc = NCBI_ACC(tpd, wgs_vdb_master, nuc),
540  eAcc_ddbj_tpa_wgsvm_prot = NCBI_ACC(tpd, wgs_vdb_master, prot),
541  eAcc_ddbj_tpa_chromosome = NCBI_ACC(tpd, chromosome, nuc), //HU
542  eAcc_ddbj_tpa_targeted_nuc = NCBI_ACC(tpd, targeted, nuc), //ZAAA
543  eAcc_ddbj_tpa_targetedm_nuc = NCBI_ACC(tpd, targeted_master, nuc),
544  eAcc_ddbj_tpa_targetedv_nuc = NCBI_ACC(tpd, targeted_vdb_only, nuc),
545  eAcc_ddbj_tpa_targetedvm_nuc = NCBI_ACC(tpd, targeted_vdb_master, nuc),
546 
547  // genome pipeline, modeled after RefSeq
548  eAcc_gpipe_other_nuc = NCBI_ACC(gpipe, other, nuc), // GPN_
549  eAcc_gpipe_prot = NCBI_ACC(gpipe, other, prot), // GPP_
550  eAcc_gpipe_scaffold = NCBI_ACC(gpipe, con, nuc), // GPS_
551  eAcc_gpipe_mrna = NCBI_ACC(gpipe, mrna, nuc), // GPM_
552  eAcc_gpipe_chromosome = NCBI_ACC(gpipe, chromosome, nuc), // GPC_
553  eAcc_gpipe_genomic = NCBI_ACC(gpipe, genomic_rgn, nuc), // GPG_
554  eAcc_gpipe_ncrna = NCBI_ACC(gpipe, ncrna, nuc), // GPR_
555  eAcc_gpipe_unreserved = NCBI_ACC(gpipe, unreserved, seq), // GPX_
556 
557  // named annotation track; mixed nucleotides and proteins
558  eAcc_named_annot_track = NCBI_ACC(named_annot_track, other, seq) // AT_
559 #undef NCBI_ACC
560  };
561 
563  { return static_cast<E_Choice>(info & eAcc_type_mask); }
564 
565  /// Deduces information from a bare accession a la WHICH_db_accession;
566  /// may report false negatives on properties.
567  static EAccessionInfo IdentifyAccession(const CTempString& accession, // static form: classifies the given string
568  TParseFlags flags = fParse_AnyRaw);
569  EAccessionInfo IdentifyAccession(TParseFlags flags // member form: takes no accession argument
570  = fParse_AnyRaw | fParse_AnyLocal) const; // default flags additionally include fParse_AnyLocal
571 
572  static void LoadAccessionGuide(const string& filename); // overload taking a file name; NOTE(review): presumably feeds IdentifyAccession -- confirm
573  static void LoadAccessionGuide(ILineReader& in); // overload reading from an ILineReader
574 
575  /// Match() - returns true if this Seq-id and sid2 are equivalent
576  bool Match(const CSeq_id& sid2) const;
577 
578  /// Compare return values
579  enum E_SIC {
580  e_error = 0, ///< some problem
581  e_DIFF, ///< different SeqId types-can't compare
582  e_NO, ///< SeqIds compared, but are different
583  e_YES ///< SeqIds compared, are equivalent
584  };
585 
586  /// Compare() - more general than Match(); the result is an E_SIC value
587  E_SIC Compare(const CSeq_id& sid2) const;
588  int CompareOrdered(const CSeq_id& sid2) const; // integer result; a negative value means "less than" (see operator<)
589  bool operator<(const CSeq_id& sid2) const // ordering defined in terms of CompareOrdered()
590  {
591  return CompareOrdered(sid2) < 0;
592  }
593 
594  /// Return embedded CTextseq_id, if any (presumably null otherwise -- TODO confirm)
595  const CTextseq_id* GetTextseq_Id(void) const;
596 
597  /// Implement serializable interface
598  virtual void WriteAsFasta(ostream& out) const; // writes to the supplied stream
599  CProxy DumpAsFasta(void) const { return Dump(eAsFasta); } // thin wrapper around Dump(eAsFasta)
600  const string AsFastaString(void) const; // string-returning counterpart (returned by value)
601 
602  /// Kinds of label that GetLabel() can be asked to produce
603  enum ELabelType {
604  eType, ///< FASTA-style type, or database in GeneralDbIsContent mode.
605  eContent, ///< Untagged human-readable accession or the like.
606  eBoth, ///< Type and content, delimited by a vertical bar.
607  eFasta, ///< Tagged ID in NCBI's traditional FASTA style.
608  eFastaContent, ///< Like eFasta, but without any tag.
609 
610  /// default is to show type + content
611  eDefault = eBoth
612  };
613 
614  enum ELabelFlags { // Bit flags fine-tuning the labels produced by GetLabel().
615  fLabel_Version = 0x10, ///< Show the version
616  /// For type general, use the database name as the tag
617  /// and the (text or numeric) key as the content.
618  fLabel_GeneralDbIsContent = 0x20,
619  fLabel_Trimmed = 0x40, ///< Trim trailing FASTA delimiters.
620  fLabel_UpperCase = 0x80, ///< Upper case label, with special encoding for PDB chain-ids
621 
622  /// default options - always show the version
623  fLabel_Default = fLabel_Version
624  };
625  typedef int TLabelFlags; // integer type used for flags arguments defaulting to fLabel_Default
626  /// Append a label for this Seq-id to the supplied string.
627  /// @param label
628  /// String to append to.
629  /// @param type
630  /// Type of label (human-readable type-tagged content, by default).
631  /// @param flags
632  /// Flags fine-tuning behavior for human-readable output (ignored
633  /// in eFasta and eFastaContent mode).
634  /// @sa ELabelType, ELabelFlags
635  void GetLabel(string* label,
637  TLabelFlags flags = fLabel_Default) const;
638  /// Append a label for this Seq-id to the supplied string, splitting
639  /// out the version to a separate output parameter.
640  /// @note In eFasta and eFastaContent mode, this method includes the
641  /// version (if any) in the label and does not touch *version.
642  /// @param label
643  /// String to append to.
644  /// @param version
645  /// Pointer to hold the returned version.
646  /// @param type
647  /// Type of label (eDefault, which equals eBoth, unless specified).
648  /// @sa ELabelType
649  void GetLabel(string* label,
650  int* version,
651  ELabelType type = eDefault) const;
652 
653  /// Return Seq-id string for text Seq-id types; the version is included only on request (off by default)
654  string GetSeqIdString(bool with_version = false) const;
655 
656  /// Return Seq-id string for text Seq-id types, with the version as a separate integer via *version
657  string GetSeqIdString(int* version) const;
658 
659  /// Get a string representation of the sequence IDs of a given bioseq. This
660  /// function produces strings in a number of possible formats.
662  eFormat_FastA, // FastA format
663  eFormat_ForceGI, // GI only, in FastA format
664  eFormat_BestWithoutVersion, // 'Best' accession, without the version
665  eFormat_BestWithVersion // 'Best' accession, with version
666  };
667  static string GetStringDescr(const CBioseq& bioseq, EStringFormat fmt); // fmt selects the output format
668 
669  /// Write a bioseq's IDs in FASTA format (static overload)
670  /// @param ostr
671  /// Stream to write to
672  /// @param bioseq
673  /// Bioseq to get IDs from
674  /// @return
675  /// The stream that was passed in, after all writes occurred
676  static CNcbiOstream& WriteAsFasta(CNcbiOstream& ostr,
677  const CBioseq& bioseq);
678 
679  /// Perform rudimentary validation on potential local IDs, whose
680  /// contents should be pure ASCII and limited to letters, digits,
681  /// and certain punctuation characters (-_.:*# as of August 2010).
682  static bool IsValidLocalID(const CTempString& s); // yes/no answer only
683 
684  /// Perform rudimentary validation on potential local IDs, whose
685  /// contents should not exceed fifty characters and are limited
686  /// to ASCII characters excluding >[]|\""
687  static TErrorFlags CheckLocalID(const CTempString& s); // reports the outcome as a TErrorFlags value
688 
689  /// Parse a string representing one or more Seq-ids, appending the
690  /// results to IDS. Multiple IDs must appear in FASTA style.
691  /// @param ids
692  /// Destination ID set. Existing contents will be preserved and
693  /// appended to.
694  /// @param s
695  /// Input string to parse.
696  /// @param flags
697  /// How to interpret anything other than well-formed FASTA IDs.
698  /// @return
699  /// The number of IDs successfully parsed.
700  static SIZE_TYPE ParseIDs(CBioseq::TId& ids, const CTempString& s,
701  TParseFlags flags = fParse_Default);
702 
703  static bool IsValid(const CBioseq::TId& ids, TParseFlags flags = fParse_Default); // overload for a whole ID set; same default flags as ParseIDs
704  static bool IsValid(const CSeq_id& id, TParseFlags flags = fParse_Default); // overload for a single ID
705 
706  /// Parse an entire set of |-delimited FASTA-style IDs, appending
707  /// the results to IDS.
708  /// @param ids
709  /// Destination ID set. Existing contents will be preserved and
710  /// appended to.
711  /// @param s
712  /// Input string to parse.
713  /// @param allow_partial_failure
714  /// If s contains invalid IDs, warn about them and try to
715  /// process the remainder of the string, rather than throwing
716  /// any exceptions.
717  /// @return
718  /// The number of IDs successfully parsed.
719  static SIZE_TYPE ParseFastaIds(CBioseq::TId& ids, const CTempString& s,
720  bool allow_partial_failure = false);
721 
722  /// Numerical quality ranking; lower is better. (Text)Score, aka
723  /// WorstRank, corresponds to the C Toolkit's SeqIdFindWorst,
724  /// which favors textual accessions, whereas BestRank corresponds
725  /// to the C Toolkit's SeqIdFindBest and favors GIs. In addition,
726  /// there is a pair of methods corresponding to the C Toolkit's
727  /// GetOrderBySeqId, used when generating FASTA deflines.
728  ///
729  /// All rankings give a slight bonus to accessions that carry
730  /// versions.
731 
732  enum EMaxScore {
733  kMaxScore = 99999
734  };
735 
737  fRequireAccessions = 1 << 0
738  };
740 
741  int AdjustScore (int base_score,
742  TAdjustScoreFlags flags = TAdjustScoreFlags())
743  const; // flags defaults to a value-initialized TAdjustScoreFlags
744  int StrictAdjustScore (int base_score) const
745  { return AdjustScore(base_score, fRequireAccessions); } // same adjustment with fRequireAccessions passed
746  int BaseTextScore (void) const;
747  int BaseBestRankScore (void) const;
748  int BaseWorstRankScore(void) const { return BaseTextScore(); } // forwards to BaseTextScore()
749  int BaseFastaAAScore (void) const;
750  int BaseFastaNAScore (void) const;
751  int BaseBlastScore (void) const;
752 
753  int TextScore (void) const { return AdjustScore(BaseTextScore()); } // each *Score below is the matching Base*Score passed through AdjustScore with default flags
754  int BestRankScore (void) const { return AdjustScore(BaseBestRankScore()); }
755  int WorstRankScore(void) const { return TextScore(); } // forwards to TextScore()
756  int FastaAAScore (void) const { return AdjustScore(BaseFastaAAScore()); }
757  int FastaNAScore (void) const { return AdjustScore(BaseFastaNAScore()); }
758  int BlastScore (void) const { return AdjustScore(BaseBlastScore()); }
759 
760  int StrictTextScore (void) const // each Strict*Score is the matching Base*Score passed through StrictAdjustScore
761  { return StrictAdjustScore(BaseTextScore()); }
762  int StrictBestRankScore (void) const
763  { return StrictAdjustScore(BaseBestRankScore()); }
764  int StrictFastaAAScore (void) const
765  { return StrictAdjustScore(BaseFastaAAScore()); }
766  int StrictFastaNAScore (void) const
767  { return StrictAdjustScore(BaseFastaNAScore()); }
768  int StrictBlastScore (void) const
769  { return StrictAdjustScore(BaseBlastScore()); }
770 
771  /// Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>; a null CRef yields kMax_Int.
772  static int Score(const CRef<CSeq_id>& id)
773  { return id ? id->TextScore() : kMax_Int; }
774  static int BestRank(const CRef<CSeq_id>& id)
775  { return id ? id->BestRankScore() : kMax_Int; }
776  static int WorstRank(const CRef<CSeq_id>& id)
777  { return Score(id); } // WorstRank simply forwards to Score
778  static int FastaAARank(const CRef<CSeq_id>& id)
779  { return id ? id->FastaAAScore() : kMax_Int; }
780  static int FastaNARank(const CRef<CSeq_id>& id)
781  { return id ? id->FastaNAScore() : kMax_Int; }
782  static int BlastRank(const CRef<CSeq_id>& id)
783  { return id ? id->BlastScore() : kMax_Int; }
784 
785  static int StrictScore(const CRef<CSeq_id>& id) // Strict* wrappers call the matching Strict*Score method; null CRef again yields kMax_Int
786  { return id ? id->StrictTextScore() : kMax_Int; }
787  static int StrictBestRank(const CRef<CSeq_id>& id)
788  { return id ? id->StrictBestRankScore() : kMax_Int; }
789  static int StrictFastaAARank(const CRef<CSeq_id>& id)
790  { return id ? id->StrictFastaAAScore() : kMax_Int; }
791  static int StrictFastaNARank(const CRef<CSeq_id>& id)
792  { return id ? id->StrictFastaNAScore() : kMax_Int; }
793  static int StrictBlastRank(const CRef<CSeq_id>& id)
794  { return id ? id->StrictBlastScore() : kMax_Int; }
795 
796  /// Optimized implementation of CSerialObject::Assign, which is
797  /// not so efficient.
798  virtual void Assign(const CSerialObject& source,
800 
802  /// Collect partially matching seq-ids: no-version, no-name etc.
803  /// The original id is not added to the set.
804  void GetMatchingIds(TSeqIdHandles& matches) const;
805  /// Collect partially matching textseq-ids.
806  /// @sa GetMatchingIds
807  void GetMatchingTextseqIds(TSeqIdHandles& matches) const;
808 
809  /// Check if the option to prefer accession.version over GI is enabled
810  /// (SeqId/PreferAccessionOverGi or SEQ_ID_PREFER_ACCESSION_OVER_GI).
811  static bool PreferAccessionOverGi(void);
812  /// Check if the option to avoid GI ids is enabled
813  /// (SeqId/AvoidGi or SEQ_ID_AVOID_GI).
814  static bool AvoidGi(void);
815 
816  /// Flags specifying special treatment for certain types of Seq-ids in
817  /// ComposeOSLT().
818  /// @sa ComposeOSLT
820  fAllowLocalId = (1 << 0), ///<
821  fGpipeAddSecondary = (1 << 1) ///< Add "ACC.VER(=1)" for a 2ndary id
822  };
823  typedef int TComposeOSLTFlags;
824 
825  /// JIRA ID-5188 : Compose OSLT string for the primary id, as well as OSLT
826  /// strings for the secondary ids, if any.
827  /// NB: given a single Seq-id, it is not always possible to determine
828  /// whether it should be treated as primary or secondary if it were part of
829  /// a list of Seq-ids in a Bioseq. In that case, this function returns it as
830  /// primary, and the final judgement needs to be made by the caller.
831  /// @param secondary_ids
832  /// OSLT strings for the secondary ids
833  /// @param parse_flags
834  /// Flags specifying special treatment for certain types of Seq-ids.
835  /// @return
836  /// OSLT string for the primary id
837  string ComposeOSLT(list<string>* secondary_ids = nullptr,
838  TComposeOSLTFlags parse_flags = 0) const;
839 
840  /// ID length restrictions
841  const static size_t kMaxLocalIDLength = 50;
842  const static size_t kMaxGeneralDBLength = 20;
843  const static size_t kMaxGeneralTagLength = 50;
844  const static size_t kMaxAccessionLength = 30;
845 
846  /// SNP annotation scale limits
848  eSNPScaleLimit_Default, // Use server defaults
852  eSNPScaleLimit_Chromosome
853  };
854 
855  static const char* GetSNPScaleLimit_Name(ESNPScaleLimit value);
856  static ESNPScaleLimit GetSNPScaleLimit_Value(const string& name);
857  bool IsAllowedSNPScaleLimit(ESNPScaleLimit scale_limit) const;
858 
859 private:
862  eTV_tr, // variant of sp
863  eTV_pgp // variant of pat
864  };
865 
866  static ETypeVariant x_IdentifyTypeVariant(E_Choice type,
867  const CTempString& str);
868 
869  // returns next type if determined along the way
870  E_Choice x_Init(list<CTempString>& fasta_pieces, E_Choice type,
871  ETypeVariant tv);
872 
873  // Prohibit copy constructor & assignment operator
874  CSeq_id(const CSeq_id&);
876 
877  static EAccessionInfo x_IdentifyAccession(const CTempString& main_acc,
879  bool has_version);
880 
881  void x_WriteContentAsFasta(ostream& out) const;
882 
883  //CRef<CAbstractObjectManager> m_ObjectManager;
884 
885 };
886 
887 
888 /////////////////////////////////////////////////////////////////////////////
889 ///
890 /// SSeqIdRange --
891 ///
892 /// Represent a range of contiguous INSD-style accessions.
893 
895 {
896  enum EFlags {
897  fAllowUnderscores = 0x1 ///< Allow prefixes to contain underscores.
898  };
899  typedef int TFlags; ///< binary OR of EFlags
900 
901  explicit SSeqIdRange(const CTempString& s, TFlags flags = 0);
902 
903 #ifndef NCBI_SWIG
905  {
906  public:
907  typedef random_access_iterator_tag iterator_category;
908  typedef string value_type;
909  typedef ptrdiff_t difference_type;
910  typedef const string* pointer;
911  typedef string& reference;
912 
914  : m_Range(&range), m_Number(range.start)
915  { }
916 
918  : m_Range(&range), m_Number(number)
919  { _ASSERT(number >= range.start && number <= range.stop + 1); }
920 
922  : m_Range(it.m_Range), m_Number(it.m_Number)
923  { }
924 
926  { m_Range = it.m_Range; m_Number = it.m_Number; return *this; }
927 
928  string operator* (void) const // returns the accession text by value
929  { return m_Accession.empty() ? x_SetAccession() : m_Accession; } // x_SetAccession() is called only while m_Accession is empty
930  const string* operator->(void) const // same selection as operator*, but yields a pointer
931  { return m_Accession.empty() ? &x_SetAccession() : &m_Accession; }
932  string operator[](int n) const // dereferences a temporary iterator offset by n
933  { return *(*this + n); }
934 
935  CRef<CSeq_id> GetID(void) const;
936 
938  { m_Accession.erase(); ++m_Number; return *this; }
940  { const_iterator orig = *this; ++*this; return orig; }
942  { m_Accession.erase(); --m_Number; return *this; }
944  { const_iterator orig = *this; --*this; return orig; }
946  { return const_iterator(*m_Range, m_Number + n); }
948  { return const_iterator(*m_Range, m_Number - n); }
950  { m_Accession.erase(); m_Number += n; return *this; }
952  { m_Accession.erase(); m_Number -= n; return *this; }
953 
954  bool operator==(const const_iterator& it) const // all comparisons below look at m_Number only; m_Range is not consulted
955  { return m_Number == it.m_Number; }
956  bool operator!=(const const_iterator& it) const
957  { return m_Number != it.m_Number; }
958  bool operator< (const const_iterator& it) const
959  { return m_Number < it.m_Number; }
960  bool operator> (const const_iterator& it) const
961  { return m_Number > it.m_Number; }
962  bool operator<=(const const_iterator& it) const
963  { return m_Number <= it.m_Number; }
964  bool operator>=(const const_iterator& it) const
965  { return m_Number >= it.m_Number; }
966  int operator-(const const_iterator& it) const // difference of the two m_Number values
967  { return m_Number - it.m_Number; }
968 
969  private:
970  const string& x_SetAccession(void) const;
971 
973  int m_Number;
974  mutable string m_Accession;
975  };
976 
977  const_iterator begin(void) const // single-argument constructor: positioned at range.start
978  { return const_iterator(*this); }
979  const_iterator end(void) const // positioned at stop + 1, i.e. one past the last number
980  { return const_iterator(*this, stop + 1); }
981 #endif
982 
983  size_t size(void) const // start and stop are both counted: stop - start + 1
984  { return stop - start + 1; }
985 
986  string prefix;
987  int start;
988  int stop;
989  int digits;
991 };
992 
993 
994 /////////////////////////////////////////////////////////////////////////////
995 ///
996 /// CSeqIdException --
997 ///
998 /// Define exceptions generated by CSeq_id.
999 
1001 {
1002 public:
1003  /// Error types that CSeq_id can generate.
1004  enum EErrCode {
1005  eUnknownType, ///< Unrecognized Seq-id type
1006  eFormat ///< Contents not parsable as expected
1007  };
1008 
1009  /// Translate from the error code value to its string representation.
1010  virtual const char* GetErrCodeString(void) const override;
1011 
1012  // Standard exception boilerplate code.
1014 };
1015 
1016 
1017 /// Dummy converter for container search functions: builds a CConstRef<CSeq_id> from the given element.
1018 template<class TId>
1020 {
1021  return CConstRef<CSeq_id>(id);
1022 }
1023 
1024 
1025 /// Search the container of CRef<CSeq_id> for the id of given type.
1026 /// Return the first non-null id whose Which() equals the requested choice, or a null CConstRef if none matches.
1027 template<class container>
1029  CSeq_id::E_Choice choice)
1030 {
1031  ITERATE (typename container, iter, ids) {
1032  if ( *iter && Get_ConstRef_Seq_id(*iter)->Which() == choice ) {
1033  return Get_ConstRef_Seq_id(*iter);
1034  }
1035  }
1036  return CConstRef<CSeq_id>(0);
1037 }
1038 
1039 /// Return the gi from the id list if one exists, ZERO_GI otherwise
1040 template<class container>
1041 TGi FindGi(const container& ids)
1042 {
1044  return id ? id->GetGi() : ZERO_GI;
1045 }
1046 
1047 
1048 /// Return the first non-null id in the list for which GetTextseq_Id() is non-null; return a null CConstRef otherwise
1049 template<class container>
1050 CConstRef<CSeq_id> FindTextseq_id(const container& ids)
1051 {
1052  ITERATE (typename container, iter, ids) {
1053  if ( *iter && Get_ConstRef_Seq_id(*iter)->GetTextseq_Id() ) {
1054  return Get_ConstRef_Seq_id(*iter);
1055  }
1056  }
1057  return CConstRef<CSeq_id>(0);
1058 }
1059 
1060 
1061 /////////////////// CSeq_id inline methods
1062 
1063 // Match - true exactly when Compare(sid2) returns e_YES; every other Compare result yields false
1064 inline
1065 bool CSeq_id::Match (const CSeq_id& sid2) const
1066 {
1067  return Compare(sid2) == e_YES;
1068 }
1069 
1070 
1071 /////////////////// end of CSeq_id inline methods
1072 
1073 /* @} */
1074 
1075 
1076 END_objects_SCOPE // namespace ncbi::objects::
1078 
1079 #endif // OBJECTS_SEQLOC_SEQ_ID_HPP
Data storage class.
@ eBoth
Both preliminary and traceback stages.
Definition: blast_def.h:332
Definition: Dbtag.hpp:53
CSeqIdException –.
Definition: Seq_id.hpp:1001
*** Sequence identifiers ******************************** *
Definition: Seq_id_.hpp:80
Base class for all serializable objects.
Definition: serialbase.hpp:150
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
Definition: set.hpp:45
string GetSeqIdString(const CSeq_id &id)
Definition: compartp.cpp:100
static uch flags
bool operator<(const CEquivRange &A, const CEquivRange &B)
std::ofstream out("events_result.xml")
main entry point for tests
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
Int8 TIntId
Definition: ncbimisc.hpp:999
#define ZERO_GI
Definition: ncbimisc.hpp:1088
EErrCode
Error types that an application can generate.
Definition: ncbiexpt.hpp:884
CVect2< NCBI_PROMOTE(int,U) > operator*(int v1, const CVect2< U > &v2)
Definition: globals.hpp:371
ESerialRecursionMode
How to assign and compare child sub-objects of serial objects.
Definition: serialdef.hpp:191
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual void WriteAsFasta(CNcbiOstream &out) const
@ eRecursive
Recursively.
Definition: serialdef.hpp:192
const_iterator operator--(int)
Definition: Seq_id.hpp:943
ELabelFlags
Definition: Seq_id.hpp:614
int TErrorFlags
Definition: Seq_id.hpp:113
TGi FindGi(const container &ids)
Return gi from id list if exists, return 0 otherwise.
Definition: Seq_id.hpp:1041
int WorstRankScore(void) const
Definition: Seq_id.hpp:755
int StrictFastaAAScore(void) const
Definition: Seq_id.hpp:764
static int StrictBestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:787
const_iterator(const SSeqIdRange &range)
Definition: Seq_id.hpp:913
CProxy DumpAsFasta(void) const
Definition: Seq_id.hpp:599
const_iterator & operator++(void)
Definition: Seq_id.hpp:937
CConstRef< CSeq_id > GetSeq_idByType(const container &ids, CSeq_id::E_Choice choice)
Search the container of CRef<CSeq_id> for the id of given type.
Definition: Seq_id.hpp:1028
static int StrictScore(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:785
int TLabelFlags
Definition: Seq_id.hpp:625
int TComposeOSLTFlags
Definition: Seq_id.hpp:823
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
const_iterator & operator+=(int n)
Definition: Seq_id.hpp:949
CConstRef< CSeq_id > Get_ConstRef_Seq_id(TId &id)
Dummy convertor for container search functions.
Definition: Seq_id.hpp:1019
int BestRankScore(void) const
Definition: Seq_id.hpp:754
const SSeqIdRange * m_Range
Definition: Seq_id.hpp:972
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:562
static int StrictBlastRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:793
E_SIC
Compare return values.
Definition: Seq_id.hpp:579
const_iterator(const SSeqIdRange &range, int number)
Definition: Seq_id.hpp:917
EFastaAsTypeAndContent
Tag for method variants that would otherwise be ambiguous.
Definition: Seq_id.hpp:116
int BaseWorstRankScore(void) const
Definition: Seq_id.hpp:748
int TextScore(void) const
Definition: Seq_id.hpp:753
bool operator<=(const const_iterator &it) const
Definition: Seq_id.hpp:962
EComposeOSLTFlags
Flags specifying special treatment for certain types of Seq-ids in ComposeOSLT().
Definition: Seq_id.hpp:819
const_iterator end(void) const
Definition: Seq_id.hpp:979
EParseFlags
Definition: Seq_id.hpp:75
int BlastScore(void) const
Definition: Seq_id.hpp:758
const string * operator->(void) const
Definition: Seq_id.hpp:930
int TFlags
binary OR of EFlags
Definition: Seq_id.hpp:899
const_iterator operator+(int n) const
Definition: Seq_id.hpp:945
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
int operator-(const const_iterator &it) const
Definition: Seq_id.hpp:966
static int WorstRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:776
const_iterator operator++(int)
Definition: Seq_id.hpp:939
static int StrictFastaNARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:791
const_iterator(const const_iterator &it)
Definition: Seq_id.hpp:921
int digits
Definition: Seq_id.hpp:989
const_iterator begin(void) const
Definition: Seq_id.hpp:977
const string * pointer
Definition: Seq_id.hpp:910
int StrictBestRankScore(void) const
Definition: Seq_id.hpp:762
static int StrictFastaAARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:789
EMaxScore
Numerical quality ranking; lower is better.
Definition: Seq_id.hpp:732
NCBI_EXCEPTION_DEFAULT(CSeqIdException, CException)
int StrictTextScore(void) const
Definition: Seq_id.hpp:760
int StrictBlastScore(void) const
Definition: Seq_id.hpp:768
bool operator==(const const_iterator &it) const
Definition: Seq_id.hpp:954
const_iterator operator-(int n) const
Definition: Seq_id.hpp:947
CSeq_id_Base Tparent
Definition: Seq_id.hpp:72
EStringFormat
Get a string representation of the sequence IDs of a given bioseq.
Definition: Seq_id.hpp:661
CSeq_id::EAccessionInfo acc_info
Definition: Seq_id.hpp:990
#define NCBI_ACC(type, div, mol)
Definition: Seq_id.hpp:317
static int Score(const CRef< CSeq_id > &id)
Wrappers for use with FindBestChoice from <corelib/ncbiutil.hpp>
Definition: Seq_id.hpp:772
const_iterator & operator-=(int n)
Definition: Seq_id.hpp:951
bool operator>=(const const_iterator &it) const
Definition: Seq_id.hpp:964
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:774
bool operator<(const CSeq_id &sid2) const
Definition: Seq_id.hpp:589
random_access_iterator_tag iterator_category
Definition: Seq_id.hpp:907
string operator[](int n) const
Definition: Seq_id.hpp:932
string prefix
Definition: Seq_id.hpp:986
int TParseFlags
Definition: Seq_id.hpp:104
int StrictAdjustScore(int base_score) const
Definition: Seq_id.hpp:744
static int BlastRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:782
CSeq_id(const CSeq_id &)
ELabelType
return the label for a given string
Definition: Seq_id.hpp:603
const_iterator & operator--(void)
Definition: Seq_id.hpp:941
DECLARE_SAFE_FLAGS_TYPE(EAdjustScoreFlags, TAdjustScoreFlags)
string GetLabel(const CSeq_id &id)
static int FastaNARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:780
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
int FastaAAScore(void) const
Definition: Seq_id.hpp:756
const_iterator & operator=(const const_iterator &it)
Definition: Seq_id.hpp:925
ETypeVariant
Definition: Seq_id.hpp:860
set< CSeq_id_Handle > TSeqIdHandles
Definition: Seq_id.hpp:801
int StrictFastaNAScore(void) const
Definition: Seq_id.hpp:766
bool operator!=(const const_iterator &it) const
Definition: Seq_id.hpp:956
CConstRef< CSeq_id > FindTextseq_id(const container &ids)
Return text seq-id from id list if exists, return 0 otherwise.
Definition: Seq_id.hpp:1050
EErrorFlags
Definition: Seq_id.hpp:107
EAdjustScoreFlags
Definition: Seq_id.hpp:736
E_SIC Compare(const CSeq_id &sid2) const
Compare() - more general.
Definition: Seq_id.cpp:411
ESNPScaleLimit
SNP annotation scale limits.
Definition: Seq_id.hpp:847
int FastaNAScore(void) const
Definition: Seq_id.hpp:757
size_t size(void) const
Definition: Seq_id.hpp:983
static int FastaAARank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:778
@ eUnknownType
Unrecognized Seq-id type.
Definition: Seq_id.hpp:1005
@ e_NO
SeqIds compared, but are different.
Definition: Seq_id.hpp:582
@ e_DIFF
different SeqId types-can't compare
Definition: Seq_id.hpp:581
@ e_YES
SeqIds compared, are equivalent.
Definition: Seq_id.hpp:583
@ eFormat_BestWithoutVersion
Definition: Seq_id.hpp:664
@ eFormat_FastA
Definition: Seq_id.hpp:662
@ eFormat_ForceGI
Definition: Seq_id.hpp:663
@ eFastaContent
Like eFasta, but without any tag.
Definition: Seq_id.hpp:608
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
@ eBoth
Type and content, delimited by a vertical bar.
Definition: Seq_id.hpp:606
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
Definition: Seq_id.hpp:607
@ eType
FASTA-style type, or database in GeneralDbIsContent mode.
Definition: Seq_id.hpp:604
@ eTV_tr
Definition: Seq_id.hpp:862
@ eTV_plain
Definition: Seq_id.hpp:861
@ eSNPScaleLimit_Supercontig
Definition: Seq_id.hpp:851
@ eSNPScaleLimit_Unit
Definition: Seq_id.hpp:849
@ eSNPScaleLimit_Default
Definition: Seq_id.hpp:848
@ eSNPScaleLimit_Contig
Definition: Seq_id.hpp:850
bool IsValid(const CSeq_point &pt, CScope *scope)
Checks that point >= 0 and point < length of Bioseq.
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NCBI_SEQLOC_EXPORT
Definition: ncbi_export.h:776
static const char label[]
CSeq_id_Base & operator=(const CSeq_id_Base &)
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
@ e_Gibbmt
Geninfo backbone moltype.
Definition: Seq_id_.hpp:97
@ e_Giim
Geninfo import id.
Definition: Seq_id_.hpp:98
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Gibbsq
Geninfo backbone seqid.
Definition: Seq_id_.hpp:96
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
yy_size_t n
static MDB_envinfo info
Definition: mdb_load.c:37
static int version
Definition: mdb_load.c:29
range(_Ty, _Ty) -> range< _Ty >
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
std::istream & in(std::istream &in_, double &x_)
static BOOL number
Definition: pcregrep.c:193
string GetStringDescr(const CBioseq &bioseq)
Definition: shortcuts.cpp:309
static const string kMaxScore
Definition: showdefline.cpp:85
void Dump(CSplitCacheApp *app, const C &obj, ESerialDataFormat format, const string &key, const string &suffix=kEmptyStr)
SSeqIdRange –.
Definition: Seq_id.hpp:895
Definition: type.c:6
#define _ASSERT
bool operator>(const typename tree< T, tree_node_allocator >::iterator_base &one, const typename tree< T, tree_node_allocator >::iterator_base &two)
Definition: tree_msvc7.hpp:426
static bool ambig(char c)
#define local
Definition: zutil.h:33
Modified on Wed Apr 17 13:09:59 2024 by modify_doxy.py rev. 669887