NCBI C++ ToolKit
flatfile_parse_info.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: flatfile_parse_info.hpp 102382 2024-04-28 12:37:20Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: flatfile_parse_info.hpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen, Alexey Dobronadezhdin
29  *
30  * File Description:
31  *
32  */
33 
34 #ifndef __FLATFILE_PARSE_INFO__
35 #define __FLATFILE_PARSE_INFO__
36 
37 #include <list>
38 #include <optional>
40 
42 
43 // some forward declarations
// Only the names are introduced here; the definitions of these types are not
// part of this header.
// NOTE(review): IndexblkPtr and ProtBlkPtr are used further down but are not
// declared in the visible lines; presumably they were declared on the listing
// lines skipped between 46 and 49 -- confirm against the original header.
44 struct Indexblk;
45 struct ProtBlk;
46 class CKeywordParser;
49 
// Alias for list<CRef<objects::CSeq_entry>>.
50 using TEntryList = list<CRef<objects::CSeq_entry>>;
51 
/* FileBuf: a read cursor over a character buffer.
 *
 * Keeps two raw pointers: `start`, the anchor of the buffer, and `current`,
 * the position presently being looked at. The struct never allocates or
 * frees anything, so the memory both pointers refer to must be kept alive by
 * whoever supplied it.
 *
 * Offsets are byte distances measured from `start`. No bounds information is
 * stored, so none of the members can check that an offset stays inside the
 * underlying buffer.
 */
struct FileBuf {
    const char* start   = nullptr; /* anchor of the buffer */
    const char* current = nullptr; /* present position within the buffer */

    /* Re-anchor the cursor at `p` and place the current position `offs`
     * bytes past it (at `p` itself when `offs` is omitted).
     */
    void set(const char* p, size_t offs = 0)
    {
        start   = p;
        current = p + offs;
    }

    /* Distance in bytes from `start` to `current`.
     * The pointer difference is signed; it is converted to size_t, so the
     * result is only meaningful while `current` is not before `start`.
     */
    size_t get_offs() const { return static_cast<size_t>(current - start); }

    /* Move the current position to `offs` bytes past `start`; `start` itself
     * is left untouched.
     */
    void set_offs(size_t offs) { current = start + offs; }
};
65 
/* Parser: the state and option block of the flat-file parser.
 *
 * Declares the EOutput / EMode / ESource / EFormat enumerations, the entry
 * index bookkeeping (indx, entrylist, curindx), the settings taken from the
 * command line, the publication lookup options (fpo), assorted behaviour
 * switches, and the callback pointers used for buffer based parsing.
 *
 * NOTE(review): this listing skips several of its own line numbers (109, 115,
 * 120, 152, 237-238, 241), so some declarations of the original header are not
 * visible here. The comments below describe only what is visible.
 */
66 struct Parser {
67 
  /* Kind of output produced; see output_format ("Bioseq-set or Seq-submit"). */
68  enum class EOutput {
69  BioseqSet,
70  Seqsubmit
71  };
72 
  /* Processing mode; see the `mode` member for what it currently affects. */
73  enum class EMode {
74  Release,
75  HTGS,
76  HTGSCON,
77  Relaxed
78  };
79 
  /* Source of the flat file; see the `source` member. */
80  enum class ESource {
81  unknown,
82  NCBI,
83  EMBL,
84  GenBank,
85  DDBJ,
86  LANL,
87  SPROT,
88  Refseq,
89  Flybase,
90  USPTO,
91  All
92  };
93 
  /* Format of the flat file; see the `format` member. */
94  enum class EFormat {
95  unknown,
96  EMBL,
97  GenBank,
98  SPROT,
99  DDBJ,
100  XML,
101  ALL
102  };
103 
104 
105  Int4 indx = 0; /* total number of records in the
106  flat file, exclude BadLocusName entries */
107  vector<IndexblkPtr> entrylist; /* the index block */
108  Int4 curindx = 0; /* current index of the entrylist */
  /* Returns indx converted to Uint4 (C-style cast) and then widened to size_t.
     NOTE(review): the count comes from indx, not from entrylist.size(); a
     negative indx would wrap to a large value -- confirm indx is never < 0. */
110  size_t GetNumEntries() const { return (Uint4)indx; }
111 
112  /* all the files will be produced in the directory where the program was
113  * executed except the input file which is located in the argument path
114  */
116 
117  string release_str;
118  string authors_str;
119 
121 
122  /* next 4 + 3 variables record data from command arguments
123  */
124  Int4 limit = 0; /* limit to sequence length.
125  As of June, 2004 sequence length
126  limitation removed. This variable
127  will be always 0 */
128  EFormat format = EFormat::unknown; /* flat file format */
129  ESource source = ESource::unknown; /* source of flat file */
130  bool all = false; /* any source of flat file */
131  Uint1 seqtype = 0; /* sequence type based on source
132  of flat file */
133  Int4 num_drop = 0; /* number of entries with foreign
134  acc# (dropped) */
135  const char* acprefix = nullptr; /* decide the drop value, s.t.
136  checking the prefix character of
137  the accession number, an option
138  user provided from the command
139  line argument */
140  Uint1 entrez_fetch = 0; /* PUBSEQBioseqFetchEnable()
141  0 - do not need this connection;
142  1 - need it and got it;
143  2 - need it and failed, will
144  reconnect */
145  Uint1 taxserver = 0; /* if != 0, call TaxArchInit() */
146  ProtBlkPtr pbp = nullptr; /* for processing nucleic acid
147  protein sequence */
148  Uint1 medserver = 0; /* == 1, if MedArchInit() call
149  succeeded */
150  bool normalize = false;
151 
  /* NOTE(review): listing line 152 is missing here. The fields below and the
     "};" that follows them belong to a struct whose header is not visible --
     presumably SFindPubOptions, the type of `fpo` declared right after it.
     Confirm against the original header. */
153  bool always_look = true; /* if TRUE, look up even if muid in
154  Pub-equiv */
155  bool replace_cit = true; /* if TRUE, replace Cit-art w/ replace
156  from MEDLINE */
  /* NOTE(review): unlike their neighbours, the four counters below have no
     default member initializer, so nothing visible here gives them a value;
     confirm they are set before being read, or add "= 0". */
157  int lookups_attempted; /* citartmatch tries */
158  int lookups_succeeded; /* citartmatch worked */
159  int fetches_attempted; /* FetchPubs tried */
160  int fetches_succeeded; /* FetchPubs that worked */
161  bool merge_ids = true; /* If TRUE then merges Cit-art.ids from
162  input Cit-sub and one gotten from
163  med server. */
164  };
165 
166 
167  SFindPubOptions fpo; /* for medline uid lookup */
168  bool date = false; /* if TRUE, replace update date
169  from LOCUS */
170  bool no_date = false; /* if TRUE, if no update and curr
171  date come out */
172  bool citat = false; /* if TRUE, removes serial-numbers */
173  bool transl = false; /* if TRUE program replaces translation */
174  bool sort = false; /* if TRUE, program doesn't sort entries */
175  bool debug = false; /* output everything */
176  bool segment = false; /* treat the input file as segment in embl format */
177  bool no_code = false; /* no genetic code from server try to guess */
178  bool seg_acc = false; /* use accession for segmented set Id */
179  bool convert = false; /* convert to new asn.1 spec (ver. 4.0) */
180  const char** accpref = nullptr; /* a list of allowable 2-letter
181  prefixes in new format of accession
182  numbers 2 letters + 6 digits */
183  bool accver = false; /* ACCESSION.VERSION */
184  bool histacc = false; /* Populate Seq-inst.hist.replaces with secondaries */
185  bool ign_toks = false; /* Ignore multiple tokens in DDBJ's VERSION line. Default = FALSE */
186  bool ign_prot_src = false; /* If set to TRUE, then does not reject record if protein accession
187  prefix does not fit sequence owner */
188  bool ign_bad_qs = false; /* If TRUE, then does not reject the record with bad quality score */
189  EMode mode = EMode::Release; /* Known so far: RELEASE and HTGS. For now only difference between
190  severity of error messages. */
191  bool diff_lt = false; /* If TRUE, then will allow to have same genes with different
192  locus_tags. Default is FALSE. */
193  Int4 errstat = 0; /* Just a temporary storage */
194  bool allow_uwsec = false; /* Allows unusual secondary WGS accessions with prefixes not
195  matching the primary one */
196  //struct FTAOperon* operon=nullptr;
197  bool xml_comp = false; /* INSDSeq/GenBank/EMBL compatible */
198  bool sp_dt_seq_ver = true; /* For SwissProt "Reviewed" records
199  only: puts the sequence version
200  number from "sequence version" DT
201  line into Seq-id.version slot */
202  bool simple_genes = false; /* If set to TRUE, then will always
203  merge join locations to the single
204  ones while generating genes */
205  Int4 cleanup = 0; /* pick the required cleanup function:
206  0 - legacy parser version of SSEC;
207  1 - SSEC;
208  2 - none.
209  Default is 0. */
210  bool allow_crossdb_featloc = false;
211  bool genenull = false;
212  const char* qsfile = nullptr; /* Do not free, just a pointer */
213 
214 
215  FILE* qsfd = nullptr;
216  bool qamode = false;
217  optional<string> buf; /* Temporary storage for locations checks */
218  EOutput output_format = EOutput::BioseqSet; /* Bioseq-set or Seq-submit */
219 
220  // buffer based parsing
221  bool ffdb = false; /* Use FlatFile database */
222  bool farseq = false;
223  void* user_data = nullptr;
  /* Callback pointers, all null by default. Each takes an accession string;
     the *_v variants also take a version number and the *_pp variants also
     take the Parser. Each returns char*.
     NOTE(review): who owns the returned buffer is not visible here -- confirm
     at the call sites. */
224  char* (*ff_get_entry)(const char* accession) = nullptr;
225  char* (*ff_get_entry_v)(const char* accession, Int2 vernum) = nullptr;
226  char* (*ff_get_qscore)(const char* accession, Int2 v) = nullptr;
227  char* (*ff_get_qscore_pp)(const char* accession, Int2 v, Parser* pp) = nullptr;
228  char* (*ff_get_entry_pp)(const char* accession, Parser* pp) = nullptr;
229  char* (*ff_get_entry_v_pp)(const char* accession, Int2 vernum, Parser* pp) = nullptr;
230 
  /* Constructor and virtual destructor are declared only; their definitions
     are not in this header. */
231  Parser();
232  virtual ~Parser();
233 
234  // not a good place but until CFlatFileParser develops, pretty much the only
235  // possible place.
236  // and unique_ptr didn't work here because of template initialization issues.
  // NOTE(review): the declarations this remark refers to sat on listing lines
  // 237-238 and the private member on line 241; none of them is visible here.
239 
240 private:
242 };
243 
// Raw-pointer alias for Parser.
244 using ParserPtr = Parser*;
245 
246 /**************************************************************************/
// Declaration only: takes the Parser by non-const reference and returns
// nothing. The body is not part of this header.
// NOTE(review): the name suggests it initialises `pp` -- confirm against the
// definition before relying on that.
247 void fta_init_pp(Parser& pp);
248 
250 
251 #endif
list< CRef< objects::CSeq_entry > > TEntryList
void fta_init_pp(Parser &pp)
Definition: ftamain.cpp:941
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
const char * current
const char * start
void set(const char *p, size_t offs=0)
void set_offs(size_t offs)
size_t get_offs() const
CKeywordParser * mpKeywordParser
const char * acprefix
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
virtual ~Parser()
optional< string > buf
const char ** accpref
CKeywordParser & KeywordParser()
void InitializeKeywordParser(EFormat)
const char * qsfile
SFindPubOptions fpo
Indexblk * CurEntry()
ProtBlkPtr pbp
EOutput output_format
TEntryList entries
size_t GetNumEntries() const
Modified on Fri Sep 20 14:57:49 2024 by modify_doxy.py rev. 669887