NCBI C++ ToolKit
indx_blk.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: indx_blk.cpp 103112 2024-09-10 13:39:01Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: indx_blk.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Common for all format functions.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 
39 #include "index.h"
41 
42 #include "ftaerr.hpp"
43 #include "indx_blk.h"
44 #include "indx_def.h"
45 #include "utilfun.h"
46 #include <map>
47 
48 #ifdef THIS_FILE
49 # undef THIS_FILE
50 #endif
51 #define THIS_FILE "indx_blk.cpp"
52 
53 
56 
57 // clang-format off
// Strand keywords.  Slot 0 is a blank placeholder entry; a nullptr sentinel
// marks the end of the list.
static const char* XML_STRAND_array[] = {
    " ",
    "single",
    "double",
    "mixed",
    nullptr
};
61 
// Topology keywords.  Slot 0 is a blank placeholder entry; a nullptr sentinel
// marks the end of the list.
static const char* XML_TPG_array[] = {
    " ",
    "Linear",
    "Circular",
    "Tandem",
    nullptr
};
65 
// nullptr-terminated keyword list; presumably the extra nucleic acid molecule
// type accepted for DDBJ records (inferred from the name - confirm at the
// point of use).
static const char* ParFlat_NA_array_DDBJ[] = {
    "cDNA",
    nullptr
};
69 
// nullptr-terminated keyword list; presumably the protein molecule keyword
// accepted for DDBJ records (inferred from the name - confirm at the point
// of use).
static const char* ParFlat_AA_array_DDBJ[] = {
    "PRT",
    nullptr
};
73 
// Molecule type keywords matched (case-insensitively) against the molecule
// field of a LOCUS line.  Slot 0 is a blank placeholder entry; a nullptr
// sentinel marks the end of the list.
static const char* ParFlat_NA_array[] = {
    " ", "NA", "DNA", "genomic DNA", "other DNA", "unassigned DNA", "RNA",
    "mRNA", "rRNA", "tRNA", "uRNA", "scRNA", "snRNA", "snoRNA", "pre-RNA",
    "pre-mRNA", "genomic RNA", "other RNA", "unassigned RNA", "cRNA",
    "viral cRNA",
    nullptr
};
80 
// Three-letter division codes.  Slot 0 is a blank placeholder entry; a
// nullptr sentinel marks the end of the list.
static const char* ParFlat_DIV_array[] = {
    " ",   "PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", "RNA",
    "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", "ORG", "GSS",
    "HUM", "HTG", "CON", "HTC", "ENV", "TSA",
    nullptr
};
86 
// Two-letter accession prefixes, nullptr-terminated (presumably the prefixes
// assigned to EMBL - inferred from the name).
static const char* embl_accpref[] = {
    "AJ", "AL", "AM", "AN", "AX", "BN", "BX", "CQ", "CR", "CS", "CT", "CU",
    "FB", "FM", "FN", "FO", "FP", "FQ", "FR", "GM", "GN", "HA", "HB", "HC",
    "HD", "HE", "HF", "HG", "HH", "HI", "JA", "JB", "JC", "JD", "JE", "LK",
    "LL", "LM", "LN", "LO", "LP", "LQ", "LR", "LS", "LT", "MP", "MQ", "MR",
    "MS", "OA", "OB", "OC", "OD", "OE", "OU", "OV", "OW", "OX", "OY", "OZ",
    nullptr
};
95 
// Two-letter accession prefixes, nullptr-terminated (presumably those
// assigned to LANL - inferred from the name).
static const char* lanl_accpref[] = {
    "AD",
    nullptr
};
99 
// Two-letter accession prefixes, nullptr-terminated (presumably those
// accepted for Swiss-Prot input - inferred from the name).
static const char* sprot_accpref[] = {
    "DD",
    nullptr
};
103 
// Two-letter accession prefixes, nullptr-terminated (presumably the prefixes
// assigned to DDBJ - inferred from the name).
static const char* ddbj_accpref[] = {
    "AB", "AG", "AK", "AP", "AT", "AU", "AV", "BA", "BB", "BD", "BJ", "BP",
    "BR", "BS", "BW", "BY", "CI", "CJ", "DA", "DB", "DC", "DD", "DE", "DF",
    "DG", "DH", "DI", "DJ", "DK", "DL", "DM", "FS", "FT", "FU", "FV", "FW",
    "FX", "FY", "FZ", "GA", "GB", "HT", "HU", "HV", "HW", "HX", "HY", "HZ",
    "LA", "LB", "LC", "LD", "LE", "LF", "LG", "LH", "LI", "LJ", "LU", "LV",
    "LX", "LY", "LZ", "MA", "MB", "MC", "MD", "ME", "OF", "OG", "OH", "OI",
    "OJ", "PA", "PE", "PF", "PG", "PH", "PI", "PJ", "PK",
    nullptr
};
113 
// Two-letter accession prefixes, nullptr-terminated (presumably the prefixes
// assigned to NCBI - inferred from the name).
static const char* ncbi_accpref[] = {
    "AA", "AC", "AD", "AE", "AF", "AH", "AI", "AQ", "AR", "AS", "AW", "AY",
    "AZ", "BC", "BE", "BF", "BG", "BH", "BI", "BK", "BL", "BM", "BQ", "BT",
    "BU", "BV", "BZ", "CA", "CB", "CC", "CD", "CE", "CF", "CG", "CH", "CK",
    "CL", "CM", "CN", "CO", "CP", "CV", "CW", "CX", "CY", "CZ", "DN", "DP",
    "DQ", "DR", "DS", "DT", "DU", "DV", "DW", "DX", "DY", "DZ", "EA", "EB",
    "EC", "ED", "EE", "EF", "EG", "EH", "EI", "EJ", "EK", "EL", "EM", "EN",
    "EP", "EQ", "ER", "ES", "ET", "EU", "EV", "EW", "EX", "EY", "EZ", "FA",
    "FC", "FD", "FE", "FF", "FG", "FH", "FI", "FJ", "FK", "FL", "GC", "GD",
    "GE", "GF", "GG", "GH", "GJ", "GK", "GL", "GO", "GP", "GQ", "GR", "GS",
    "GT", "GU", "GV", "GW", "GX", "GY", "GZ", "HJ", "HK", "HL", "HM", "HN",
    "HO", "HP", "HQ", "HR", "HS", "JF", "JG", "JH", "JI", "JJ", "JK", "JL",
    "JM", "JN", "JO", "JP", "JQ", "JR", "JS", "JT", "JU", "JV", "JW", "JX",
    "JY", "JZ", "KA", "KB", "KC", "KD", "KE", "KF", "KG", "KH", "KI", "KJ",
    "KK", "KL", "KM", "KN", "KO", "KP", "KQ", "KR", "KS", "KT", "KU", "KV",
    "KX", "KY", "KZ", "MF", "MG", "MH", "MI", "MJ", "MK", "ML", "MM", "MN",
    "MO", "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "OK", "OL", "OM", "ON",
    "OO", "OP", "OQ", "OR", "OS", "OT", "PP", "PQ", "PR", "PS", "PT", "PU",
    nullptr
};
134 
// Accession prefixes of the form "XX_", nullptr-terminated (presumably the
// RefSeq prefixes - inferred from the name).
static const char* refseq_accpref[] = {
    "NC_", "NG_", "NM_", "NP_", "NR_", "NT_", "NW_", "XM_", "XP_", "XR_",
    "NZ_",
    nullptr
};
139 
140 /*
141 static const char* refseq_prot_accpref[] = {
142  "AP_", "NP_", "WP_", "XP_", "YP_", "ZP_", nullptr
143 };
144 */
145 
// Two-letter accession prefixes, nullptr-terminated (presumably the prefixes
// permitted on TSA records - inferred from the name).
static const char* acc_tsa_allowed[] = {
    "AF", "AY", "DQ", "EF", "EU", "FJ", "GQ", "HQ", "JF", "JN", "JQ", "JX",
    "KC", "KF", "KJ", "KM", "KP", "KR", "KT", "KU", "KX", "KY", "MF", "MG",
    "MH", "MK", "MN", "MT",
    nullptr
};
151 
// Two-letter accession prefixes, nullptr-terminated (presumably NCBI TPA
// prefixes - inferred from the name).
static const char* ncbi_tpa_accpref[] = {
    "BK",
    "BL",
    "GJ",
    "GK",
    nullptr
};
155 
// Two-letter accession prefixes, nullptr-terminated (presumably DDBJ TPA
// prefixes - inferred from the name).
static const char* ddbj_tpa_accpref[] = {
    "BR",
    "HT",
    "HU",
    nullptr
};
159 
// Two-letter accession prefixes, nullptr-terminated (presumably NCBI WGS
// related prefixes - inferred from the name).
static const char* ncbi_wgs_accpref[] = {
    "GJ",
    "GK",
    nullptr
};
163 
// Two-letter accession prefixes, nullptr-terminated (presumably DDBJ WGS
// related prefixes - inferred from the name).
static const char* ddbj_wgs_accpref[] = {
    "HT",
    "HU",
    nullptr
};
167 
169  "CH", "CT", "CU", "DF", "DG", "DS",
170  "EM", "EN", "EP", "EQ", "FA", "FM",
171  "GG", "GJ", "GK", "GL", "HT", "HU",
172  "JH", "KB", "KD", "KE", "KI", "KK",
173  "KL", "KN", "KQ", "KV", "KZ", "LD",
174  "ML", "MU", "PS"
175 };
176 
178  { Parser::ESource::unknown, "unknown" },
179  { Parser::ESource::EMBL, "EMBL" },
180  { Parser::ESource::GenBank, "GENBANK" },
181  { Parser::ESource::SPROT, "Swiss-Prot" },
182  { Parser::ESource::NCBI, "NCBI" },
183  { Parser::ESource::LANL, "GSDB" },
184  { Parser::ESource::Flybase, "FlyBase" },
185  { Parser::ESource::Refseq, "RefSeq" }
186 };
187 
// Upper-case month abbreviations used when validating dd-mmm-yyyy dates.
// Slot 0 holds the filler "Ill", so "JAN".."DEC" sit at indexes 1..12;
// a nullptr sentinel marks the end of the list.
static const char* month_name[] = {
    "Ill",
    "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    nullptr
};
191 
// Length-unit tokens ("bp"/"AA", optionally followed by '.' or ',') that
// GetResidue() looks for in a tokenized line.  nullptr-terminated.
static const char* ParFlat_RESIDUE_STR[] = {
    "bp", "bp.", "bp,",
    "AA", "AA.", "AA,",
    nullptr
};
195 
// Molecule type values accepted by fta_check_embl_moltype() in the ID line of
// an EMBL record.  Comparison there is exact (case-sensitive).
// nullptr-terminated.
static const char* ValidMolTypes[] = {
    "genomic DNA", "genomic RNA",
    "mRNA", "tRNA", "rRNA",
    "snoRNA", "snRNA", "scRNA",
    "pre-RNA", "pre-mRNA",
    "other RNA", "other DNA",
    "transcribed RNA",
    "unassigned RNA", "unassigned DNA",
    "viral cRNA",
    nullptr
};
215 // clang-format on
216 
217 // functions below are implemented in different source files
218 bool EmblIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len));
219 bool GenBankIndex(ParserPtr pp);
220 bool SprotIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len));
221 bool XMLIndex(ParserPtr pp);
222 
223 /**********************************************************
224  *
225  * static char* GetResidue(stoken):
226  *
227  * Return a string pointer in the "stoken" which its
228  * next token string match any one string in the
229  * ParFlat_RESIDUE_STR but ignore case for all alphabetic
230  * characters; return NULL if not found.
231  *
232  * 3-25-93
233  *
234  **********************************************************/
235 static const char* GetResidue(TokenStatBlkPtr stoken)
236 {
237  const char** b;
238  Int2 i;
239 
240  auto ptr = stoken->list.begin();
241  auto sptr = next(ptr);
242  for (i = 1; i < stoken->num; i++, ptr = sptr, sptr = next(ptr)) {
243  for (b = ParFlat_RESIDUE_STR; *b; b++)
244  if (NStr::CompareNocase(*b, *sptr) == 0)
245  return ptr->c_str();
246  }
247 
248  return nullptr;
249 }
250 
251 /**********************************************************
252  *
253  * bool XReadFile(fp, finfo):
254  *
255  * Record position and line # of the file, loop stop
256  * when got a none blank line.
257  * Return TRUE if END_OF_FILE.
258  *
259  * 2-26-93
260  *
261  **********************************************************/
262 static bool XReadFile(FILE* fp, FinfoBlk& finfo)
263 {
264  bool end_of_file = false;
265 
266  StringCpy(finfo.str, "\n");
267  while (! end_of_file && StringEquN(finfo.str, "\n", 1)) {
268  finfo.pos = (size_t)ftell(fp);
269  if (! fgets(finfo.str, sizeof(finfo.str) - 1, fp))
270  end_of_file = true;
271  else
272  ++finfo.line;
273  }
274 
275  auto n = strlen(finfo.str);
276  while (n) {
277  n--;
278  if (finfo.str[n] != '\n' && finfo.str[n] != '\r') {
279  break;
280  }
281  finfo.str[n] = 0;
282  }
283 
284  return (end_of_file);
285 }
286 
287 /**********************************************************/
288 static Int2 FileGetsBuf(char* res, Int4 size, FileBuf& fbuf)
289 {
290  const char* p = nullptr;
291  char* q;
292  Int4 l;
293  Int4 i;
294 
295  if (fbuf.current == nullptr || *fbuf.current == '\0')
296  return (0);
297 
298  l = size - 1;
299  for (p = fbuf.current, q = res, i = 0; i < l; i++, p++) {
300  *q++ = *p;
301  if (*p == '\n' || *p == '\r') {
302  p++;
303  break;
304  }
305  }
306 
307  *q = '\0';
308  fbuf.current = p;
309  return (1);
310 }
311 
312 /**********************************************************/
313 bool XReadFileBuf(FileBuf& fbuf, FinfoBlk& finfo)
314 {
315  bool end_of_file = false;
316 
317  StringCpy(finfo.str, "\n");
318  while (! end_of_file && StringEquN(finfo.str, "\n", 1)) {
319  finfo.pos = fbuf.get_offs();
320  if (FileGetsBuf(finfo.str, sizeof(finfo.str) - 1, fbuf) == 0)
321  end_of_file = true;
322  else
323  ++finfo.line;
324  }
325 
326  return (end_of_file);
327 }
328 
329 /**********************************************************
330  *
331  * bool SkipTitle(fp, finfo, str, len):
332  *
333  * Return TRUE if file contains no entry in which no
334  * match in keyword "str".
335  * Skip any title declaration lines.
336  *
337  * 3-5-93
338  *
339  **********************************************************/
341 bool SkipTitle(FILE* fp, FinfoBlk& finfo, const char* str, size_t len)
342 {
343  bool end_of_file = XReadFile(fp, finfo);
344  while (! end_of_file && ! StringEquN(finfo.str, str, len))
345  end_of_file = XReadFile(fp, finfo);
346 
347  return (end_of_file);
348 }
349 
351 bool SkipTitle(FILE* fp, FinfoBlk& finfo, const CTempString& keyword)
352 {
353  return SkipTitle(fp, finfo, keyword.data(), keyword.size());
354 }
355 
356 // ----------------------------------------------------------------------------
357 bool SkipTitleBuf(FileBuf& fbuf, FinfoBlk& finfo, const CTempString& keyword)
358 // ----------------------------------------------------------------------------
359 {
360  const char* p = keyword.data();
361  size_t len = keyword.size();
362  bool end_of_file = XReadFileBuf(fbuf, finfo);
363  while (! end_of_file && ! StringEquN(finfo.str, p, len))
364  end_of_file = XReadFileBuf(fbuf, finfo);
365 
366  return end_of_file;
367 }
368 
369 
370 /**********************************************************
371  *
372  * static bool CheckLocus(locus):
373  *
374  * Locus name only allow A-Z, 0-9, characters,
375  * reject if not.
376  *
377  **********************************************************/
378 static bool CheckLocus(const char* locus, Parser::ESource source)
379 {
380  const char* p = locus;
381  if (StringEquN(locus, "SEG_", 4) &&
383  p += 4;
384  for (; *p != '\0'; p++) {
385  if ((*p >= '0' && *p <= '9') || (*p >= 'A' && *p <= 'Z') ||
386  (*p == '.' && source == Parser::ESource::Flybase))
387  continue;
388  if (((*p >= 'a' && *p <= 'z') || *p == '_' || *p == '-' || *p == '(' ||
389  *p == ')' || *p == '/') &&
391  continue;
392 
393  ErrPostEx(SEV_ERROR, ERR_LOCUS_BadLocusName, "Bad locusname, <%s> for this entry", locus);
394  break;
395  }
396 
397  return (*p != '\0');
398 }
399 
400 /**********************************************************
401  *
402  * static bool CheckLocusSP(locus):
403  *
404  * Locus name consists of up tp 10 uppercase
405  * alphanumeric characters.
406  * Rule: X_Y format (SWISS-PROT), reject if not
407  * - X is a mnemonic code, up to 4 alphanumeric
408  * characters to represent the protein name.
409  * - Y is a mnemonic species identification code of
410  * at most 5 alphanumeric characters to representing
411  * the biological source of the protein.
412  * Checking the defined species identification code
413  * has not been implemented.
414  *
415  * Example: RL1_ECOLI FER_HALHA
416  *
417  **********************************************************/
418 static bool CheckLocusSP(const char* locus)
419 {
420  const char* p;
421 
422  bool underscore = false;
423  Int2 x;
424  Int2 y;
425 
426  for (p = locus, x = y = 0; *p != '\0'; p++) {
427  if ((*p >= '0' && *p <= '9') || (*p >= 'A' && *p <= 'Z')) {
428  if (! underscore)
429  x++;
430  else
431  y++;
432  } else if (*p == '_')
433  underscore = true;
434  else
435  break;
436  }
437 
438  if (*p != '\0' || x == 0 || y == 0) {
439  ErrPostEx(SEV_ERROR, ERR_LOCUS_BadLocusName, "Bad locusname, <%s> for this entry", locus);
440  return true;
441  }
442 
443  return false;
444 }
445 
446 /**********************************************************
447  *
448  * static bool CkDateFormat(date):
449  *
450  * Return FALSE if date != dd-mmm-yyyy format.
451  *
452  **********************************************************/
453 static bool CkDateFormat(const char* date)
454 {
455  if (date[2] == '-' && date[6] == '-' &&
456  isdigit(date[0]) != 0 && isdigit(date[1]) != 0 &&
457  isdigit(date[7]) != 0 && isdigit(date[8]) != 0 &&
458  isdigit(date[9]) != 0 && isdigit(date[10]) != 0 &&
459  MatchArraySubString(month_name, date) >= 0)
460  return true;
461 
462  return false;
463 }
464 
465 /**********************************************************/
466 int CheckSTRAND(const string& str)
467 {
468  static const vector<string> strandSpecs = {
469  " ", "ss-", "ds-", "ms-"
470  };
471  static const auto strandSpecCount = strandSpecs.size();
472 
473  string compare(str);
474  NStr::ToLower(compare);
475  for (int i = 0; i < strandSpecCount; ++i) {
476  if (NStr::StartsWith(compare, strandSpecs[i])) {
477  return i;
478  }
479  }
480  return -1;
481 }
482 
483 /**********************************************************/
484 Int2 XMLCheckSTRAND(string_view str)
485 {
487 }
488 
489 /**********************************************************/
490 Int2 XMLCheckTPG(string_view str)
491 {
492  Int2 i;
493 
495  if (i == 0)
496  i = 1;
497  return (i);
498 }
499 
500 /**********************************************************/
501 int CheckTPG(const string& str)
502 {
503  static const vector<string> topologies = {
504  " ", "linear ", "circular ", "tandem "
505  };
506  static const auto topologyCount = topologies.size();
507 
508  string compare(str);
509  NStr::ToLower(compare);
510  for (int i = 0; i < topologyCount; ++i) {
511  if (NStr::StartsWith(compare, topologies[i])) {
512  return i;
513  }
514  }
515  return -1;
516 }
517 
518 /**********************************************************/
519 Int2 CheckNADDBJ(const char* str)
520 {
522 }
523 
524 /**********************************************************/
525 Int2 CheckNA(const char* str)
526 {
528 }
529 
530 /**********************************************************/
531 Int2 CheckDIV(const char* str)
532 {
534 }
535 
536 /**********************************************************/
538 {
539  Char date[12];
540  bool ret = true;
541  char* p;
542  Int4 i;
543 
544  p = StringChr(offset, '\n');
545  if (p)
546  *p = '\0';
547 
548  if (is_mga == false && ! StringEquN(offset + lcp->bp, "bp", 2) &&
549  ! StringEquN(offset + lcp->bp, "rc", 2) &&
550  ! StringEquN(offset + lcp->bp, "aa", 2)) {
551  i = lcp->bp + 1;
552  ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition, "bp/rc string unrecognized in column %d-%d: %s", i, i + 1, offset + lcp->bp);
553  ret = false;
554  }
555  if (CheckSTRAND(offset + lcp->strand) == -1) {
556  i = lcp->strand + 1;
557  ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition, "Strand unrecognized in column %d-%d : %s", i, i + 2, offset + lcp->strand);
558  }
559 
560  p = offset + lcp->molecule;
561  if (is_mga) {
562  if (! StringEquNI(p, "mRNA", 4) && ! StringEquN(p, "RNA", 3)) {
563  ErrPostEx(SEV_REJECT, ERR_FORMAT_IllegalCAGEMoltype, "Illegal molecule type provided in CAGE record in LOCUS line: \"%s\". Must be \"mRNA\"or \"RNA\". Entry dropped.", p);
564  ret = false;
565  }
566  } else if (StringMatchIcase(ParFlat_NA_array, p) < 0) {
568  i = lcp->molecule + 1;
569  if (source != Parser::ESource::DDBJ ||
571  ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition, "Molecule unrecognized in column %d-%d: %s", i, i + 5, p);
572  ret = false;
573  }
574  }
575  }
576 
577  if (CheckTPG(offset + lcp->topology) == -1) {
578  i = lcp->topology + 1;
579  ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition, "Topology unrecognized in column %d-%d: %s", i, i + 7, offset + lcp->topology);
580  ret = false;
581  }
582  if (CheckDIV(offset + lcp->div) == -1) {
583  i = lcp->div + 1;
584  ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition, "Division code unrecognized in column %d-%d: %s", i, i + 2, offset + lcp->div);
585  ret = (source == Parser::ESource::LANL);
586  }
587  MemCpy(date, offset + lcp->date, 11);
588  date[11] = '\0';
589  if (StringEquN(date, "NODATE", 6)) {
590  ErrPostStr(SEV_WARNING, ERR_FORMAT_LocusLinePosition, "NODATE in LOCUS line will be replaced by current system date");
591  } else if (! CkDateFormat(date)) {
592  i = lcp->date + 1;
593  ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition, "Date should be in column %d-%d, and format dd-mmm-yyyy: %s", i, i + 10, date);
594  ret = false;
595  }
596 
597  if (p)
598  *p = '\n';
599  return (ret);
600 }
601 
602 /**********************************************************
603  *
604  * CRef<CDate_std> GetUpdateDate(ptr, source):
605  *
606  * Return NULL if ptr does not have dd-mmm-yyyy format
607  * or "NODATE"; otherwise, return Date-std pointer.
608  *
609  **********************************************************/
611 {
612  Char date[12];
613 
614  if (StringEquN(ptr, "NODATE", 6))
616 
617  if (ptr[11] != '\0' && ptr[11] != '\n' && ptr[11] != ' ' &&
618  (source != Parser::ESource::SPROT || ptr[11] != ','))
619  return CRef<CDate_std>();
620 
621  MemCpy(date, ptr, 11);
622  date[11] = '\0';
623 
624  if (! CkDateFormat(date))
625  return CRef<CDate_std>();
626 
627  return get_full_date(ptr, false, source);
628 }
629 
630 
631 /**********************************************************/
632 static bool fta_check_embl_moltype(char* str)
633 {
634  const char** b;
635  char* p;
636  char* q;
637 
638  p = StringChr(str, ';');
639  p = StringChr(p + 1, ';');
640  p = StringChr(p + 1, ';');
641 
642  for (p++; *p == ' ';)
643  p++;
644 
645  q = StringChr(p, ';');
646  *q = '\0';
647 
648  for (b = ValidMolTypes; *b; b++)
649  if (StringEqu(p, *b))
650  break;
651 
652  if (*b) {
653  *q = ';';
654  return true;
655  }
656 
657  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidIDlineMolType, "Invalid moltype value \"%s\" provided in ID line of EMBL record.", p);
658  *q = ';';
659  return false;
660 }
661 
662 /*********************************************************
663 Indexblk constructor
664 **********************************************************/
666 {
667  acnum[0] = 0;
668  locusname[0] = 0;
669  division[0] = 0;
670  blocusname[0] = 0;
671  wgssec[0] = 0;
672 }
673 
// Predicate wrapper around isspace() for use with standard algorithms.
// The argument is converted to unsigned char first: isspace() is only
// defined for values representable as unsigned char (or EOF), and plain
// char may be negative for bytes >= 0x80.
static bool isSpace(char c)
{
    return isspace(static_cast<unsigned char>(c)) != 0;
}
678 
680 sFindNextSpace(const CTempString& tempString,
681  CTempString::const_iterator current_it)
682 {
683  return find_if(current_it, tempString.end(), isSpace);
684 }
685 
686 
688 sFindNextNonSpace(const CTempString& tempString,
689  CTempString::const_iterator current_it)
690 {
691  return find_if_not(current_it, tempString.end(), isSpace);
692 }
693 
694 
695 static void sSetLocusLineOffsets(const CTempString& locusLine, LocusCont& offsets)
696 {
697  offsets.bases = -1;
698  offsets.bp = -1;
699  offsets.strand = -1;
700  offsets.molecule = -1;
701  offsets.topology = -1;
702  offsets.div = -1;
703  offsets.date = -1;
704 
705  if (locusLine.substr(0, 5) != "LOCUS") {
706  // throw an exception - invalid locus line
707  }
708 
709 
710  auto it = sFindNextNonSpace(locusLine, locusLine.begin() + 5);
711  if (it == locusLine.end()) {
712  // throw an exception - no locus name
713  }
714 
715  it = sFindNextSpace(locusLine, it);
716  if (it == locusLine.end()) {
717  return;
718  }
719 
720  // find the number of bases
721  it = sFindNextNonSpace(locusLine, it);
722  if (it == locusLine.end()) {
723  return;
724  }
725  auto space_it = sFindNextSpace(locusLine, it);
726  if (NStr::StringToNonNegativeInt(locusLine.substr(it - begin(locusLine), space_it - it)) == -1) {
727  return;
728  }
729 
730  offsets.bases = Int4(it - begin(locusLine));
731  it = sFindNextNonSpace(locusLine, space_it);
732  offsets.bp = Int4(it - begin(locusLine));
733 
734  it = sFindNextSpace(locusLine, it);
735  it = sFindNextNonSpace(locusLine, it);
736 
737  // the next one might be a strand
738  // or might be a molecule
739  space_it = sFindNextSpace(locusLine, it);
740  offsets.strand = -1;
741  if ((space_it - it) == 3) {
742  auto currentSubstr = locusLine.substr(it - begin(locusLine), 3);
743  if (currentSubstr == "ss-" ||
744  currentSubstr == "ds-" ||
745  currentSubstr == "ms-") {
746  offsets.strand = Int4(it - begin(locusLine));
747  it = sFindNextNonSpace(locusLine, space_it);
748  }
749  offsets.molecule = Int4(it - begin(locusLine));
750  } else {
751  offsets.molecule = Int4(it - begin(locusLine));
752  }
753 
754  // topology
755  it = sFindNextSpace(locusLine, it);
756  it = sFindNextNonSpace(locusLine, it);
757  if (it != locusLine.end()) {
758  offsets.topology = Int4(it - begin(locusLine));
759  }
760 
761  // find division
762  it = sFindNextSpace(locusLine, it);
763  it = sFindNextNonSpace(locusLine, it);
764  if (it != locusLine.end()) {
765  offsets.div = Int4(it - begin(locusLine));
766  }
767 
768  // find date - date is optional
769  it = sFindNextSpace(locusLine, it);
770  it = sFindNextNonSpace(locusLine, it);
771  if (it != locusLine.end()) {
772  offsets.date = Int4(it - begin(locusLine));
773  }
774 }
775 
776 /**********************************************************
777  *
778  * IndexblkPtr InitialEntry(pp, finfo):
779  *
780  * Assign the entry's value to offset, locusname,
781  * bases, linenum, drop blocusname.
782  * Swiss-prot locusname checking is different from
783  * others.
784  * Check LOCUS line column position, genbank format.
785  *
786  **********************************************************/
788 {
789  Int2 i;
790  Int2 j;
791  const char* bases;
792  IndexblkPtr entry;
793  char* p;
794 
795  entry = new Indexblk;
796 
797  entry->offset = finfo.pos;
798  entry->linenum = finfo.line;
799  entry->ppp = pp;
800  entry->is_tsa = false;
801  entry->is_tls = false;
802  entry->is_pat = false;
803 
804  auto stoken = TokenString(finfo.str, ' ');
805 
806  bool badlocus = false;
807  if (stoken->num > 2) {
808  p = finfo.str;
809  if (pp->mode == Parser::EMode::Relaxed) {
810  sSetLocusLineOffsets(p, entry->lc);
811  } else {
812  if (StringLen(p) > 78 && p[28] == ' ' && p[63] == ' ' && p[67] == ' ') {
813  entry->lc.bases = ParFlat_COL_BASES_NEW;
814  entry->lc.bp = ParFlat_COL_BP_NEW;
818  entry->lc.div = ParFlat_COL_DIV_NEW;
819  entry->lc.date = ParFlat_COL_DATE_NEW;
820  } else {
821  entry->lc.bases = ParFlat_COL_BASES;
822  entry->lc.bp = ParFlat_COL_BP;
823  entry->lc.strand = ParFlat_COL_STRAND;
826  entry->lc.div = ParFlat_COL_DIV;
827  entry->lc.date = ParFlat_COL_DATE;
828  }
829  }
830 
831  auto ptr = stoken->list.begin();
832  ++ptr;
833  if (pp->format == Parser::EFormat::EMBL &&
834  next(ptr) != stoken->list.end() && *next(ptr) == "SV"s) {
835  for (i = 0, p = finfo.str; *p != '\0'; p++)
836  if (*p == ';' && p[1] == ' ')
837  i++;
838 
839  entry->embl_new_ID = true;
840  if (! ptr->empty() && ptr->back() == ';')
841  ptr->pop_back();
842 
843  FtaInstallPrefix(PREFIX_LOCUS, ptr->c_str());
844  FtaInstallPrefix(PREFIX_ACCESSION, ptr->c_str());
845 
846  if (i != 6 || (stoken->num != 10 && stoken->num != 11)) {
847  ErrPostStr(SEV_REJECT, ERR_FORMAT_BadlyFormattedIDLine, "The number of fields in this EMBL record's new ID line does not fit requirements.");
848  badlocus = true;
849  } else if (fta_check_embl_moltype(finfo.str) == false)
850  badlocus = true;
851  }
852 
853  StringCpy(entry->locusname, ptr->c_str());
854  StringCpy(entry->blocusname, entry->locusname);
855 
856  if (entry->embl_new_ID == false) {
859  }
860 
861  if (pp->mode != Parser::EMode::Relaxed && ! badlocus) {
862  if (pp->format == Parser::EFormat::SPROT) {
863  auto it = next(ptr);
864  if (it == stoken->list.end() || it->empty() ||
865  (! StringEquNI(it->c_str(), "preliminary", 11) &&
866  ! StringEquNI(it->c_str(), "unreviewed", 10)))
867  badlocus = CheckLocusSP(entry->locusname);
868  else
869  badlocus = false;
870  } else
871  badlocus = CheckLocus(entry->locusname, pp->source);
872  }
873  } else if (pp->mode != Parser::EMode::Relaxed) {
874  badlocus = true;
875  ErrPostStr(SEV_ERROR, ERR_LOCUS_NoLocusName, "No locus name for this entry");
876  }
877 
878  if (badlocus) {
879  p = StringChr(finfo.str, '\n');
880  if (p)
881  *p = '\0';
882  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped. LOCUS line = \"%s\".", finfo.str);
883  if (p)
884  *p = '\n';
885  delete entry;
886  return nullptr;
887  }
888 
889  bases = GetResidue(stoken.get());
890  if (bases)
891  entry->bases = (size_t)atoi(bases);
892 
893  if (pp->format == Parser::EFormat::GenBank &&
894  entry->lc.date > -1) {
895  /* last token in the LOCUS line is date of the update's data
896  */
897  auto it = stoken->list.begin();
898  for (i = 1; i < stoken->num; ++i)
899  ++it;
900  entry->date = GetUpdateDate(it->c_str(), pp->source);
901  }
902 
904  j = stoken->num - ((pp->format == Parser::EFormat::GenBank) ? 2 : 3);
905  auto it = stoken->list.begin();
906  for (i = 1; i < j; ++i)
907  ++it;
908 
909  if (pp->format == Parser::EFormat::EMBL) {
910  if (StringEquNI(it->c_str(), "TSA", 3))
911  entry->is_tsa = true;
912  else if (StringEquNI(it->c_str(), "PAT", 3))
913  entry->is_pat = true;
914  }
915 
916  ++it;
917 
918  if (StringEquNI(it->c_str(), "EST", 3))
919  entry->EST = true;
920  else if (StringEquNI(it->c_str(), "STS", 3))
921  entry->STS = true;
922  else if (StringEquNI(it->c_str(), "GSS", 3))
923  entry->GSS = true;
924  else if (StringEquNI(it->c_str(), "HTC", 3))
925  entry->HTC = true;
926  else if (StringEquNI(it->c_str(), "PAT", 3) &&
928  entry->is_pat = true;
929  }
930 
931  return (entry);
932 }
933 
934 /**********************************************************
935  *
936  * void DelNoneDigitTail(str):
937  *
938  * Delete any non digit characters from tail
939  * of string "str".
940  *
941  * 3-25-93
942  *
943  **********************************************************/
945 {
946  char* p;
947 
948  if (! str || *str == '\0')
949  return;
950 
951  for (p = str; *str != '\0'; str++)
952  if (*str >= '0' && *str <= '9')
953  p = str + 1;
954 
955  *p = '\0';
956 }
957 
/**********************************************************
 *
 *   void DelNonDigitTail(str):
 *
 *      Truncates "str" right after its last decimal digit,
 *   i.e. deletes any non-digit characters from its tail.
 *      A string that contains no digit at all consists
 *   entirely of such a tail and becomes empty - the same
 *   result the char* variant DelNoneDigitTail() produces.
 *
 **********************************************************/
void DelNonDigitTail(string& str)
{
    const auto lastDigit = str.find_last_of("0123456789");
    if (lastDigit == string::npos)
        str.clear();
    else
        str.resize(lastDigit + 1);
}
968 
969 /**********************************************************
970  *
971  * Here X is an alpha character, N - numeric one.
972  * Return values:
973  *
974  * 1 - XXN (AB123456)
975  * 2 - XX_N (NZ_123456)
976  * 3 - XXXXN (AAAA01000001)
977  * 4 - XX_XXXXN (NZ_AAAA01000001)
978  * 5 - XXXXXN (AAAAA1234512)
979  * 6 - XX_XXN (NZ_AB123456)
980  * 7 - XXXXNNSN (AAAA01S000001 - scaffolds)
981  * 8 - XXXXXXN (AAAAAA010000001)
982  * 9 - XXXXXXNNSN (AAAAAA01S0000001 - scaffolds)
983  * 0 - all others
984  *
985  */
986 
// True if "c" is one of the characters 'A'..'Z'.  A plain range comparison
// is used, so the result does not depend on the current locale.
inline bool sIsUpperAlpha(char c)
{
    return ! (c < 'A' || c > 'Z');
}
991 
993 {
994  const Char* p = acnum;
995 
996  if (! p || *p == '\0')
997  return 0;
998 
999  if (sIsUpperAlpha(p[0]) && sIsUpperAlpha(p[1])) {
1000  if (isdigit(p[2]))
1001  return 1;
1002 
1003  if (p[2] == '_') {
1004  if (isdigit(p[3])) {
1005  return 2;
1006  }
1007  if (sIsUpperAlpha(p[3]) && sIsUpperAlpha(p[4])) {
1008  if (sIsUpperAlpha(p[5]) && sIsUpperAlpha(p[6]) &&
1009  isdigit(p[7]))
1010  return 4;
1011  if (isdigit(p[5]))
1012  return 6;
1013  }
1014  return 0;
1015  }
1016 
1017  if (sIsUpperAlpha(p[2]) && sIsUpperAlpha(p[3])) {
1018  if (sIsUpperAlpha(p[4]) && sIsUpperAlpha(p[5]) &&
1019  isdigit(p[6])) {
1020  if (isdigit(p[7]) && p[8] == 'S' &&
1021  isdigit(p[9])) {
1022  return 9;
1023  }
1024  return 8;
1025  }
1026 
1027  if (isdigit(p[4])) {
1028  if (isdigit(p[5]) && p[6] == 'S' &&
1029  isdigit(p[7])) {
1030  return 7;
1031  }
1032  return 3;
1033  }
1034 
1035  if (sIsUpperAlpha(p[4]) && isdigit(p[5]))
1036  return 5;
1037  }
1038  }
1039  return 0;
1040 }
1041 
1042 
1043 /**********************************************************/
1044 static bool IsValidAccessPrefix(const char* acc, const char** accpref)
1045 {
1046  Int4 i = IsNewAccessFormat(acc);
1047  if (i == 0 || ! accpref)
1048  return false;
1049 
1050  if (2 < i && i < 10)
1051  return true;
1052 
1053  const char** b = accpref;
1054  for (; *b; b++) {
1055  if (StringEquN(acc, *b, StringLen(*b)))
1056  return true;
1057  }
1058 
1059  return false;
1060 }
1061 
1062 /**********************************************************/
1063 static bool fta_if_master_wgs_accession(const char* acnum, Int4 accformat)
1064 {
1065  const char* p;
1066 
1067  if (accformat == 3)
1068  p = acnum + 4;
1069  else if (accformat == 8)
1070  p = acnum + 6;
1071  else if (accformat == 4)
1072  p = acnum + 7;
1073  else
1074  return false;
1075 
1076  if (p[0] >= '0' && p[0] <= '9' && p[1] >= '0' && p[1] <= '9') {
1077  for (p += 2; *p == '0';)
1078  p++;
1079  if (*p == '\0')
1080  return true;
1081  return false;
1082  }
1083  return false;
1084 }
1085 
1086 
// Returns true if "accession" has the VDB WGS scaffold layout
//   4 letters + 2 digits + 'S' + 6 to 8 digits   (13 to 15 characters)
// where the digits after 'S' are not all zeros.
static bool s_IsVDBWGSScaffold(string_view accession)
{
    // 4+2+S+[6,7,8]
    if (accession.length() < 13 ||
        accession.length() > 15 ||
        accession[6] != 'S') {
        return false;
    }

    // The <cctype> classifiers are only defined for values representable as
    // unsigned char (or EOF); plain char may be negative, so convert first.
    const auto isLetter = [](const char c) { return isalpha(static_cast<unsigned char>(c)) != 0; };
    const auto isDigit  = [](const char c) { return isdigit(static_cast<unsigned char>(c)) != 0; };

    const auto prefixEnd   = begin(accession) + 4;
    const auto versionEnd  = begin(accession) + 6;
    const auto serialBegin = begin(accession) + 7;

    // the first 4 chars must be letters
    if (! all_of(begin(accession), prefixEnd, isLetter)) {
        return false;
    }

    // the next 2 chars must be digits
    if (! all_of(prefixEnd, versionEnd, isDigit)) {
        return false;
    }

    // everything after 'S' must be a digit ...
    if (! all_of(serialBegin, end(accession), isDigit)) {
        return false;
    }

    // ... and at least one of those digits must be non-zero
    return any_of(serialBegin, end(accession), [](const char c) { return c != '0'; });
}
1128 
// Refines a basic WGS classification (0 - project, 1 - contig, 3 - master)
// using the first letter of the accession prefix:
//   'G'        -> TSA-WGS:  0 -> 4,  1 -> 5,  3 -> 6
//   'K' or 'T' -> TLS-WGS:  0 -> 10, 1 -> 11, 3 -> 12
//   'I'        -> contig (1) becomes 8 (TSA-WGS contig, DDBJ)
//   'H'        -> contig (1) becomes 9 (TSA-WGS contig, EMBL)
// Any other combination, an `initialType` of -1 and an empty accession are
// returned unchanged.
static int s_RefineWGSType(std::string_view accession, int initialType)
{
    if (initialType == -1 || accession.empty()) {
        return initialType;
    }

    const char lead = accession[0];

    if (lead == 'G') { // TSA-WGS
        switch (initialType) {
        case 0:
            return 4;
        case 1:
            return 5;
        case 3:
            return 6;
        default:
            return initialType;
        }
    }

    // TLS: decided by the FIRST prefix letter, like every other class here.
    // Looking at the second letter would turn e.g. "HTAA..."/"ITAA..." TSA
    // contigs into TLS records and would miss "T???" prefixes.
    if (lead == 'K' || lead == 'T') {
        switch (initialType) {
        case 0:
            return 10;
        case 1:
            return 11;
        case 3:
            return 12;
        default:
            return initialType;
        }
    }

    if (initialType == 1) { // TSA contigs of the other collaborators
        if (lead == 'I') {
            return 8;
        }
        if (lead == 'H') {
            return 9;
        }
    }

    return initialType;
}
1173 
1174 /**********************************************************/
1175 /* Returns: 0 - if WGS project accession;
1176  * 1 - WGS contig accession;
1177  * 2 - WGS scaffold accession (2+6);
1178  * 3 - WGS master accession (XXXX00000000);
1179  * 4 - TSA-WGS project accession;
1180  * 5 - TSA-WGS contig accession
1181  * 6 - TSA-WGS master accession;
1182  * 7 - VDB WGS scaffold accession (4+2+S+[6,7,8]);
1183  * 8 - TSA-WGS contig DDBJ accession
1184  * 9 - TSA-WGS contig EMBL accession
1185  * 10 - TLS-WGS project accession;
1186  * 11 - TLS-WGS contig accession
1187  * 12 - TLS-WGS master accession;
1188  * -1 - something else.
1189  */
1190 int fta_if_wgs_acc(string_view accession)
1191 {
1192  if (accession.empty() || NStr::IsBlank(accession)) {
1193  return -1;
1194  }
1195 
1196  auto length = accession.length();
1197 
1198  if (length == 8 &&
1199  k_WgsScaffoldPrefix.find(accession.substr(0, 2)) != k_WgsScaffoldPrefix.end() &&
1200  all_of(begin(accession) + 2, end(accession), [](const char c) { return isdigit(c); })) {
1201  return 2;
1202  }
1203 
1204  if (length > 12 && length < 16 && accession[6] == 'S') {
1205  if (s_IsVDBWGSScaffold(accession)) {
1206  return 7;
1207  }
1208  return -1;
1209  }
1210 
1211  if (accession.substr(0, 3) == "NZ_"sv) {
1212  accession = accession.substr(3);
1213  }
1214  length = accession.length();
1215  if (length < 12 || length > 17) {
1216  return -1;
1217  }
1218 
1219  if (isdigit(accession[4])) {
1220  if (all_of(begin(accession), begin(accession) + 4, [](const char c) { return isalpha(c); }) &&
1221  all_of(begin(accession) + 4, end(accession), [](const char c) { return isdigit(c); })) {
1222 
1223  int i = -1;
1224  if (any_of(begin(accession) + 6, end(accession), [](const char c) { return c != '0'; })) {
1225  i = 1; // WGS contig
1226  } else if (accession[4] == '0' && accession[5] == '0') {
1227  i = 3; // WGS master
1228  } else {
1229  i = 0; // WGS project
1230  }
1231  return s_RefineWGSType(accession, i);
1232  }
1233  return -1;
1234  }
1235 
1236  // 6 letters + 2 digits
1237  if (all_of(begin(accession), begin(accession) + 6, [](const char c) { return isalpha(c); }) &&
1238  all_of(begin(accession) + 6, end(accession), [](const char c) { return isdigit(c); })) {
1239 
1240  if (any_of(begin(accession) + 8, end(accession), [](const char c) { return c != '0'; })) {
1241  return 1; // WGS contig
1242  }
1243 
1244  if (accession[6] == '0' && accession[7] == '0') {
1245  return 3; // WGS master
1246  }
1247  return 0; // WGS project
1248  }
1249 
1250  return -1; // unknown
1251 }
1252 
1253 /**********************************************************/
1254 bool IsSPROTAccession(const char* acc)
1255 {
1256  const char** b;
1257 
1258  if (! acc || acc[0] == '\0')
1259  return false;
1260  size_t len = StringLen(acc);
1261  if (len != 6 && len != 8 && len != 10)
1262  return false;
1263  if (len == 8) {
1264  for (b = sprot_accpref; *b; b++) {
1265  if (StringEquN(*b, acc, 2))
1266  break;
1267  }
1268 
1269  return (*b != nullptr);
1270  }
1271 
1272  if (acc[0] < 'A' || acc[0] > 'Z' || acc[1] < '0' || acc[1] > '9' ||
1273  ((acc[3] < '0' || acc[3] > '9') && (acc[3] < 'A' || acc[3] > 'Z')) ||
1274  ((acc[4] < '0' || acc[4] > '9') && (acc[4] < 'A' || acc[4] > 'Z')) ||
1275  acc[5] < '0' || acc[5] > '9')
1276  return false;
1277 
1278  if (acc[0] >= 'O' && acc[0] <= 'Q') {
1279  if ((acc[2] < '0' || acc[2] > '9') && (acc[2] < 'A' || acc[2] > 'Z'))
1280  return false;
1281  } else if (acc[2] < 'A' || acc[2] > 'Z')
1282  return false;
1283 
1284  if (len == 6)
1285  return true;
1286 
1287  if (acc[0] >= 'O' && acc[0] <= 'Q')
1288  return false;
1289 
1290  if (acc[6] < 'A' || acc[6] > 'Z' || acc[9] < '0' || acc[9] > '9' ||
1291  ((acc[7] < 'A' || acc[7] > 'Z') && (acc[7] < '0' || acc[7] > '9')) ||
1292  ((acc[8] < 'A' || acc[8] > 'Z') && (acc[8] < '0' || acc[8] > '9')))
1293  return false;
1294 
1295  return true;
1296 }
1297 
1298 #if 0
1299 static bool sCheckAccession(const list<string>& tokens,
1302  const char* priacc, int skip)
1303 {
1304  bool badac;
1305  bool res = true;
1306  bool iswgs;
1307  Char acnum[200];
1308  Int4 accformat;
1309  Int4 priformat;
1310  Int4 count;
1311  size_t i;
1312 
1313  if (! priacc || mode == Parser::EMode::Relaxed)
1314  return true;
1315 
1316  auto it = tokens.begin();
1317  if (skip) {
1318  advance(it, skip);
1319  }
1320 
1321  priformat = IsNewAccessFormat(priacc);
1322  if((priformat == 3 || priformat == 4 || priformat == 8) &&
1323  fta_if_master_wgs_accession(priacc, priformat) == false)
1324  iswgs = true;
1325  else
1326  iswgs = false;
1327 
1328  count = 0;
1329  for(; it != tokens.end(); ++it)
1330  {
1331  StringCpy(acnum, it->c_str());
1332  if(acnum[0] == '-' && acnum[1] == '\0')
1333  continue;
1334 
1335  if(skip == 2 && count == 0)
1336  accformat = priformat;
1337  else
1338  accformat = IsNewAccessFormat(acnum);
1339 
1340  size_t len = StringLen(acnum);
1341  if(acnum[len-1] == ';')
1342  {
1343  len--;
1344  acnum[len] = '\0';
1345  }
1346  badac = false;
1347  if(accformat == 1)
1348  {
1349  if(len != 8 && len != 10)
1350  badac = true;
1351  else
1352  {
1353  for(i = 2; i < 8 && badac == false; i++)
1354  if(acnum[i] < '0' || acnum[i] > '9')
1355  badac = true;
1356  }
1357  }
1358  else if(accformat == 2)
1359  {
1360  if(len != 9 && len != 12)
1361  badac = true;
1362  else
1363  {
1364  for(i = 3; i < len && badac == false; i++)
1365  if(acnum[i] < '0' || acnum[i] > '9')
1366  badac = true;
1367  }
1368  }
1369  else if(accformat == 3)
1370  {
1371  if(len < 12 || len > 14)
1372  badac = true;
1373  else
1374  {
1375  for(i = 4; i < len && badac == false; i++)
1376  if(acnum[i] < '0' || acnum[i] > '9')
1377  badac = true;
1378  }
1379  }
1380  else if(accformat == 8)
1381  {
1382  if(len < 15 || len > 17)
1383  badac = true;
1384  else
1385  {
1386  for(i = 6; i < len && !badac; i++)
1387  if(acnum[i] < '0' || acnum[i] > '9')
1388  badac = true;
1389  }
1390  }
1391  else if(accformat == 4)
1392  {
1393  if(len < 15 || len > 17)
1394  badac = true;
1395  else
1396  {
1397  for(i = 7; i < len && badac == false; i++)
1398  if(acnum[i] < '0' || acnum[i] > '9')
1399  badac = true;
1400  }
1401  }
1402  else if(accformat == 5)
1403  {
1404  if(len != 12)
1405  badac = true;
1406  else
1407  {
1408  for(i = 5; i < len && badac == false; i++)
1409  if(acnum[i] < '0' || acnum[i] > '9')
1410  badac = true;
1411  }
1412  }
1413  else if(accformat == 6)
1414  {
1415  if(len != 11 || acnum[0] != 'N' || acnum[1] != 'Z' ||
1416  acnum[2] != '_' || acnum[3] < 'A' || acnum[3] > 'Z' ||
1417  acnum[4] < 'A' || acnum[4] > 'Z')
1418  badac = true;
1419  else
1420  {
1421  for(i = 5; i < len && badac == false; i++)
1422  if(acnum[i] < '0' || acnum[i] > '9')
1423  badac = true;
1424  }
1425  }
1426  else if(accformat == 7)
1427  {
1428  if(len < 13 || len > 15)
1429  badac = true;
1430  else
1431  {
1432  for(i = 7; i < len && badac == false; i++)
1433  if(acnum[i] < '0' || acnum[i] > '9')
1434  badac = true;
1435  }
1436  }
1437  else if(accformat == 0)
1438  {
1439  if(len != 6 && len != 10)
1440  badac = true;
1441  else if(acnum[0] >= 'A' && acnum[0] <= 'Z')
1442  {
1444  {
1445  if(!IsSPROTAccession(acnum))
1446  badac = true;
1447  }
1448  else if(len == 10)
1449  {
1450  badac = true;
1451  }
1452  else
1453  {
1454  for(i = 1; i < 6 && badac == false; i++)
1455  if(acnum[i] < '0' || acnum[i] > '9')
1456  badac = true;
1457  }
1458  }
1459  else
1460  badac = true;
1461  }
1462  else
1463  badac = true;
1464 
1465  if(badac)
1466  {
1468  "Bad accession #, %s for this entry", acnum);
1469  res = false;
1470  count++;
1471  continue;
1472  }
1473 
1474  if(skip == 2 && count == 0 && !iswgs &&
1475  (accformat == 3 || accformat == 4 || accformat == 8))
1476  {
1478  "This record has a WGS 'project' accession as its primary accession number. WGS project-accessions are only expected to be used as secondary accession numbers.");
1479  res = false;
1480  }
1481  count++;
1482  }
1483 
1484  return(res);
1485 }
1486 #endif
1487 
1488 
// Returns true when the half-open range [first, last) holds at least one
// character that is not a decimal digit; an empty range yields false.
inline bool sNotAllDigits(const char* first, const char* last)
{
    return std::any_of(first, last, [](char c) {
        // isdigit() is undefined for negative arguments: widen through
        // unsigned char so bytes >= 0x80 are handled safely.
        return ! std::isdigit(static_cast<unsigned char>(c));
    });
}
1493 
1494 /**********************************************************
 *
 * static bool CheckAccession(stoken, source, mode, priacc, skip):
 *
 *   Validates the accession tokens of stoken->list.
 *   Each token is classified with IsNewAccessFormat() and
 *   must then have the length and the all-digit tail that
 *   the code below requires for that format code. Every
 *   offending token is reported with
 *   ERR_ACCESSION_BadAccessNum and makes the result false;
 *   the scan continues so that all bad tokens get reported.
 *
 *   Nothing is checked (result is true) when priacc is
 *   null or mode is Parser::EMode::Relaxed.
 *
 *   When skip > 0 the first token of the list is not
 *   examined. When skip == 2 the first examined token is
 *   treated as the primary accession: it reuses the format
 *   code computed for priacc and is additionally rejected
 *   (ERR_ACCESSION_WGSProjectAccIsPri) when it has format
 *   3, 4 or 8 and `iswgs` (see below) is false.
 *
 *   Returns true when no problem was reported.
 *
 *   7-6-93
 *
 **********************************************************/
1506 static bool CheckAccession(
1507  TokenStatBlkPtr stoken,
1510  const char* priacc,
1511  unsigned skip)
1512 {
1513  bool badac;
1514  bool res = true;
1515  bool iswgs;
1516  Char acnum[200];
1517  Int4 accformat;
1518  Int4 priformat;
1519  Int4 count;
1520 
1521  if (! priacc || mode == Parser::EMode::Relaxed)
1522  return true;
1523 
// Step over the first token only (even when skip == 2).
1524  auto tbp = stoken->list.begin();
1525  if (skip > 0)
1526  ++tbp;
1527  priformat = IsNewAccessFormat(priacc);
// iswgs: the primary accession has format 3, 4 or 8 and
// fta_if_master_wgs_accession() returned false for it.
1528  if ((priformat == 3 || priformat == 4 || priformat == 8) &&
1529  fta_if_master_wgs_accession(priacc, priformat) == false)
1530  iswgs = true;
1531  else
1532  iswgs = false;
1533 
// count = number of tokens examined so far (lone "-" tokens excluded).
1534  count = 0;
1535  for (; tbp != stoken->list.end(); ++tbp) {
// NOTE(review): unbounded copy into a 200-char buffer; a longer token
// would overflow acnum - confirm that tokens are length-limited upstream.
1536  StringCpy(acnum, tbp->c_str());
// A lone "-" is skipped (presumably the range separator - confirm).
1537  if (acnum[0] == '-' && acnum[1] == '\0')
1538  continue;
1539 
1540  if (skip == 2 && count == 0)
1541  accformat = priformat;
1542  else
1543  accformat = IsNewAccessFormat(acnum);
1544 
// Drop one trailing ';'.
// NOTE(review): for an empty token len is 0 and acnum[len - 1] reads
// before the start of the buffer - confirm empty tokens cannot occur.
1545  size_t len = StringLen(acnum);
1546  if (acnum[len - 1] == ';') {
1547  len--;
1548  acnum[len] = '\0';
1549  }
1550  badac = false;
// Per-format rule: allowed total length, then all digits from a fixed
// offset. Format 1 checks positions 2..7 only, the others run to the end.
1551  if (accformat == 1) {
1552  badac = (len != 8 && len != 10) || sNotAllDigits(acnum + 2, acnum + 8);
1553  } else if (accformat == 2) {
1554  badac = (len != 9 && len != 12) || sNotAllDigits(acnum + 3, acnum + len);
1555  } else if (accformat == 3) {
1556  badac = (len < 12 || len > 14) || sNotAllDigits(acnum + 4, acnum + len);
1557  } else if (accformat == 8) {
1558  badac = (len < 15 || len > 17) || sNotAllDigits(acnum + 6, acnum + len);
1559  } else if (accformat == 4) {
1560  badac = (len < 15 || len > 17) || sNotAllDigits(acnum + 7, acnum + len);
1561  } else if (accformat == 5) {
1562  badac = (len != 12) || sNotAllDigits(acnum + 5, acnum + len);
1563  } else if (accformat == 6) {
// Format 6: "NZ_" + 2 upper-case letters + 6 digits (11 characters).
1564  badac = (len != 11 || acnum[0] != 'N' || acnum[1] != 'Z' ||
1565  acnum[2] != '_' || acnum[3] < 'A' || acnum[3] > 'Z' ||
1566  acnum[4] < 'A' || acnum[4] > 'Z') ||
1567  sNotAllDigits(acnum + 5, acnum + len);
1568  } else if (accformat == 7) {
1569  badac = (len < 13 || len > 15) || sNotAllDigits(acnum + 7, acnum + len);
1570  } else if (accformat == 9) {
1571  badac = (len < 16 || len > 17) || sNotAllDigits(acnum + 9, acnum + len);
1572  } else if (accformat == 0) {
// Format 0: length 6 or 10 starting with an upper-case letter.
// SPROT sources defer to IsSPROTAccession(); all other sources accept
// only the 6-character form, 1 letter + 5 digits.
1573  if (len != 6 && len != 10)
1574  badac = true;
1575  else if (sIsUpperAlpha(acnum[0])) {
1576  if (source == Parser::ESource::SPROT) {
1577  if (! IsSPROTAccession(acnum))
1578  badac = true;
1579  } else {
1580  badac = (len == 10) || sNotAllDigits(acnum + 1, acnum + 6);
1581  }
1582  } else
1583  badac = true;
1584  } else
1585  badac = true;
1586 
1587  if (badac) {
1588  ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum, "Bad accession #, %s for this entry", acnum);
1589  res = false;
1590  count++;
1591  continue;
1592  }
1593 
// Only the first examined token (the primary accession when skip == 2)
// can trigger this rejection.
1594  if (skip == 2 && count == 0 && ! iswgs &&
1595  (accformat == 3 || accformat == 4 || accformat == 8)) {
1596  ErrPostStr(SEV_REJECT, ERR_ACCESSION_WGSProjectAccIsPri, "This record has a WGS 'project' accession as its primary accession number. WGS project-accessions are only expected to be used as secondary accession numbers.");
1597  res = false;
1598  }
1599  count++;
1600  }
1601 
1602  return (res);
1603 }
1604 
1605 /**********************************************************/
1606 static bool IsPatentedAccPrefix(const Parser& parseInfo, const char* acc)
1607 {
1608  if (acc[2] == '\0') {
1609  if ((StringEqu(acc, "AR") || StringEqu(acc, "DZ") ||
1610  StringEqu(acc, "EA") || StringEqu(acc, "GC") ||
1611  StringEqu(acc, "GP") || StringEqu(acc, "GV") ||
1612  StringEqu(acc, "GX") || StringEqu(acc, "GY") ||
1613  StringEqu(acc, "GZ") || StringEqu(acc, "HJ") ||
1614  StringEqu(acc, "HK") || StringEqu(acc, "HL") ||
1615  StringEqu(acc, "KH") || StringEqu(acc, "MI") ||
1616  StringEqu(acc, "MM") || StringEqu(acc, "MO") ||
1617  StringEqu(acc, "MV") || StringEqu(acc, "MX") ||
1618  StringEqu(acc, "MY") || StringEqu(acc, "OO") ||
1619  StringEqu(acc, "OS") || StringEqu(acc, "OT") ||
1620  StringEqu(acc, "PR") || StringEqu(acc, "PT") ||
1621  StringEqu(acc, "PU")) &&
1622  (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1623  return true;
1624  if ((StringEquN(acc, "AX", 2) || StringEquN(acc, "CQ", 2) ||
1625  StringEquN(acc, "CS", 2) || StringEquN(acc, "FB", 2) ||
1626  StringEquN(acc, "HA", 2) || StringEquN(acc, "HB", 2) ||
1627  StringEquN(acc, "HC", 2) || StringEquN(acc, "HD", 2) ||
1628  StringEquN(acc, "HH", 2) || StringEquN(acc, "GM", 2) ||
1629  StringEquN(acc, "GN", 2) || StringEquN(acc, "JA", 2) ||
1630  StringEquN(acc, "JB", 2) || StringEquN(acc, "JC", 2) ||
1631  StringEquN(acc, "JD", 2) || StringEquN(acc, "JE", 2) ||
1632  StringEquN(acc, "HI", 2) || StringEquN(acc, "LP", 2) ||
1633  StringEquN(acc, "LQ", 2) || StringEquN(acc, "MP", 2) ||
1634  StringEquN(acc, "MQ", 2) || StringEquN(acc, "MR", 2) ||
1635  StringEquN(acc, "MS", 2)) &&
1636  (parseInfo.all == true || parseInfo.source == Parser::ESource::EMBL))
1637  return true;
1638  if ((StringEquN(acc, "BD", 2) || StringEquN(acc, "DD", 2) ||
1639  StringEquN(acc, "DI", 2) || StringEquN(acc, "DJ", 2) ||
1640  StringEquN(acc, "DL", 2) || StringEquN(acc, "DM", 2) ||
1641  StringEquN(acc, "FU", 2) || StringEquN(acc, "FV", 2) ||
1642  StringEquN(acc, "FW", 2) || StringEquN(acc, "FZ", 2) ||
1643  StringEquN(acc, "GB", 2) || StringEquN(acc, "HV", 2) ||
1644  StringEquN(acc, "HW", 2) || StringEquN(acc, "HZ", 2) ||
1645  StringEquN(acc, "LF", 2) || StringEquN(acc, "LG", 2) ||
1646  StringEquN(acc, "LV", 2) || StringEquN(acc, "LX", 2) ||
1647  StringEquN(acc, "LY", 2) || StringEquN(acc, "LZ", 2) ||
1648  StringEquN(acc, "MA", 2) || StringEquN(acc, "MB", 2) ||
1649  StringEquN(acc, "MC", 2) || StringEquN(acc, "MD", 2) ||
1650  StringEquN(acc, "ME", 2) || StringEquN(acc, "OF", 2) ||
1651  StringEquN(acc, "OG", 2) || StringEquN(acc, "OI", 2) ||
1652  StringEquN(acc, "OJ", 2) || StringEquN(acc, "PA", 2) ||
1653  StringEquN(acc, "PE", 2) || StringEquN(acc, "PF", 2) ||
1654  StringEquN(acc, "PG", 2) || StringEquN(acc, "PH", 2) ||
1655  StringEquN(acc, "PI", 2) || StringEquN(acc, "PJ", 2) ||
1656  StringEquN(acc, "PK", 2)) &&
1657  (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1658  return true;
1659 
1660  return false;
1661  }
1662 
1663  if (acc[1] == '\0' && (*acc == 'I' || *acc == 'A' || *acc == 'E')) {
1664  if (parseInfo.all == true ||
1665  (*acc == 'I' && parseInfo.source == Parser::ESource::NCBI) ||
1666  (*acc == 'A' && parseInfo.source == Parser::ESource::EMBL) ||
1667  (*acc == 'E' && parseInfo.source == Parser::ESource::DDBJ))
1668  return true;
1669  }
1670  return false;
1671 }
1672 
1673 /**********************************************************/
1674 static bool IsTPAAccPrefix(const Parser& parseInfo, const char* acc)
1675 {
1676  if (! acc)
1677  return (false);
1678 
1679  size_t i = StringLen(acc);
1680  if (i != 2 && i != 4)
1681  return (false);
1682 
1683  if (i == 4) {
1684  if (acc[0] == 'D' &&
1685  (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1686  return (true);
1687  if ((acc[0] == 'E' || acc[0] == 'Y') &&
1688  (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1689  return (true);
1690  return (false);
1691  }
1692 
1693  if (fta_StringMatch(ncbi_tpa_accpref, acc) >= 0 &&
1694  (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1695  return (true);
1696  if (fta_StringMatch(ddbj_tpa_accpref, acc) >= 0 &&
1697  (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1698  return (true);
1699  return (false);
1700 }
1701 
1702 /**********************************************************/
1703 static bool IsWGSAccPrefix(const Parser& parseInfo, const char* acc)
1704 {
1705  if (! acc || StringLen(acc) != 2)
1706  return (false);
1707 
1708  if (fta_StringMatch(ncbi_wgs_accpref, acc) >= 0 &&
1709  (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1710  return (true);
1711  if (fta_StringMatch(ddbj_wgs_accpref, acc) >= 0 &&
1712  (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1713  return (true);
1714  return (false);
1715 }
1716 
1717 /**********************************************************/
1718 static void IsTSAAccPrefix(const Parser& parseInfo, const char* acc, IndexblkPtr ibp)
1719 {
1720  if (! acc || *acc == '\0')
1721  return;
1722 
1723  if (parseInfo.source == Parser::ESource::EMBL ||
1724  parseInfo.source == Parser::ESource::DDBJ) {
1725  ibp->tsa_allowed = true;
1726  return;
1727  }
1728 
1729  if (acc[0] == 'U' && acc[1] == '\0' &&
1730  (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI)) {
1731  ibp->tsa_allowed = true;
1732  return;
1733  }
1734 
1735  if (StringLen(acc) != 2 && StringLen(acc) != 4)
1736  return;
1737 
1738  if (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI) {
1739  if ((StringLen(acc) == 2 &&
1740  (StringEqu(acc, "EZ") || StringEqu(acc, "HP") ||
1741  StringEqu(acc, "JI") || StringEqu(acc, "JL") ||
1742  StringEqu(acc, "JO") || StringEqu(acc, "JP") ||
1743  StringEqu(acc, "JR") || StringEqu(acc, "JT") ||
1744  StringEqu(acc, "JU") || StringEqu(acc, "JV") ||
1745  StringEqu(acc, "JW") || StringEqu(acc, "KA"))) ||
1746  fta_if_wgs_acc(ibp->acnum) == 5) {
1747  ibp->is_tsa = true;
1748  ibp->tsa_allowed = true;
1749  }
1750  if (fta_StringMatch(acc_tsa_allowed, acc) >= 0)
1751  ibp->tsa_allowed = true;
1752  }
1753 
1754  if (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ) {
1755  if (StringEquN(acc, "FX", 2) || StringEquN(acc, "LA", 2) ||
1756  StringEquN(acc, "LE", 2) || StringEquN(acc, "LH", 2) ||
1757  StringEquN(acc, "LI", 2) || StringEquN(acc, "LJ", 2) ||
1758  fta_if_wgs_acc(ibp->acnum) == 8) {
1759  ibp->is_tsa = true;
1760  ibp->tsa_allowed = true;
1761  }
1762  }
1763 
1764  if (parseInfo.all == true || parseInfo.source == Parser::ESource::EMBL) {
1765  if (fta_if_wgs_acc(ibp->acnum) == 9) {
1766  ibp->is_tsa = true;
1767  ibp->tsa_allowed = true;
1768  }
1769  }
1770 }
1771 
1772 /**********************************************************/
1773 static void IsTLSAccPrefix(const Parser& parseInfo, const char* acc, IndexblkPtr ibp)
1774 {
1775  if (! acc || *acc == '\0' || StringLen(acc) != 4)
1776  return;
1777 
1778  if (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI ||
1779  parseInfo.source == Parser::ESource::DDBJ)
1780  if (fta_if_wgs_acc(ibp->acnum) == 11)
1781  ibp->is_tls = true;
1782 }
1783 /*
1784 static bool sIsAccPrefixChar(char c) {
1785  return (c >= 'A' && c <= 'Z');
1786 }
1787 */
1788 /**********************************************************
1789  *
1790  * bool GetAccession(pp, str, entry, skip):
1791  *
1792  * Only record the first line of the first accession
1793  * number.
1794  * PIR format, accession number does not follow
1795  * the rule.
1796  *
1797  * 3-4-93
1798  *
1799  **********************************************************/
1800 /*
1801 bool GetAccession(const Parser& parseInfo, const CTempString& str, IndexblkPtr entry, unsigned skip)
1802 {
1803  string accession;
1804  list<string> tokens;
1805  bool get = true;
1806 
1807  if ((skip != 2 && parseInfo.source == Parser::ESource::Flybase) ||
1808  parserInfo.source == Parser::ESource::USPTO)
1809  return true;
1810 
1811  NStr::Split(str, " ;", tokens, NStr::fSplit_Tokenize);
1812 
1813 
1814  if (skip != 2) {
1815  get = ParseAccessionRange(tokens, skip);
1816  if (get)
1817  get = sCheckAccession(tokens, parseInfo.source, parseInfo.mode, entry->acnum, skip);
1818  if (! get)
1819  entry->drop = true;
1820 
1821  if (tokens.size() > skip && skip < 2) { // Not sure about the logic
1822  auto it = tokens.begin();
1823  if (skip > 0)
1824  it = next(it, skip);
1825  move(it, tokens.end(), entry->secondary_accessions.end());
1826  }
1827  return get;
1828  }
1829 
1830  // skip == 2
1831  entry->is_tpa = false;
1832  if (tokens.size() < 2) {
1833  if (parseInfo.mode != Parser::EMode::Relaxed) {
1834  ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum, "No accession # for this entry, about line %ld", (long int)entry->linenum);
1835  entry->drop = true;
1836  }
1837  return false;
1838  }
1839 
1840 
1841  accession = *next(tokens.begin());
1842  sDelNonDigitTail(accession);
1843 
1844  StringCpy(entry->acnum, accession.c_str());
1845 
1846  if (parseInfo.format != Parser::EFormat::XML) {
1847  string temp = accession;
1848  if (parseInfo.accver && entry->vernum > 0) {
1849  temp += "." + NStr::NumericToString(entry->vernum);
1850  }
1851  if (temp.empty()) {
1852  if (entry->locusname[0] != '\0') {
1853  temp = entry->locusname;
1854  } else {
1855  temp = "???";
1856  }
1857  }
1858  FtaInstallPrefix(PREFIX_ACCESSION, temp.c_str());
1859  }
1860 
1861  if (parseInfo.source == Parser::ESource::Flybase) {
1862  return true;
1863  }
1864 
1865  if (accession.size() < 2) {
1866  ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum, "Wrong accession [%s] for this entry.", accession.c_str());
1867  entry->drop = true;
1868  return false;
1869  }
1870 
1871  if (sIsAccPrefixChar(accession[0]) && sIsAccPrefixChar(accession[1])) {
1872  if (parseInfo.accpref && ! IsValidAccessPrefix(accession.c_str(), parseInfo.accpref)) {
1873  get = false;
1874  }
1875 
1876  if (sIsAccPrefixChar(accession[2]) && sIsAccPrefixChar(accession[3])) {
1877  if (sIsAccPrefixChar(accession[4])) {
1878  accession = accession.substr(0, 5);
1879  } else {
1880  accession = accession.substr(0, 4);
1881  }
1882  } else if (accession[2] == '_') {
1883  accession = accession.substr(0, 3);
1884  } else {
1885  accession = accession.substr(0, 2);
1886  }
1887  } else {
1888  if (parseInfo.acprefix && ! StringChr(parseInfo.acprefix, accession[0])) {
1889  get = false;
1890  }
1891  accession = accession.substr(0, 1);
1892  }
1893 
1894  if (get) {
1895  if (tokens.size() > 2) {
1896  get = ParseAccessionRange(tokens, 2);
1897  if (get) {
1898  get = sCheckAccession(tokens, parseInfo.source, parseInfo.mode, entry->acnum, 2);
1899  }
1900  }
1901  } else {
1902  string sourceName = sourceNames.at(parseInfo.source);
1903  ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum, "Wrong accession # prefix [%s] for this source: %s", accession.c_str(), sourceName.c_str());
1904  }
1905 
1906  entry->secondary_accessions.clear(); // Is this necessary?
1907  move(next(tokens.begin(), 2), tokens.end(), entry->secondary_accessions.begin());
1908 
1909  if (! entry->is_pat) {
1910  entry->is_pat = IsPatentedAccPrefix(parseInfo, accession.c_str());
1911  }
1912  entry->is_tpa = IsTPAAccPrefix(parseInfo, accession.c_str());
1913  entry->is_wgs = IsWGSAccPrefix(parseInfo, accession.c_str());
1914  IsTSAAccPrefix(parseInfo, accession.c_str(), entry);
1915  IsTLSAccPrefix(parseInfo, accession.c_str(), entry);
1916 
1917  auto i = IsNewAccessFormat(entry->acnum);
1918  if (i == 3 || i == 8) {
1919  entry->is_wgs = true;
1920  entry->wgs_and_gi |= 02;
1921  } else if (i == 5) {
1922  char* p = entry->acnum;
1923  if (parseInfo.source != Parser::ESource::DDBJ || *p != 'A' || StringLen(p) != 12 ||
1924  ! StringEqu(p + 5, "0000000")) {
1925  string sourceName = sourceNames.at(parseInfo.source);
1926  ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum, "Wrong accession \"%s\" for this source: %s", p, sourceName.c_str());
1927  get = false;
1928  }
1929  entry->is_mga = true;
1930  }
1931 
1932  if (! get)
1933  entry->drop = true;
1934 
1935  return get;
1936 }
1937 */
1938 
1939 
// Parses an accession line `str` for the entry being indexed.
//
//  skip != 2: the tokens of `str` are validated (ParseAccessionRange, then
//             CheckAccession) and, after dropping the first token when
//             skip == 1, appended to entry->secaccs.
//  skip == 2: the second token is taken as the primary accession and stored
//             in entry->acnum; its prefix is validated against the tables in
//             `pp`; the remaining tokens become entry->secaccs; and the
//             is_pat / is_tpa / is_wgs / is_tsa / is_tls / is_mga flags of
//             the entry are derived from the prefix and the accession.
//
// Returns false, and usually sets entry->drop, when a problem was reported.
1940 bool GetAccession(const Parser* pp, string_view str, IndexblkPtr entry, unsigned skip)
1941 {
1942  Char acc[200];
1943  bool get = true;
1944 
// Early exit for sources whose accession lines are not processed here.
1945  if ((skip != 2 && pp->source == Parser::ESource::Flybase) ||
1947  return true;
1948 
// TokenString() needs a NUL-terminated string, hence the copy of `str`.
1949  string line(str);
1950  auto stoken = TokenString(line.c_str(), ';');
1951 
1952  if (skip != 2) {
1953  get = ParseAccessionRange(stoken.get(), skip);
1954  if (get)
1955  get = CheckAccession(stoken.get(), pp->source, pp->mode, entry->acnum, skip);
1956  if (! get)
1957  entry->drop = true;
1958 
// skip == 1: discard the leading token, then fall into the skip == 0 case.
1959  if (skip == 1 && ! stoken->list.empty()) {
1960  stoken->list.pop_front();
1961  skip = 0;
1962  }
// Walk to the last node of entry->secaccs and splice the tokens in after it.
1963  if (skip == 0 && ! stoken->list.empty()) {
1964  auto tail = entry->secaccs.before_begin();
1965  for (; next(tail) != entry->secaccs.end();)
1966  ++tail;
1967  entry->secaccs.splice_after(tail, stoken->list);
1968  }
1969 
1970  return (get);
1971  }
1972 
// ---- skip == 2 ----
1973  entry->is_tpa = false;
1974  acc[0] = '\0';
// At least two tokens are required; the second one is the accession.
1975  if (stoken->num < 2) {
1976  if (pp->mode != Parser::EMode::Relaxed) {
1977  ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum, "No accession # for this entry, about line %ld", (long int)entry->linenum);
1978  entry->drop = true;
1979  }
1980  return false;
1981  }
1982 
// NOTE(review): unbounded copy into a 200-char buffer (and below into
// entry->acnum, whose size is not visible here) - confirm that token
// length is limited upstream.
1983  StringCpy(acc, next(stoken->list.begin())->c_str()); /* get first accession */
1984 
1985  if (pp->mode != Parser::EMode::Relaxed) {
1986  DelNoneDigitTail(acc);
1987  }
1988 
1989  StringCpy(entry->acnum, acc);
1990 
// Install "accession[.version]" as the error-message prefix; fall back to
// the locus name or "???" when the accession is empty.
1991  if (pp->format != Parser::EFormat::XML) {
1992  string temp = acc;
1993  if (pp->accver && entry->vernum > 0) {
1994  temp += '.';
1995  temp += to_string(entry->vernum);
1996  }
1997 
1998  if (temp.empty()) {
1999  if (entry->locusname[0] != '\0')
2000  temp = entry->locusname;
2001  else
2002  temp = "???";
2003  }
2004  FtaInstallPrefix(PREFIX_ACCESSION, temp.c_str());
2005  }
2006 
2007  if (pp->source == Parser::ESource::Flybase) {
2008  return true;
2009  }
2010 
2011  if ((StringLen(acc) < 2) &&
2012  pp->mode != Parser::EMode::Relaxed) {
2013  ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum, "Wrong accession [%s] for this entry.", acc);
2014  entry->drop = true;
2015  return false;
2016  }
2017 
// Validate the prefix and truncate `acc` in place so that only the prefix
// remains: 5 or 4 letters, "XX_", 2 letters, or 1 character (old format).
// NOTE(review): in Relaxed mode `acc` is left untouched, so the prefix
// classifiers below receive the whole accession.
2018  if (pp->mode != Parser::EMode::Relaxed) {
2019  if (acc[0] >= 'A' && acc[0] <= 'Z' && acc[1] >= 'A' && acc[1] <= 'Z') {
2020  if (pp->accpref && ! IsValidAccessPrefix(acc, pp->accpref))
2021  get = false;
2022  if (acc[2] >= 'A' && acc[2] <= 'Z' && acc[3] >= 'A' && acc[3] <= 'Z') {
2023  if (acc[4] >= 'A' && acc[4] <= 'Z') {
2024  acc[5] = '\0';
2025  } else {
2026  acc[4] = '\0';
2027  }
2028  } else if (acc[2] == '_') {
2029  acc[3] = '\0';
2030  } else {
2031  acc[2] = '\0';
2032  }
2033  } else {
2034  /* Processing of accession numbers in old format */
2035  /* check valid prefix accession number */
2036  if (pp->acprefix && ! StringChr(pp->acprefix, *acc))
2037  get = false;
2038  acc[1] = '\0';
2039  }
2040  }
2041 
// Prefix accepted: validate the accession tokens (ranges are parsed only
// when there are more than two tokens). Otherwise report the bad prefix.
2042  if (get) {
2043  if (stoken->num > 2)
2044  get = ParseAccessionRange(stoken.get(), 2);
2045  if (get) {
2046  get = CheckAccession(stoken.get(), pp->source, pp->mode, entry->acnum, 2);
2047  }
2048  } else {
2049  string sourceName = sourceNames.at(pp->source);
2050  ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum, "Wrong accession # prefix [%s] for this source: %s", acc, sourceName.c_str());
2051  }
2052 
// Drop the first two tokens (the second one is the accession copied above);
// whatever is left becomes the entry's secondary accession list.
2053  stoken->list.pop_front();
2054  stoken->list.pop_front();
2055  entry->secaccs = std::move(stoken->list);
2056  stoken.reset();
2057 
// Derive the record-class flags from the prefix. is_pat is only ever
// switched on here, never cleared.
2058  if (! entry->is_pat)
2059  entry->is_pat = IsPatentedAccPrefix(*pp, acc);
2060  entry->is_tpa = IsTPAAccPrefix(*pp, acc);
2061  entry->is_wgs = IsWGSAccPrefix(*pp, acc);
2062  IsTSAAccPrefix(*pp, acc, entry);
2063  IsTLSAccPrefix(*pp, acc, entry);
2064 
// Format codes 3 and 8 force is_wgs. Format 5 marks the entry as MGA and is
// accepted only for DDBJ, with an accession of 12 characters that starts
// with 'A' and ends in seven zeros.
2065  auto i = IsNewAccessFormat(entry->acnum);
2066  if (i == 3 || i == 8) {
2067  entry->is_wgs = true;
2068  entry->wgs_and_gi |= 02;
2069  } else if (i == 5) {
2070  const char* p = entry->acnum;
2071  if (pp->source != Parser::ESource::DDBJ || *p != 'A' || StringLen(p) != 12 ||
2072  ! StringEqu(p + 5, "0000000")) {
2073  string sourceName = sourceNames.at(pp->source);
2074  ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum, "Wrong accession \"%s\" for this source: %s", p, sourceName.c_str());
2075  get = false;
2076  }
2077  entry->is_mga = true;
2078  }
2079 
2080  if (! get)
2081  entry->drop = true;
2082 
2083  return (get);
2084 }
2085 
2086 /**********************************************************/
2088 {
2089  if (! pp)
2090  return;
2091 
2092  if (! pp->entrylist.empty()) {
2093  for (auto* ibp : pp->entrylist)
2094  if (ibp)
2095  FreeIndexblk(ibp);
2096 
2097  pp->entrylist.clear();
2098  }
2099 
2100  pp->indx = 0;
2101  pp->curindx = 0;
2102 
2103  if (pp->pbp) {
2104  if (pp->pbp->ibp)
2105  delete pp->pbp->ibp;
2106  delete pp->pbp;
2107  pp->pbp = nullptr;
2108  }
2109 }
2110 
2111 /**********************************************************
2112  *
2113  * void FreeParser(pp):
2114  *
2115  * 3-5-93
2116  *
2117  **********************************************************/
2118 /*
2119 void FreeParser(ParserPtr pp)
2120 {
2121  if (! pp)
2122  return;
2123 
2124  ResetParserStruct(pp);
2125 
2126  if (pp->fpo)
2127  MemFree(pp->fpo);
2128  delete pp;
2129 }
2130 */
2131 
2132 /**********************************************************
2133  *
2134  * void CloseFiles(pp):
2135  *
2136  * 3-4-93
2137  *
2138  **********************************************************/
2140 {
2141  if (pp->qsfd) {
2142  fclose(pp->qsfd);
2143  pp->qsfd = nullptr;
2144  }
2145 }
2146 
2147 /**********************************************************
2148  *
2149  * void MsgSkipTitleFail(flatfile, finfo):
2150  *
2151  * 7-2-93
2152  *
2153  **********************************************************/
2154 void MsgSkipTitleFail(const char* flatfile, FinfoBlk& finfo)
2155 {
2156  ErrPostEx(SEV_ERROR, ERR_ENTRY_Begin, "No valid beginning of entry found in %s file", flatfile);
2157 
2158  // delete finfo;
2159 }
2160 
2161 
2162 bool FindNextEntryBuf(bool end_of_file, FileBuf& fbuf, FinfoBlk& finfo, const CTempString& keyword)
2163 {
2164  const char* p = keyword.data();
2165  size_t len = keyword.size();
2166  bool done = end_of_file;
2167  while (! done && ! StringEquN(finfo.str, p, len))
2168  done = XReadFileBuf(fbuf, finfo);
2169 
2170  return (done);
2171 }
2172 
2173 
2174 /**********************************************************
 *
 * bool FlatFileIndex(pp, (*fun)()):
 *
 *   Dispatches to the indexing routine that matches
 *   pp->format (GenBankIndex, EmblIndex, SprotIndex or
 *   XMLIndex) and returns its result. The callback `fun`
 *   is forwarded to EmblIndex and SprotIndex only.
 *   For any other format a message is written to stderr
 *   and false is returned.
 *
 *   10-6-93
 *
 **********************************************************/
2181 bool FlatFileIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len))
2182 {
2183  bool index;
2184 
2185  switch (pp->format) {
2187  index = GenBankIndex(pp);
2188  break;
2189  case Parser::EFormat::EMBL:
2190  index = EmblIndex(pp, fun);
2191  break;
2193  index = SprotIndex(pp, fun);
2194  break;
2195  case Parser::EFormat::XML:
2196  index = XMLIndex(pp);
2197  break;
2198  default:
// Unsupported format: report on stderr and signal failure.
2199  index = false;
2200  fprintf(stderr, "Unknown flatfile format.\n");
2201  break;
2202  }
2203  return (index);
2204 }
2205 
2206 /**********************************************************/
2208 {
2210  return (embl_accpref);
2212  return (sprot_accpref);
2214  return (lanl_accpref);
2216  return (ddbj_accpref);
2218  return (ncbi_accpref);
2220  return (refseq_accpref);
2221  return nullptr;
2222 }
2223 
2225 {
2226  switch (type) {
2227  case CSeq_id::e_Genbank:
2228  case CSeq_id::e_Ddbj:
2229  case CSeq_id::e_Embl:
2230  case CSeq_id::e_Other:
2231  case CSeq_id::e_Tpg:
2232  case CSeq_id::e_Tpd:
2233  case CSeq_id::e_Tpe:
2234  return true;
2235  default:
2236  break;
2237  }
2238 
2239  return false;
2240 }
2241 
2242 
2243 /**********************************************************/
2245 {
2246  auto info = CSeq_id::IdentifyAccession(acc);
2247  if (CSeq_id::fAcc_prot & info) {
2248  return CSeq_id::e_not_set;
2249  }
2250 
2251  if (auto type = CSeq_id::GetAccType(info);
2253  return type;
2254  }
2255 
2256  return CSeq_id::e_not_set;
2257 }
2258 
2259 
2260 /**********************************************************/
2262 {
2263  auto info = CSeq_id::IdentifyAccession(acc);
2264  if (CSeq_id::fAcc_prot & info) {
2265  if (auto type = CSeq_id::GetAccType(info);
2267  return type;
2268  }
2269  }
2270 
2271  return CSeq_id::e_not_set;
2272 }
2273 
void FreeIndexblk(IndexblkPtr ibp)
Definition: block.cpp:143
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTime –.
Definition: ncbitime.hpp:296
Definition: map.hpp:338
Definition: set.hpp:45
static const char fp[]
Definition: des.c:87
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:89
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
void FtaInstallPrefix(int prefix, const char *name, const char *location)
Definition: ftaerr.cpp:321
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:562
@ fAcc_prot
Definition: Seq_id.hpp:252
#define NCBI_UNUSED
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
Definition: tempstr.hpp:306
static int StringToNonNegativeInt(const CTempString str, TStringToNumFlags flags=0)
Convert string to non-negative integer value.
Definition: ncbistr.cpp:457
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
const char * const_iterator
Definition: tempstr.hpp:71
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ ParFlat_COL_DATE
Definition: index.h:49
@ ParFlat_COL_STRAND_NEW
Definition: index.h:54
@ ParFlat_COL_MOLECULE_NEW
Definition: index.h:55
@ ParFlat_COL_BP_NEW
Definition: index.h:53
@ ParFlat_COL_TOPOLOGY_NEW
Definition: index.h:56
@ ParFlat_COL_BP
Definition: index.h:44
@ ParFlat_COL_TOPOLOGY
Definition: index.h:47
@ ParFlat_COL_DATE_NEW
Definition: index.h:58
@ ParFlat_COL_DIV
Definition: index.h:48
@ ParFlat_COL_BASES
Definition: index.h:43
@ ParFlat_COL_DIV_NEW
Definition: index.h:57
@ ParFlat_COL_BASES_NEW
Definition: index.h:52
@ ParFlat_COL_MOLECULE
Definition: index.h:46
@ ParFlat_COL_STRAND
Definition: index.h:45
static int s_RefineWGSType(string_view accession, int initialType)
Definition: indx_blk.cpp:1129
static const char * ParFlat_NA_array_DDBJ[]
Definition: indx_blk.cpp:66
USING_SCOPE(objects)
bool sIsUpperAlpha(char c)
Definition: indx_blk.cpp:987
static const char * ddbj_accpref[]
Definition: indx_blk.cpp:104
static const char * ddbj_wgs_accpref[]
Definition: indx_blk.cpp:164
static const char * ncbi_tpa_accpref[]
Definition: indx_blk.cpp:152
NCBI_UNUSED bool SkipTitle(FILE *fp, FinfoBlk &finfo, const char *str, size_t len)
Definition: indx_blk.cpp:341
static bool CheckAccession(TokenStatBlkPtr stoken, Parser::ESource source, Parser::EMode mode, const char *priacc, unsigned skip)
Definition: indx_blk.cpp:1506
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:610
static CTempString::const_iterator sFindNextSpace(const CTempString &tempString, CTempString::const_iterator current_it)
Definition: indx_blk.cpp:680
static bool s_IsVDBWGSScaffold(string_view accession)
Definition: indx_blk.cpp:1087
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2244
static const set< string_view > k_WgsScaffoldPrefix
Definition: indx_blk.cpp:168
static const char * GetResidue(TokenStatBlkPtr stoken)
Definition: indx_blk.cpp:235
int fta_if_wgs_acc(string_view accession)
Definition: indx_blk.cpp:1190
static const char * XML_STRAND_array[]
Definition: indx_blk.cpp:58
static bool IsWGSAccPrefix(const Parser &parseInfo, const char *acc)
Definition: indx_blk.cpp:1703
static const char * ParFlat_NA_array[]
Definition: indx_blk.cpp:74
const char ** GetAccArray(Parser::ESource source)
Definition: indx_blk.cpp:2207
static const char * refseq_accpref[]
Definition: indx_blk.cpp:135
bool isSupportedAccession(CSeq_id::E_Choice type)
Definition: indx_blk.cpp:2224
static const char * acc_tsa_allowed[]
Definition: indx_blk.cpp:146
int CheckTPG(const string &str)
Definition: indx_blk.cpp:501
static void sSetLocusLineOffsets(const CTempString &locusLine, LocusCont &offsets)
Definition: indx_blk.cpp:695
void ResetParserStruct(ParserPtr pp)
Definition: indx_blk.cpp:2087
Int2 CheckNA(const char *str)
Definition: indx_blk.cpp:525
bool FindNextEntryBuf(bool end_of_file, FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:2162
static CTempString::const_iterator sFindNextNonSpace(const CTempString &tempString, CTempString::const_iterator current_it)
Definition: indx_blk.cpp:688
bool sNotAllDigits(const char *first, const char *last)
Definition: indx_blk.cpp:1489
static const char * sprot_accpref[]
Definition: indx_blk.cpp:100
CSeq_id::E_Choice GetProtAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2261
static const char * XML_TPG_array[]
Definition: indx_blk.cpp:62
int CheckSTRAND(const string &str)
Definition: indx_blk.cpp:466
bool XMLIndex(ParserPtr pp)
Definition: xm_index.cpp:1401
static const char * ParFlat_RESIDUE_STR[]
Definition: indx_blk.cpp:192
static const char * ParFlat_DIV_array[]
Definition: indx_blk.cpp:81
static Int2 FileGetsBuf(char *res, Int4 size, FileBuf &fbuf)
Definition: indx_blk.cpp:288
IndexblkPtr InitialEntry(ParserPtr pp, FinfoBlk &finfo)
Definition: indx_blk.cpp:787
bool SprotIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char *offset, Int4 len))
Definition: sp_index.cpp:109
static const char * lanl_accpref[]
Definition: indx_blk.cpp:96
void DelNonDigitTail(string &str)
Definition: indx_blk.cpp:958
static const char * month_name[]
Definition: indx_blk.cpp:188
bool GenBankIndex(ParserPtr pp)
Definition: gb_index.cpp:337
static const char * ddbj_tpa_accpref[]
Definition: indx_blk.cpp:156
bool SkipTitleBuf(FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:357
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:992
bool EmblIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char *offset, Int4 len))
Definition: em_index.cpp:192
static const map< Parser::ESource, string > sourceNames
Definition: indx_blk.cpp:177
static bool CheckLocusSP(const char *locus)
Definition: indx_blk.cpp:418
Int2 XMLCheckTPG(string_view str)
Definition: indx_blk.cpp:490
Int2 CheckDIV(const char *str)
Definition: indx_blk.cpp:531
bool FlatFileIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char *offset, Int4 len))
Definition: indx_blk.cpp:2181
bool GetAccession(const Parser *pp, string_view str, IndexblkPtr entry, unsigned skip)
Definition: indx_blk.cpp:1940
static const char * ValidMolTypes[]
Definition: indx_blk.cpp:196
void CloseFiles(ParserPtr pp)
Definition: indx_blk.cpp:2139
static void IsTLSAccPrefix(const Parser &parseInfo, const char *acc, IndexblkPtr ibp)
Definition: indx_blk.cpp:1773
bool IsSPROTAccession(const char *acc)
Definition: indx_blk.cpp:1254
static bool isSpace(char c)
Definition: indx_blk.cpp:674
static void IsTSAAccPrefix(const Parser &parseInfo, const char *acc, IndexblkPtr ibp)
Definition: indx_blk.cpp:1718
Int2 XMLCheckSTRAND(string_view str)
Definition: indx_blk.cpp:484
static bool fta_check_embl_moltype(char *str)
Definition: indx_blk.cpp:632
void DelNoneDigitTail(char *str)
Definition: indx_blk.cpp:944
bool XReadFileBuf(FileBuf &fbuf, FinfoBlk &finfo)
Definition: indx_blk.cpp:313
bool CkLocusLinePos(char *offset, Parser::ESource source, LocusContPtr lcp, bool is_mga)
Definition: indx_blk.cpp:537
static bool XReadFile(FILE *fp, FinfoBlk &finfo)
Definition: indx_blk.cpp:262
static bool IsValidAccessPrefix(const char *acc, const char **accpref)
Definition: indx_blk.cpp:1044
void MsgSkipTitleFail(const char *flatfile, FinfoBlk &finfo)
Definition: indx_blk.cpp:2154
static bool IsTPAAccPrefix(const Parser &parseInfo, const char *acc)
Definition: indx_blk.cpp:1674
static bool CheckLocus(const char *locus, Parser::ESource source)
Definition: indx_blk.cpp:378
static const char * embl_accpref[]
Definition: indx_blk.cpp:87
static const char * ncbi_accpref[]
Definition: indx_blk.cpp:114
static const char * ncbi_wgs_accpref[]
Definition: indx_blk.cpp:160
static bool CkDateFormat(const char *date)
Definition: indx_blk.cpp:453
static bool IsPatentedAccPrefix(const Parser &parseInfo, const char *acc)
Definition: indx_blk.cpp:1606
static bool fta_if_master_wgs_accession(const char *acnum, Int4 accformat)
Definition: indx_blk.cpp:1063
Int2 CheckNADDBJ(const char *str)
Definition: indx_blk.cpp:519
static const char * ParFlat_AA_array_DDBJ[]
Definition: indx_blk.cpp:70
#define ERR_FORMAT_BadlyFormattedIDLine
Definition: indx_err.h:58
#define ERR_ENTRY_Begin
Definition: indx_err.h:63
#define ERR_ACCESSION_WGSProjectAccIsPri
Definition: indx_err.h:69
#define ERR_ACCESSION_NoAccessNum
Definition: indx_err.h:68
#define ERR_FORMAT_LocusLinePosition
Definition: indx_err.h:43
#define ERR_LOCUS_BadLocusName
Definition: indx_err.h:74
#define ERR_ACCESSION_BadAccessNum
Definition: indx_err.h:67
#define ERR_FORMAT_IllegalCAGEMoltype
Definition: indx_err.h:57
#define ERR_LOCUS_NoLocusName
Definition: indx_err.h:75
#define ERR_FORMAT_InvalidIDlineMolType
Definition: indx_err.h:59
int i
yy_size_t n
int len
static MDB_envinfo info
Definition: mdb_load.c:37
mdb_mode_t mode
Definition: lmdb++.h:38
const struct ncbi::grid::netcache::search::fields::SIZE size
const CharType(& source)[N]
Definition: pointer.h:1149
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
static PCRE2_SIZE * offsets
Definition: pcre2grep.c:266
#define count
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
const char * current
size_t get_offs() const
Char str[256]
Definition: indx_blk.h:42
size_t pos
Definition: indx_blk.h:44
Int4 line
Definition: indx_blk.h:43
Char acnum[200]
Definition: ftablock.h:166
Char division[4]
Definition: ftablock.h:171
Parser * ppp
Definition: ftablock.h:250
bool is_mga
Definition: ftablock.h:199
TokenBlkList secaccs
Definition: ftablock.h:216
bool tsa_allowed
Definition: ftablock.h:211
Int4 wgs_and_gi
Definition: ftablock.h:231
bool is_tls
Definition: ftablock.h:208
Char blocusname[200]
Definition: ftablock.h:178
CRef< objects::CDate_std > date
Definition: ftablock.h:187
Int2 vernum
Definition: ftablock.h:167
bool is_tpa
Definition: ftablock.h:206
bool embl_new_ID
Definition: ftablock.h:218
bool is_wgs
Definition: ftablock.h:205
bool STS
Definition: ftablock.h:193
bool is_pat
Definition: ftablock.h:202
bool HTC
Definition: ftablock.h:195
bool drop
Definition: ftablock.h:182
size_t bases
Definition: ftablock.h:172
bool is_tsa
Definition: ftablock.h:207
bool EST
Definition: ftablock.h:192
size_t linenum
Definition: ftablock.h:180
string wgssec
Definition: ftablock.h:236
size_t offset
Definition: ftablock.h:168
Char locusname[200]
Definition: ftablock.h:170
LocusCont lc
Definition: ftablock.h:212
bool GSS
Definition: ftablock.h:194
Int4 bases
Definition: ftablock.h:106
Int4 molecule
Definition: ftablock.h:109
Int4 strand
Definition: ftablock.h:108
Int4 topology
Definition: ftablock.h:110
Int4 date
Definition: ftablock.h:112
Int4 bp
Definition: ftablock.h:107
Int4 div
Definition: ftablock.h:111
const char * acprefix
vector< IndexblkPtr > entrylist
const char ** accpref
ProtBlkPtr pbp
InfoBioseq * ibp
Definition: ftablock.h:99
TokenBlkList list
Definition: ftablock.h:137
Definition: type.c:6
done
Definition: token1.c:1
Int2 StringMatchIcase(const Char **array, string_view text)
Definition: utilfun.cpp:507
Int2 MatchArraySubString(const Char **array, string_view text)
Definition: utilfun.cpp:578
unique_ptr< TokenStatBlk > TokenString(const char *str, Char delimiter)
Definition: utilfun.cpp:445
CRef< CDate_std > get_full_date(const char *s, bool is_ref, Parser::ESource source)
Definition: utilfun.cpp:827
bool ParseAccessionRange(list< string > &tokens, unsigned skip)
Definition: utilfun.cpp:265
Int2 fta_StringMatch(const Char **array, string_view text)
Definition: utilfun.cpp:486
Modified on Fri Sep 20 14:57:19 2024 by modify_doxy.py rev. 669887