NCBI C++ ToolKit
gb_index.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gb_index.cpp 99284 2023-03-06 16:28:57Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: gb_index.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Parsing genbank to memory blocks. Build Genbank format index block.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 
39 #include "index.h"
40 #include "genbank.h"
41 
42 #include "ftaerr.hpp"
43 #include "indx_blk.h"
44 #include "indx_def.h"
45 #include "utilfun.h"
46 #include "entry.h"
47 
48 #ifdef THIS_FILE
49 # undef THIS_FILE
50 #endif
51 #define THIS_FILE "gb_index.cpp"
52 
53 
55 
// Line-type keywords recognised at the start of a GenBank flat-file line.
// The table is handed to SrchKeyword() and CheckLineType() by the indexer.
// NOTE(review): the position of each keyword presumably has to line up with
// the matching ParFlat_* constant from genbank.h (the indexer compares the
// SrchKeyword() result against those constants) -- confirm before reordering
// or inserting entries.
vector<string> genbankKeywords = {
    // header section
    "LOCUS", "DEFINITION", "ACCESSION", "NID", "GSDB ID",
    "KEYWORDS", "SEGMENT", "SOURCE", "REFERENCE", "COMMENT",
    // features and sequence section
    "FEATURES", "BASE COUNT", "ORIGIN",
    // end-of-entry marker
    "//",
    // remaining line types
    "GSDBID", "CONTIG", "VERSION", "USER", "WGS",
    "PRIMARY", "MGA", "PROJECT", "DBLINK",
};
81 
82 
83 // LCOV_EXCL_START
84 // Excluded per Mark's request on 12/14/2016
85 /**********************************************************
86  *
87  * static bool DelSegnum(str, segnum, len2):
88  *
89  * Strip off segnum which has number of "len1" digits,
90  * then check if any tailing zero existed.
91  * Subroutine return:
92  * TRUE if
93  * - there is no tailing zero or
94  * - the number of the tailing zero is equal or greater
95  * than (len2-len1) (i.e. strip off len2-len1 of "0").
96  * FALSE and no change in the string "str" if
97  * - len2-len1 less than zero or
98  * - there is not enough "len1" digits at end of
99  * the string "str" or
100  * - there is not enough len2-len1 zero at end of
101  * the string "str".
102  *
103  * February 25 1993
104  *
105  **********************************************************/
106 static bool DelSegnum(IndexblkPtr entry, const char* segnum, size_t len2)
107 {
108  char* str;
109  const char* p;
110  char* q;
111 
112  if (! segnum)
113  return false;
114  size_t len1 = StringLen(segnum);
115  if (len2 < len1)
116  return false;
117 
118  /* check, is there enough digits to delete
119  */
120  size_t tlen = len1;
121  str = entry->blocusname;
122  size_t i = StringLen(str);
123  for (; tlen > 0; tlen--) {
124  char c = str[--i];
125  if (! ('0' <= c && c <= '9'))
126  break;
127  if (i <= 0)
128  return false;
129  }
130 
131  if (tlen > 0)
132  return false;
133 
134  if (len2 > len1 && str[i] == '0') {
135  /* check, is there enough "0" appended
136  */
137  for (tlen = len2 - len1; tlen > 0 && str[i] == '0'; i--)
138  tlen--;
139 
140  if (tlen != 0)
141  return false;
142  }
143 
144  for (q = &str[i + 1], p = q; *p == '0';)
145  p++;
146 
147  int j = atoi(segnum);
148  if (atoi(p) != j) {
149  ErrPostEx(SEV_REJECT, ERR_SEGMENT_BadLocusName, "Segment suffix in locus name \"%s\" does not match number in SEGMENT line = \"%d\". Entry dropped.", str, j);
150  entry->drop = true;
151  }
152 
153  *q = '\0'; /* strip off "len" characters */
154  return true;
155 }
156 
157 /**********************************************************/
158 static void GetSegment(char* str, IndexblkPtr entry)
159 {
160  TokenStatBlkPtr stoken;
161  TokenBlkPtr ptr2;
162  TokenBlkPtr ptr4;
163 
164  stoken = TokenString(str, ' ');
165 
166  if (stoken->num > 3) {
167  ptr2 = stoken->list->next;
168  ptr4 = ptr2->next->next;
169  entry->segnum = (Uint2)atoi(ptr2->str);
170 
171  if (! DelSegnum(entry, ptr2->str, StringLen(ptr4->str))) {
172  ErrPostEx(SEV_ERROR, ERR_SEGMENT_BadLocusName, "Bad locus name %s in %d", entry->blocusname, entry->linenum);
173  }
174 
175  entry->segtotal = (Uint2)atoi(ptr4->str);
176  } else {
177  ErrPostEx(SEV_ERROR, ERR_SEGMENT_IncompSeg, "Incomplete Segment information at linenum %d", entry->linenum);
178  }
179 
180  FreeTokenstatblk(stoken);
181 }
182 // LCOV_EXCL_STOP
183 
184 /**********************************************************/
185 static bool gb_err_field(const char* str)
186 {
187  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField, "No %s data in GenBank format file, entry dropped", str);
188  return true;
189 }
190 
191 /**********************************************************/
192 static void ParseGenBankVersion(IndexblkPtr entry, char* line, char* nid, Parser::ESource source, Parser::EMode mode, bool ign_toks)
193 {
194  bool gi;
195  char* p;
196  char* q;
197  char* r;
198  Char ch;
199  Char ch1;
200 
201  if (! line)
202  return;
203 
204  for (p = line; *p != '\0' && *p != ' ' && *p != '\t';)
205  p++;
206  gi = (*p == '\0') ? false : true;
207 
208  ch1 = *p;
209  *p = '\0';
210  q = StringRChr(line, '.');
211  if (! q) {
212  if (mode != Parser::EMode::Relaxed) {
213  *p = ch1;
214  ErrPostEx(SEV_FATAL, ERR_VERSION_MissingVerNum, "Missing VERSION number in VERSION line: \"%s\".", line);
215  entry->drop = true;
216  }
217  return;
218  }
219 
220  for (r = q + 1; *r >= '0' && *r <= '9';)
221  r++;
222  if (*r != '\0') {
223  if (mode != Parser::EMode::Relaxed) {
224  *p = ch1;
225  ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitVerNum, "Incorrect VERSION number in VERSION line: \"%s\".", line);
226  entry->drop = true;
227  }
228  return;
229  }
230  ch = *q;
231  *q = '\0';
232  if (! StringEqu(entry->acnum, line)) {
233  *q = ch;
234  *p = ch1;
235  if (mode != Parser::EMode::Relaxed) {
236  ErrPostEx(SEV_FATAL, ERR_VERSION_AccessionsDontMatch, "Accessions in VERSION and ACCESSION lines don't match: \"%s\" vs \"%s\".", line, entry->acnum);
237  entry->drop = true;
238  }
239  return;
240  }
241  entry->vernum = atoi(q + 1);
242  *q = ch;
243 
244  if (entry->vernum < 1) {
245  *p = ch1;
246  ErrPostEx(SEV_FATAL, ERR_VERSION_InvalidVersion, "Version number \"%d\" from Accession.Version value \"%s.%d\" is not a positive integer.", entry->vernum, entry->acnum, entry->vernum);
247  entry->drop = true;
248  return;
249  }
250 
251  if (ch1 != '\0')
252  for (*p++ = ch1; *p == ' ' || *p == '\t';)
253  p++;
254 
255  if (source == Parser::ESource::DDBJ) {
256  if (*p != '\0' && ! ign_toks) {
257  ErrPostEx(SEV_ERROR, ERR_VERSION_BadVersionLine, "DDBJ's VERSION line has too many tokens: \"%s\".", line);
258  }
259  return;
260  }
261 
262  if (! gi)
263  return;
264 
265  if (! StringEquN(p, "GI:", 3)) {
266  ErrPostEx(SEV_FATAL, ERR_VERSION_IncorrectGIInVersion, "Incorrect GI entry in VERSION line: \"%s\".", line);
267  entry->drop = true;
268  return;
269  }
270  p += 3;
271  for (q = p; *q >= '0' && *q <= '9';)
272  q++;
273  if (*q != '\0') {
274  ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitGI, "Incorrect GI number in VERSION line: \"%s\".", line);
275  entry->drop = true;
276  }
277 }
278 
279 /**********************************************************/
280 static bool fta_check_mga_line(char* line, IndexblkPtr ibp)
281 {
282  char* p;
283  char* q;
284  char* str;
285  Int4 from;
286  Int4 to;
287 
288  if (! line || ! ibp)
289  return false;
290 
291  for (p = line; *p == ' ' || *p == '\t';)
292  p++;
293  str = StringSave(p);
294  p = StringChr(str, '\n');
295  if (p)
296  *p = '\0';
297  p = StringChr(str, '-');
298  if (! p) {
299  MemFree(str);
300  return false;
301  }
302  *p++ = '\0';
303 
304  if (StringLen(str) != 12 || StringLen(p) != 12 ||
305  ! StringEquN(str, ibp->acnum, 5) ||
306  ! StringEquN(p, ibp->acnum, 5)) {
307  MemFree(str);
308  return false;
309  }
310 
311  for (q = str + 5; *q >= '0' && *q <= '9';)
312  q++;
313  if (*q != '\0') {
314  MemFree(str);
315  return false;
316  }
317  for (q = p + 5; *q >= '0' && *q <= '9';)
318  q++;
319  if (*q != '\0') {
320  MemFree(str);
321  return false;
322  }
323 
324  for (q = str + 5; *q == '0';)
325  q++;
326  from = atoi(q);
327  for (q = p + 5; *q == '0';)
328  q++;
329  to = atoi(q);
330 
331  if (from > to) {
332  MemFree(str);
333  return false;
334  }
335 
336  ibp->bases = to - from + 1;
337  MemFree(str);
338  return true;
339 }
340 
341 
342 /**********************************************************/
344 {
345  FinfoBlk finfo;
346 
347  bool acwflag;
348  bool end_of_file;
349  bool after_LOCUS;
350  bool after_DEFNTN;
351  bool after_SOURCE;
352  bool after_REFER;
353  bool after_FEAT;
354  bool after_ORIGIN;
355  bool after_COMMENT;
356  bool after_VERSION;
357  bool after_MGA;
358 
359  IndexblkPtr entry;
360  int currentKeyword;
361  Int4 indx = 0;
362  IndBlkNextPtr ibnp;
363  IndBlkNextPtr tibnp;
364  char* p;
365  char* q;
366  char* line_ver;
367  char* line_nid;
368  char* line_locus;
369  size_t i;
370  ValNodePtr kwds;
371  ValNodePtr tkwds;
372  ValNodePtr dbl;
373  ValNodePtr tdbl;
374 
375  end_of_file = SkipTitleBuf(pp->ffbuf, finfo, "LOCUS");
376 
377  if (end_of_file) {
378  MsgSkipTitleFail("GenBank", finfo);
379  return false;
380  }
381 
382  bool tpa_check = (pp->source == Parser::ESource::EMBL);
383 
384  ibnp = new IndBlkNode(nullptr);
385  tibnp = ibnp;
386 
387  pp->num_drop = 0;
388  kwds = nullptr;
389  dbl = nullptr;
390  while (! end_of_file) {
391  entry = InitialEntry(pp, finfo);
392  if (entry) {
393  pp->curindx = indx;
394  tibnp->next = new IndBlkNode(entry);
395  tibnp = tibnp->next;
396 
397  indx++;
398 
399  entry->is_contig = false;
400  entry->origin = false;
401  entry->is_mga = false;
402  acwflag = false;
403  after_LOCUS = false;
404  after_DEFNTN = false;
405  after_SOURCE = false;
406  after_REFER = false;
407  after_FEAT = false;
408  after_ORIGIN = false;
409  after_COMMENT = false;
410  after_VERSION = false;
411  after_MGA = false;
412 
413  currentKeyword = ParFlat_LOCUS;
414  line_ver = nullptr;
415  line_nid = nullptr;
416  line_locus = nullptr;
417  if (kwds)
418  kwds = ValNodeFreeData(kwds);
419  tkwds = nullptr;
420  size_t kwds_len = 0;
421  if (dbl)
422  dbl = ValNodeFreeData(dbl);
423  tdbl = nullptr;
424  size_t dbl_len = 0;
425  while (currentKeyword != ParFlat_END && ! end_of_file) {
426  switch (currentKeyword) {
427  case ParFlat_LOCUS:
428  if (after_LOCUS) {
429  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines LOCUS in one entry");
430  entry->drop = true;
431  } else {
432  after_LOCUS = true;
433  line_locus = StringSave(finfo.str);
434  }
435  break;
436  case ParFlat_COMMENT:
437  if (after_COMMENT) {
438  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "Multiple COMMENT lines in one entry");
439  entry->drop = true;
440  } else
441  after_COMMENT = true;
442 
443  break;
444  case ParFlat_VERSION:
445  p = StringStr(finfo.str + ParFlat_COL_DATA, "GI:");
446  if (p && atol(p + 3) > 0)
447  entry->wgs_and_gi |= 01;
448  if (pp->accver == false)
449  break;
450  if (after_VERSION) {
451  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "Multiple VERSION lines in one entry");
452  entry->drop = true;
453  break;
454  }
455  after_VERSION = true;
456  p = finfo.str + ParFlat_COL_DATA;
457  while (*p == ' ' || *p == '\t')
458  p++;
459  for (q = p; *q != '\0' && *q != '\r' && *q != '\n';)
460  q++;
461  while (q > p) {
462  q--;
463  if (*q != ' ' && *q != '\t') {
464  q++;
465  break;
466  }
467  }
468  i = q - p;
469  line_ver = MemNew(i + 1);
470  StringNCpy(line_ver, p, i);
471  line_ver[i] = '\0';
472  break;
473  case ParFlat_NCBI_GI:
474  if (pp->source == Parser::ESource::DDBJ || pp->accver == false || line_nid)
475  break;
476  p = finfo.str + ParFlat_COL_DATA;
477  while (*p == ' ' || *p == '\t')
478  p++;
479  for (q = p; *q != '\0' && *q != ' ' && *q != '\t' &&
480  *q != '\r' && *q != '\n';)
481  q++;
482  i = q - p;
483  line_nid = MemNew(i + 1);
484  StringNCpy(line_nid, p, i);
485  line_nid[i] = '\0';
486  break;
487  case ParFlat_DEFINITION:
488  if (after_DEFNTN) {
489  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'DEFINITION'");
490  entry->drop = true;
491  } else if (after_LOCUS == false) {
492  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "DEFINITION field out of order");
493  entry->drop = true;
494  } else
495  after_DEFNTN = true;
496 
497  break;
498  case ParFlat_SOURCE:
499  if (after_SOURCE) {
500  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'SOURCE'");
501  entry->drop = true;
502  } else if (after_LOCUS == false || after_DEFNTN == false) {
503  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "SOURCE field out of order");
504  entry->drop = true;
505  } else
506  after_SOURCE = true;
507 
508  break;
509  case ParFlat_REFERENCE:
510  after_REFER = true;
511  break;
512  case ParFlat_CONTIG:
513  if (entry->is_contig) {
514  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than one line CONTIG in one entry");
515  entry->drop = true;
516  } else
517  entry->is_contig = true;
518  break;
519  case ParFlat_MGA:
520  if (entry->is_mga == false) {
521  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"MGA\" is allowed for CAGE records only. Entry dropped.");
522  entry->drop = true;
523  }
524  if (fta_check_mga_line(finfo.str + ParFlat_COL_DATA, entry) == false) {
525  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectMGALine, "Incorrect range of accessions supplied in MGA line of CAGE record. Entry dropped.");
526  entry->drop = true;
527  }
528  after_MGA = true;
529  break;
530  case ParFlat_FEATURES:
531  if (after_FEAT) {
532  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'FEATURES'");
533  entry->drop = true;
534  } else if (pp->mode != Parser::EMode::Relaxed &&
535  (after_LOCUS == false ||
536  after_DEFNTN == false ||
537  after_SOURCE == false)) {
538  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "FEATURES field out of order");
539  entry->drop = true;
540  } else
541  after_FEAT = true;
542 
543  break;
544  case ParFlat_ORIGIN:
545  if (after_ORIGIN) {
546  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'ORIGIN'");
547  entry->drop = true;
548  } else if (
549  pp->mode != Parser::EMode::Relaxed &&
550  (after_LOCUS == false ||
551  after_DEFNTN == false ||
552  after_SOURCE == false ||
553  after_FEAT == false)) {
554  ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "ORIGIN field out of order");
555  entry->drop = true;
556  } else {
557  after_ORIGIN = true;
558  entry->origin = true;
559  }
560  break;
561  case ParFlat_ACCESSION:
562  if (acwflag == false) /* first accession line */
563  {
564  acwflag = true;
565  if (! GetAccession(pp, finfo.str, entry, 2)) {
566  if (pp->mode != Parser::EMode::Relaxed) {
567  pp->num_drop++;
568  }
569  }
570  }
571  break;
572  case ParFlat_SEGMENT:
573  // LCOV_EXCL_START
574  // Excluded per Mark's request on 12/14/2016
575  GetSegment(finfo.str, entry);
576  // LCOV_EXCL_STOP
577  break;
578  case ParFlat_USER:
579  if (pp->source != Parser::ESource::Flybase) {
580  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"USER\" is allowed for source \"FLYBASE\" only. Entry dropped.");
581  entry->drop = true;
582  }
583  break;
584  case ParFlat_PRIMARY:
585  if (entry->is_tpa == false &&
586  entry->tsa_allowed == false &&
588  ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"PRIMARY\" is allowed for TPA or TSA records only. Continue anyway.");
589  }
590  break;
591  case ParFlat_KEYWORDS:
592  if (pp->source != Parser::ESource::DDBJ &&
594  break;
595  if (kwds)
596  ValNodeFreeData(kwds);
598  tkwds = kwds;
599  kwds_len = StringLen(finfo.str) - 8;
600  break;
601  case ParFlat_DBLINK:
602  if (dbl)
603  ValNodeFreeData(dbl);
605  tdbl = dbl;
606  dbl_len = StringLen(finfo.str) - 8;
607  break;
608  default:
609  break;
610  } /* switch */
611 
612  end_of_file = XReadFileBuf(pp->ffbuf, finfo);
613 
614  while (! end_of_file && (finfo.str[0] == ' ' || finfo.str[0] == '\t')) {
615  if (currentKeyword == ParFlat_KEYWORDS && tkwds) {
616  tkwds = ValNodeNew(tkwds, finfo.str);
617  kwds_len += StringLen(finfo.str);
618  }
619 
620  if (currentKeyword == ParFlat_DBLINK && tdbl) {
621  tdbl = ValNodeNew(tdbl, finfo.str);
622  dbl_len += StringLen(finfo.str);
623  }
624 
625  if (currentKeyword == ParFlat_ACCESSION && ! entry->drop &&
626  GetAccession(pp, finfo.str, entry, 0) == false)
627  pp->num_drop++;
628 
629  end_of_file = XReadFileBuf(pp->ffbuf, finfo);
630  }
631 
632 
633  if (kwds) {
634  check_est_sts_gss_tpa_kwds(kwds, kwds_len, entry, tpa_check, entry->specialist_db, entry->inferential, entry->experimental, entry->assembly);
635  kwds = ValNodeFreeData(kwds);
636  kwds_len = 0;
637  }
638 
639  if (pp->mode == Parser::EMode::Relaxed &&
640  NStr::IsBlank(finfo.str)) {
641  currentKeyword = ParFlat_UNKW;
642  continue;
643  }
644 
645  currentKeyword = SrchKeyword(finfo.str, genbankKeywords);
646 
647  if (finfo.str[0] != ' ' && finfo.str[0] != '\t' &&
648  CheckLineType(finfo.str, finfo.line, genbankKeywords, after_ORIGIN) == false)
649  entry->drop = true;
650 
651  } /* while, end of one entry */
652 
653  entry->is_tpa_wgs_con = (entry->is_contig && entry->is_wgs && entry->is_tpa);
654 
655  if (! entry->drop) {
656 
657  if (pp->mode != Parser::EMode::Relaxed) {
658  if (line_locus &&
659  CkLocusLinePos(line_locus, pp->source, &entry->lc, entry->is_mga) == false)
660  entry->drop = true;
661 
662  if (entry->is_mga && after_MGA == false)
663  entry->drop = gb_err_field("MGA");
664 
665  if (after_LOCUS == false)
666  entry->drop = gb_err_field("LOCUS");
667 
668  if (after_VERSION == false && pp->accver)
669  entry->drop = gb_err_field("VERSION");
670 
671  if (after_DEFNTN == false)
672  entry->drop = gb_err_field("DEFINITION");
673 
674  if (after_SOURCE == false)
675  entry->drop = gb_err_field("SOURCE");
676 
677  if (after_REFER == false && pp->source != Parser::ESource::Flybase &&
678  entry->is_wgs == false &&
680  ! StringEquN(entry->acnum, "NW_", 3))) {
681  entry->drop = gb_err_field("REFERENCE");
682  }
683 
684  if (after_FEAT == false) {
685  entry->drop = gb_err_field("FEATURES");
686  }
687  } // !Parser::EMode::Relaxed
688 
689  if (entry->is_contig && entry->segnum != 0) {
690  ErrPostEx(SEV_ERROR, ERR_FORMAT_ContigInSegset, "CONTIG data are not allowed for members of segmented sets, entry dropped.");
691  entry->drop = true;
692  }
693  }
694  if (pp->accver) {
695  if (pp->mode == Parser::EMode::HTGSCON)
696  entry->vernum = 1;
697  else
699  entry,
700  line_ver,
701  line_nid,
702  pp->source,
703  pp->mode,
704  pp->ign_toks);
705  }
706  if (line_locus) {
707  MemFree(line_locus);
708  line_locus = nullptr;
709  }
710  if (line_ver) {
711  MemFree(line_ver);
712  line_ver = nullptr;
713  }
714  if (line_nid) {
715  MemFree(line_nid);
716  line_nid = nullptr;
717  }
718  entry->len = pp->ffbuf.get_offs() - entry->offset;
719 
720  if (acwflag == false &&
721  pp->mode != Parser::EMode::Relaxed) {
722  ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum, "No accession # for this entry, about line %ld", (long int)entry->linenum);
723  }
724 
725  if (dbl) {
726  dbl = ValNodeFreeData(dbl);
727  // dbl_len = 0;
728  }
729  } /* if, entry */
730  else {
731  end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "//");
732  }
733 
734  end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "LOCUS");
735 
736  } /* while, end_of_file */
737 
738  pp->indx = indx;
739 
741 
742  if (pp->qsfd && QSIndex(pp, ibnp->next) == false)
743  return false;
744 
745  pp->entrylist.resize(indx);
746  tibnp = ibnp->next;
747  delete ibnp;
748  for (int j = 0; j < indx && tibnp; j++, tibnp = ibnp) {
749  pp->entrylist[j] = tibnp->ibp;
750  ibnp = tibnp->next;
751  delete tibnp;
752  }
753 
754  return (end_of_file);
755 }
756 
bool QSIndex(ParserPtr pp, IndBlkNextPtr ibnp)
Definition: block.cpp:207
#define ERR_FORMAT_LineTypeOrder
Definition: flat2err.h:40
char * StringSave(const char *s)
Definition: ftacpp.hpp:61
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:106
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:96
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:75
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * MemNew(size_t sz)
Definition: ftacpp.hpp:43
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:78
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:344
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
vector< string > genbankKeywords
Definition: gb_index.cpp:56
static void ParseGenBankVersion(IndexblkPtr entry, char *line, char *nid, Parser::ESource source, Parser::EMode mode, bool ign_toks)
Definition: gb_index.cpp:192
static bool gb_err_field(const char *str)
Definition: gb_index.cpp:185
static bool DelSegnum(IndexblkPtr entry, const char *segnum, size_t len2)
Definition: gb_index.cpp:106
bool GenBankIndex(ParserPtr pp)
Definition: gb_index.cpp:343
static bool fta_check_mga_line(char *line, IndexblkPtr ibp)
Definition: gb_index.cpp:280
static void GetSegment(char *str, IndexblkPtr entry)
Definition: gb_index.cpp:158
@ ParFlat_ACCESSION
Definition: genbank.h:43
@ ParFlat_FEATURES
Definition: genbank.h:51
@ ParFlat_SOURCE
Definition: genbank.h:48
@ ParFlat_DBLINK
Definition: genbank.h:63
@ ParFlat_SEGMENT
Definition: genbank.h:47
@ ParFlat_COMMENT
Definition: genbank.h:50
@ ParFlat_REFERENCE
Definition: genbank.h:49
@ ParFlat_VERSION
Definition: genbank.h:57
@ ParFlat_LOCUS
Definition: genbank.h:41
@ ParFlat_NCBI_GI
Definition: genbank.h:44
@ ParFlat_USER
Definition: genbank.h:58
@ ParFlat_PRIMARY
Definition: genbank.h:60
@ ParFlat_END
Definition: genbank.h:54
@ ParFlat_KEYWORDS
Definition: genbank.h:46
@ ParFlat_DEFINITION
Definition: genbank.h:42
@ ParFlat_CONTIG
Definition: genbank.h:56
@ ParFlat_MGA
Definition: genbank.h:61
@ ParFlat_ORIGIN
Definition: genbank.h:53
#define ParFlat_COL_DATA
Definition: genbank.h:37
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_FATAL
Definition: gicache.c:93
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
bool FindNextEntryBuf(bool end_of_file, FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:2188
IndexblkPtr InitialEntry(ParserPtr pp, FinfoBlk &finfo)
Definition: indx_blk.cpp:788
bool SkipTitleBuf(FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:358
bool XReadFileBuf(FileBuf &fbuf, FinfoBlk &finfo)
Definition: indx_blk.cpp:314
bool CkLocusLinePos(char *offset, Parser::ESource source, LocusContPtr lcp, bool is_mga)
Definition: indx_blk.cpp:538
void MsgSkipTitleFail(const char *flatfile, FinfoBlk &finfo)
Definition: indx_blk.cpp:2180
#define ERR_VERSION_NonDigitVerNum
Definition: indx_err.h:83
#define ERR_VERSION_MissingVerNum
Definition: indx_err.h:82
#define ERR_ACCESSION_NoAccessNum
Definition: indx_err.h:68
#define ERR_SEGMENT_BadLocusName
Definition: indx_err.h:78
#define ERR_FORMAT_ContigInSegset
Definition: indx_err.h:47
#define ERR_SEGMENT_IncompSeg
Definition: indx_err.h:79
#define ERR_FORMAT_IncorrectMGALine
Definition: indx_err.h:56
#define ERR_VERSION_IncorrectGIInVersion
Definition: indx_err.h:86
#define ERR_VERSION_NonDigitGI
Definition: indx_err.h:87
#define ERR_VERSION_BadVersionLine
Definition: indx_err.h:85
#define ERR_VERSION_InvalidVersion
Definition: indx_err.h:88
#define ERR_FORMAT_MissingField
Definition: indx_err.h:42
#define ERR_ENTRY_InvalidLineType
Definition: indx_err.h:64
#define ERR_VERSION_AccessionsDontMatch
Definition: indx_err.h:84
@ e_not_set
int i
if(yy_accept[yy_current_state])
mdb_mode_t mode
Definition: lmdb++.h:38
const CharType(& source)[N]
Definition: pointer.h:1149
static const BitmapCharRec ch1
Definition: ncbi_10x20.c:1827
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
const CConstRef< CSeq_id > GetAccession(const CSeq_id_Handle &id_handle)
static const char * str(char *buf, int n)
Definition: stats.c:84
size_t get_offs() const
Char str[256]
Definition: indx_blk.h:42
Int4 line
Definition: indx_blk.h:43
Indexblk * ibp
Definition: indx_blk.h:56
IndBlkNode * next
Definition: indx_blk.h:57
Char acnum[200]
Definition: ftablock.h:169
bool assembly
Definition: ftablock.h:244
bool is_mga
Definition: ftablock.h:202
bool tsa_allowed
Definition: ftablock.h:214
Int4 wgs_and_gi
Definition: ftablock.h:234
Char blocusname[200]
Definition: ftablock.h:181
bool is_tpa_wgs_con
Definition: ftablock.h:212
Int2 vernum
Definition: ftablock.h:170
bool is_tpa
Definition: ftablock.h:209
bool is_wgs
Definition: ftablock.h:208
bool origin
Definition: ftablock.h:204
bool is_contig
Definition: ftablock.h:200
bool drop
Definition: ftablock.h:185
bool experimental
Definition: ftablock.h:250
size_t bases
Definition: ftablock.h:175
bool inferential
Definition: ftablock.h:248
Uint2 segtotal
Definition: ftablock.h:178
size_t linenum
Definition: ftablock.h:183
size_t len
Definition: ftablock.h:187
size_t offset
Definition: ftablock.h:171
bool specialist_db
Definition: ftablock.h:246
Uint2 segnum
Definition: ftablock.h:176
LocusCont lc
Definition: ftablock.h:215
vector< IndexblkPtr > entrylist
TokenBlk * next
Definition: ftablock.h:135
char * str
Definition: ftablock.h:134
TokenBlk * list
Definition: ftablock.h:140
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:1042
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
Definition: utilfun.cpp:1055
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
Definition: utilfun.cpp:1516
TokenStatBlkPtr TokenString(char *str, Char delimiter)
Definition: utilfun.cpp:489
void FreeTokenstatblk(TokenStatBlkPtr tsbp)
Definition: utilfun.cpp:542
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
Definition: utilfun.cpp:1582
#define ParFlat_UNKW
Definition: utilfun.h:44
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
Definition: valnode.cpp:53
ValNodePtr ValNodeFreeData(ValNodePtr vnp)
Definition: valnode.cpp:96
Modified on Wed Feb 28 07:11:12 2024 by modify_doxy.py rev. 669887