NCBI C++ ToolKit
gb_index.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gb_index.cpp 102964 2024-08-11 13:12:24Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: gb_index.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Parsing genbank to memory blocks. Build Genbank format index block.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 
39 #include "index.h"
40 #include "genbank.h"
41 
42 #include "ftaerr.hpp"
43 #include "indx_blk.h"
44 #include "indx_def.h"
45 #include "utilfun.h"
46 #include "entry.h"
47 
48 #ifdef THIS_FILE
49 # undef THIS_FILE
50 #endif
51 #define THIS_FILE "gb_index.cpp"
52 
53 
55 
/* Line-type keywords recognised at the start of a GenBank flat-file line.
 * This table is handed to SrchKeyword() and CheckLineType() by the indexer.
 *
 * NOTE(review): the position of each keyword appears to be matched against
 * the ParFlat_* keyword ids declared in genbank.h (the value returned by
 * SrchKeyword() is compared with them) - confirm before reordering,
 * inserting or removing entries.
 */
vector<string> genbankKeywords = {
    "LOCUS",      /*  0 */
    "DEFINITION", /*  1 */
    "ACCESSION",  /*  2 */
    "NID",        /*  3 */
    "GSDB ID",    /*  4 */
    "KEYWORDS",   /*  5 */
    "SEGMENT",    /*  6 */
    "SOURCE",     /*  7 */
    "REFERENCE",  /*  8 */
    "COMMENT",    /*  9 */
    "FEATURES",   /* 10 */
    "BASE COUNT", /* 11 */
    "ORIGIN",     /* 12 */
    "//",         /* 13 */
    "GSDBID",     /* 14 */
    "CONTIG",     /* 15 */
    "VERSION",    /* 16 */
    "USER",       /* 17 */
    "WGS",        /* 18 */
    "PRIMARY",    /* 19 */
    "MGA",        /* 20 */
    "PROJECT",    /* 21 */
    "DBLINK",     /* 22 */
};
81 
82 
83 // LCOV_EXCL_START
84 // Excluded per Mark's request on 12/14/2016
85 /**********************************************************
86  *
87  * static bool DelSegnum(str, segnum, len2):
88  *
89  * Strip off segnum which has number of "len1" digits,
90  * then check if any tailing zero existed.
91  * Subroutine return:
92  * TRUE if
93  * - there is no tailing zero or
94  * - the number of the tailing zero is equal or greater
95  * than (len2-len1) (i.e. strip off len2-len1 of "0").
96  * FALSE and no change in the string "str" if
97  * - len2-len1 less than zero or
98  * - there is not enough "len1" digits at end of
99  * the string "str" or
100  * - there is not enough len2-len1 zero at end of
101  * the string "str".
102  *
103  * February 25 1993
104  *
105  **********************************************************/
106 static bool DelSegnum(IndexblkPtr entry, const char* segnum, size_t len2)
107 {
108  char* str;
109  const char* p;
110  char* q;
111 
112  if (! segnum)
113  return false;
114  size_t len1 = StringLen(segnum);
115  if (len2 < len1)
116  return false;
117 
118  /* check, is there enough digits to delete
119  */
120  size_t tlen = len1;
121  str = entry->blocusname;
122  size_t i = StringLen(str);
123  for (; tlen > 0; tlen--) {
124  char c = str[--i];
125  if (! ('0' <= c && c <= '9'))
126  break;
127  if (i <= 0)
128  return false;
129  }
130 
131  if (tlen > 0)
132  return false;
133 
134  if (len2 > len1 && str[i] == '0') {
135  /* check, is there enough "0" appended
136  */
137  for (tlen = len2 - len1; tlen > 0 && str[i] == '0'; i--)
138  tlen--;
139 
140  if (tlen != 0)
141  return false;
142  }
143 
144  for (q = &str[i + 1], p = q; *p == '0';)
145  p++;
146 
147  int j = atoi(segnum);
148  if (atoi(p) != j) {
149  ErrPostEx(SEV_REJECT, ERR_SEGMENT_BadLocusName, "Segment suffix in locus name \"%s\" does not match number in SEGMENT line = \"%d\". Entry dropped.", str, j);
150  entry->drop = true;
151  }
152 
153  *q = '\0'; /* strip off "len" characters */
154  return true;
155 }
156 
157 /**********************************************************/
158 static void GetSegment(const char* str, IndexblkPtr entry)
159 {
160  auto stoken = TokenString(str, ' ');
161 
162  if (stoken->num >= 4) {
163  auto ptr2 = next(stoken->list.begin());
164  auto ptr4 = next(ptr2, 2);
165  entry->segnum = (Uint2)atoi(ptr2->c_str());
166 
167  if (! DelSegnum(entry, ptr2->c_str(), StringLen(ptr4->c_str()))) {
168  ErrPostEx(SEV_ERROR, ERR_SEGMENT_BadLocusName, "Bad locus name %s in %d", entry->blocusname, entry->linenum);
169  }
170 
171  entry->segtotal = (Uint2)atoi(ptr4->c_str());
172  } else {
173  ErrPostEx(SEV_ERROR, ERR_SEGMENT_IncompSeg, "Incomplete Segment information at linenum %d", entry->linenum);
174  }
175 }
176 // LCOV_EXCL_STOP
177 
178 /**********************************************************/
179 static bool gb_err_field(const char* str)
180 {
181  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField, "No %s data in GenBank format file, entry dropped", str);
182  return true;
183 }
184 
185 /**********************************************************/
186 static void ParseGenBankVersion(IndexblkPtr entry, char* line, char* nid, Parser::ESource source, Parser::EMode mode, bool ign_toks)
187 {
188  bool gi;
189  char* p;
190  char* q;
191  char* r;
192  Char ch;
193  Char ch1;
194 
195  if (! line)
196  return;
197 
198  for (p = line; *p != '\0' && *p != ' ' && *p != '\t';)
199  p++;
200  gi = (*p == '\0') ? false : true;
201 
202  ch1 = *p;
203  *p = '\0';
204  q = StringRChr(line, '.');
205  if (! q) {
206  if (mode != Parser::EMode::Relaxed) {
207  *p = ch1;
208  ErrPostEx(SEV_FATAL, ERR_VERSION_MissingVerNum, "Missing VERSION number in VERSION line: \"%s\".", line);
209  entry->drop = true;
210  }
211  return;
212  }
213 
214  for (r = q + 1; *r >= '0' && *r <= '9';)
215  r++;
216  if (*r != '\0') {
217  if (mode != Parser::EMode::Relaxed) {
218  *p = ch1;
219  ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitVerNum, "Incorrect VERSION number in VERSION line: \"%s\".", line);
220  entry->drop = true;
221  }
222  return;
223  }
224  ch = *q;
225  *q = '\0';
226  if (! StringEqu(entry->acnum, line)) {
227  *q = ch;
228  *p = ch1;
229  if (mode != Parser::EMode::Relaxed) {
230  ErrPostEx(SEV_FATAL, ERR_VERSION_AccessionsDontMatch, "Accessions in VERSION and ACCESSION lines don't match: \"%s\" vs \"%s\".", line, entry->acnum);
231  entry->drop = true;
232  }
233  return;
234  }
235  entry->vernum = atoi(q + 1);
236  *q = ch;
237 
238  if (entry->vernum < 1) {
239  *p = ch1;
240  ErrPostEx(SEV_FATAL, ERR_VERSION_InvalidVersion, "Version number \"%d\" from Accession.Version value \"%s.%d\" is not a positive integer.", entry->vernum, entry->acnum, entry->vernum);
241  entry->drop = true;
242  return;
243  }
244 
245  if (ch1 != '\0')
246  for (*p++ = ch1; *p == ' ' || *p == '\t';)
247  p++;
248 
249  if (source == Parser::ESource::DDBJ) {
250  if (*p != '\0' && ! ign_toks) {
251  ErrPostEx(SEV_ERROR, ERR_VERSION_BadVersionLine, "DDBJ's VERSION line has too many tokens: \"%s\".", line);
252  }
253  return;
254  }
255 
256  if (! gi)
257  return;
258 
259  if (! StringEquN(p, "GI:", 3)) {
260  ErrPostEx(SEV_FATAL, ERR_VERSION_IncorrectGIInVersion, "Incorrect GI entry in VERSION line: \"%s\".", line);
261  entry->drop = true;
262  return;
263  }
264  p += 3;
265  for (q = p; *q >= '0' && *q <= '9';)
266  q++;
267  if (*q != '\0') {
268  ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitGI, "Incorrect GI number in VERSION line: \"%s\".", line);
269  entry->drop = true;
270  }
271 }
272 
273 /**********************************************************/
274 static bool fta_check_mga_line(char* line, IndexblkPtr ibp)
275 {
276  char* p;
277  char* q;
278  char* str;
279  Int4 from;
280  Int4 to;
281 
282  if (! line || ! ibp)
283  return false;
284 
285  for (p = line; *p == ' ' || *p == '\t';)
286  p++;
287  str = StringSave(p);
288  p = StringChr(str, '\n');
289  if (p)
290  *p = '\0';
291  p = StringChr(str, '-');
292  if (! p) {
293  MemFree(str);
294  return false;
295  }
296  *p++ = '\0';
297 
298  if (StringLen(str) != 12 || StringLen(p) != 12 ||
299  ! StringEquN(str, ibp->acnum, 5) ||
300  ! StringEquN(p, ibp->acnum, 5)) {
301  MemFree(str);
302  return false;
303  }
304 
305  for (q = str + 5; *q >= '0' && *q <= '9';)
306  q++;
307  if (*q != '\0') {
308  MemFree(str);
309  return false;
310  }
311  for (q = p + 5; *q >= '0' && *q <= '9';)
312  q++;
313  if (*q != '\0') {
314  MemFree(str);
315  return false;
316  }
317 
318  for (q = str + 5; *q == '0';)
319  q++;
320  from = atoi(q);
321  for (q = p + 5; *q == '0';)
322  q++;
323  to = atoi(q);
324 
325  if (from > to) {
326  MemFree(str);
327  return false;
328  }
329 
330  ibp->bases = to - from + 1;
331  MemFree(str);
332  return true;
333 }
334 
335 
336 /**********************************************************/
338 {
339  FinfoBlk finfo;
340 
341  bool acwflag;
342  bool end_of_file;
343  bool after_LOCUS;
344  bool after_DEFNTN;
345  bool after_SOURCE;
346  bool after_REFER;
347  bool after_FEAT;
348  bool after_ORIGIN;
349  bool after_COMMENT;
350  bool after_VERSION;
351  bool after_MGA;
352 
353  IndexblkPtr entry;
354  int currentKeyword;
355  Int4 indx = 0;
356  IndBlkNextPtr ibnp;
357  IndBlkNextPtr tibnp;
358  char* p;
359  char* q;
360  char* line_ver;
361  char* line_nid;
362  char* line_locus;
363  size_t i;
364  ValNodePtr kwds;
365  ValNodePtr tkwds;
366  ValNodePtr dbl;
367  ValNodePtr tdbl;
368 
369  end_of_file = SkipTitleBuf(pp->ffbuf, finfo, "LOCUS");
370 
371  if (end_of_file) {
372  MsgSkipTitleFail("GenBank", finfo);
373  return false;
374  }
375 
376  bool tpa_check = (pp->source == Parser::ESource::EMBL);
377 
378  ibnp = new IndBlkNode(nullptr);
379  tibnp = ibnp;
380 
381  pp->num_drop = 0;
382  kwds = nullptr;
383  dbl = nullptr;
384  while (! end_of_file) {
385  entry = InitialEntry(pp, finfo);
386  if (entry) {
387  pp->curindx = indx;
388  tibnp->next = new IndBlkNode(entry);
389  tibnp = tibnp->next;
390 
391  indx++;
392 
393  entry->is_contig = false;
394  entry->origin = false;
395  entry->is_mga = false;
396  acwflag = false;
397  after_LOCUS = false;
398  after_DEFNTN = false;
399  after_SOURCE = false;
400  after_REFER = false;
401  after_FEAT = false;
402  after_ORIGIN = false;
403  after_COMMENT = false;
404  after_VERSION = false;
405  after_MGA = false;
406 
407  currentKeyword = ParFlat_LOCUS;
408  line_ver = nullptr;
409  line_nid = nullptr;
410  line_locus = nullptr;
411  if (kwds)
412  kwds = ValNodeFreeData(kwds);
413  tkwds = nullptr;
414  size_t kwds_len = 0;
415  if (dbl)
416  dbl = ValNodeFreeData(dbl);
417  tdbl = nullptr;
418  size_t dbl_len = 0;
419  while (currentKeyword != ParFlat_END && ! end_of_file) {
420  switch (currentKeyword) {
421  case ParFlat_LOCUS:
422  if (after_LOCUS) {
423  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines LOCUS in one entry");
424  entry->drop = true;
425  } else {
426  after_LOCUS = true;
427  line_locus = StringSave(finfo.str);
428  }
429  break;
430  case ParFlat_COMMENT:
431  if (after_COMMENT) {
432  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "Multiple COMMENT lines in one entry");
433  entry->drop = true;
434  } else
435  after_COMMENT = true;
436 
437  break;
438  case ParFlat_VERSION:
439  p = StringStr(finfo.str + ParFlat_COL_DATA, "GI:");
440  if (p && atol(p + 3) > 0)
441  entry->wgs_and_gi |= 01;
442  if (pp->accver == false)
443  break;
444  if (after_VERSION) {
445  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "Multiple VERSION lines in one entry");
446  entry->drop = true;
447  break;
448  }
449  after_VERSION = true;
450  p = finfo.str + ParFlat_COL_DATA;
451  while (*p == ' ' || *p == '\t')
452  p++;
453  for (q = p; *q != '\0' && *q != '\r' && *q != '\n';)
454  q++;
455  while (q > p) {
456  q--;
457  if (*q != ' ' && *q != '\t') {
458  q++;
459  break;
460  }
461  }
462  i = q - p;
463  line_ver = StringNew(i);
464  StringNCpy(line_ver, p, i);
465  line_ver[i] = '\0';
466  break;
467  case ParFlat_NCBI_GI:
468  if (pp->source == Parser::ESource::DDBJ || pp->accver == false || line_nid)
469  break;
470  p = finfo.str + ParFlat_COL_DATA;
471  while (*p == ' ' || *p == '\t')
472  p++;
473  for (q = p; *q != '\0' && *q != ' ' && *q != '\t' &&
474  *q != '\r' && *q != '\n';)
475  q++;
476  i = q - p;
477  line_nid = StringNew(i);
478  StringNCpy(line_nid, p, i);
479  line_nid[i] = '\0';
480  break;
481  case ParFlat_DEFINITION:
482  if (after_DEFNTN) {
483  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'DEFINITION'");
484  entry->drop = true;
485  } else if (after_LOCUS == false) {
486  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "DEFINITION field out of order");
487  entry->drop = true;
488  } else
489  after_DEFNTN = true;
490 
491  break;
492  case ParFlat_SOURCE:
493  if (after_SOURCE) {
494  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'SOURCE'");
495  entry->drop = true;
496  } else if (after_LOCUS == false || after_DEFNTN == false) {
497  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "SOURCE field out of order");
498  entry->drop = true;
499  } else
500  after_SOURCE = true;
501 
502  break;
503  case ParFlat_REFERENCE:
504  after_REFER = true;
505  break;
506  case ParFlat_CONTIG:
507  if (entry->is_contig) {
508  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than one line CONTIG in one entry");
509  entry->drop = true;
510  } else
511  entry->is_contig = true;
512  break;
513  case ParFlat_MGA:
514  if (entry->is_mga == false) {
515  ErrPostStr(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"MGA\" is allowed for CAGE records only. Entry dropped.");
516  entry->drop = true;
517  }
518  if (fta_check_mga_line(finfo.str + ParFlat_COL_DATA, entry) == false) {
519  ErrPostStr(SEV_REJECT, ERR_FORMAT_IncorrectMGALine, "Incorrect range of accessions supplied in MGA line of CAGE record. Entry dropped.");
520  entry->drop = true;
521  }
522  after_MGA = true;
523  break;
524  case ParFlat_FEATURES:
525  if (after_FEAT) {
526  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'FEATURES'");
527  entry->drop = true;
528  } else if (pp->mode != Parser::EMode::Relaxed &&
529  (after_LOCUS == false ||
530  after_DEFNTN == false ||
531  after_SOURCE == false)) {
532  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "FEATURES field out of order");
533  entry->drop = true;
534  } else
535  after_FEAT = true;
536 
537  break;
538  case ParFlat_ORIGIN:
539  if (after_ORIGIN) {
540  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "More than two lines 'ORIGIN'");
541  entry->drop = true;
542  } else if (
543  pp->mode != Parser::EMode::Relaxed &&
544  (after_LOCUS == false ||
545  after_DEFNTN == false ||
546  after_SOURCE == false ||
547  after_FEAT == false)) {
548  ErrPostStr(SEV_ERROR, ERR_FORMAT_LineTypeOrder, "ORIGIN field out of order");
549  entry->drop = true;
550  } else {
551  after_ORIGIN = true;
552  entry->origin = true;
553  }
554  break;
555  case ParFlat_ACCESSION:
556  if (acwflag == false) /* first accession line */
557  {
558  acwflag = true;
559  if (! GetAccession(pp, finfo.str, entry, 2)) {
560  if (pp->mode != Parser::EMode::Relaxed) {
561  pp->num_drop++;
562  }
563  }
564  }
565  break;
566  case ParFlat_SEGMENT:
567  // LCOV_EXCL_START
568  // Excluded per Mark's request on 12/14/2016
569  GetSegment(finfo.str, entry);
570  // LCOV_EXCL_STOP
571  break;
572  case ParFlat_USER:
573  if (pp->source != Parser::ESource::Flybase) {
574  ErrPostStr(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"USER\" is allowed for source \"FLYBASE\" only. Entry dropped.");
575  entry->drop = true;
576  }
577  break;
578  case ParFlat_PRIMARY:
579  if (entry->is_tpa == false &&
580  entry->tsa_allowed == false &&
582  ErrPostStr(SEV_ERROR, ERR_ENTRY_InvalidLineType, "Line type \"PRIMARY\" is allowed for TPA or TSA records only. Continue anyway.");
583  }
584  break;
585  case ParFlat_KEYWORDS:
586  if (pp->source != Parser::ESource::DDBJ &&
588  break;
589  if (kwds)
590  ValNodeFreeData(kwds);
592  tkwds = kwds;
593  kwds_len = StringLen(finfo.str) - 8;
594  break;
595  case ParFlat_DBLINK:
596  if (dbl)
597  ValNodeFreeData(dbl);
599  tdbl = dbl;
600  dbl_len = StringLen(finfo.str) - 8;
601  break;
602  default:
603  break;
604  } /* switch */
605 
606  end_of_file = XReadFileBuf(pp->ffbuf, finfo);
607 
608  while (! end_of_file && (finfo.str[0] == ' ' || finfo.str[0] == '\t')) {
609  if (currentKeyword == ParFlat_KEYWORDS && tkwds) {
610  tkwds = ValNodeNew(tkwds, finfo.str);
611  kwds_len += StringLen(finfo.str);
612  }
613 
614  if (currentKeyword == ParFlat_DBLINK && tdbl) {
615  tdbl = ValNodeNew(tdbl, finfo.str);
616  dbl_len += StringLen(finfo.str);
617  }
618 
619  if (currentKeyword == ParFlat_ACCESSION && ! entry->drop &&
620  GetAccession(pp, finfo.str, entry, 0) == false)
621  pp->num_drop++;
622 
623  end_of_file = XReadFileBuf(pp->ffbuf, finfo);
624  }
625 
626 
627  if (kwds) {
628  check_est_sts_gss_tpa_kwds(kwds, kwds_len, entry, tpa_check, entry->specialist_db, entry->inferential, entry->experimental, entry->assembly);
629  kwds = ValNodeFreeData(kwds);
630  kwds_len = 0;
631  }
632 
633  if (pp->mode == Parser::EMode::Relaxed &&
634  NStr::IsBlank(finfo.str)) {
635  currentKeyword = ParFlat_UNKW;
636  continue;
637  }
638 
639  currentKeyword = SrchKeyword(finfo.str, genbankKeywords);
640 
641  if (finfo.str[0] != ' ' && finfo.str[0] != '\t' &&
642  CheckLineType(finfo.str, finfo.line, genbankKeywords, after_ORIGIN) == false)
643  entry->drop = true;
644 
645  } /* while, end of one entry */
646 
647  entry->is_tpa_wgs_con = (entry->is_contig && entry->is_wgs && entry->is_tpa);
648 
649  if (! entry->drop) {
650 
651  if (pp->mode != Parser::EMode::Relaxed) {
652  if (line_locus &&
653  CkLocusLinePos(line_locus, pp->source, &entry->lc, entry->is_mga) == false)
654  entry->drop = true;
655 
656  if (entry->is_mga && after_MGA == false)
657  entry->drop = gb_err_field("MGA");
658 
659  if (after_LOCUS == false)
660  entry->drop = gb_err_field("LOCUS");
661 
662  if (after_VERSION == false && pp->accver)
663  entry->drop = gb_err_field("VERSION");
664 
665  if (after_DEFNTN == false)
666  entry->drop = gb_err_field("DEFINITION");
667 
668  if (after_SOURCE == false)
669  entry->drop = gb_err_field("SOURCE");
670 
671  if (after_REFER == false && pp->source != Parser::ESource::Flybase &&
672  entry->is_wgs == false &&
674  ! StringEquN(entry->acnum, "NW_", 3))) {
675  entry->drop = gb_err_field("REFERENCE");
676  }
677 
678  if (after_FEAT == false) {
679  entry->drop = gb_err_field("FEATURES");
680  }
681  } // !Parser::EMode::Relaxed
682 
683  if (entry->is_contig && entry->segnum != 0) {
684  ErrPostStr(SEV_ERROR, ERR_FORMAT_ContigInSegset, "CONTIG data are not allowed for members of segmented sets, entry dropped.");
685  entry->drop = true;
686  }
687  }
688  if (pp->accver) {
689  if (pp->mode == Parser::EMode::HTGSCON)
690  entry->vernum = 1;
691  else
693  entry,
694  line_ver,
695  line_nid,
696  pp->source,
697  pp->mode,
698  pp->ign_toks);
699  }
700  if (line_locus) {
701  MemFree(line_locus);
702  line_locus = nullptr;
703  }
704  if (line_ver) {
705  MemFree(line_ver);
706  line_ver = nullptr;
707  }
708  if (line_nid) {
709  MemFree(line_nid);
710  line_nid = nullptr;
711  }
712  entry->len = pp->ffbuf.get_offs() - entry->offset;
713 
714  if (acwflag == false &&
715  pp->mode != Parser::EMode::Relaxed) {
716  ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum, "No accession # for this entry, about line %ld", (long int)entry->linenum);
717  }
718 
719  if (dbl) {
720  dbl = ValNodeFreeData(dbl);
721  // dbl_len = 0;
722  }
723  } /* if, entry */
724  else {
725  end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "//");
726  }
727 
728  end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "LOCUS");
729 
730  } /* while, end_of_file */
731 
732  pp->indx = indx;
733 
735 
736  if (pp->qsfd && QSIndex(pp, ibnp->next) == false)
737  return false;
738 
739  pp->entrylist.resize(indx);
740  tibnp = ibnp->next;
741  delete ibnp;
742  for (int j = 0; j < indx && tibnp; j++, tibnp = ibnp) {
743  pp->entrylist[j] = tibnp->ibp;
744  ibnp = tibnp->next;
745  delete tibnp;
746  }
747 
748  return (end_of_file);
749 }
750 
bool QSIndex(ParserPtr pp, IndBlkNextPtr ibnp)
Definition: block.cpp:202
#define ERR_FORMAT_LineTypeOrder
Definition: flat2err.h:40
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:93
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:346
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static const char * str(char *buf, int n)
Definition: stats.c:84
vector< string > genbankKeywords
Definition: gb_index.cpp:56
static void ParseGenBankVersion(IndexblkPtr entry, char *line, char *nid, Parser::ESource source, Parser::EMode mode, bool ign_toks)
Definition: gb_index.cpp:186
static bool gb_err_field(const char *str)
Definition: gb_index.cpp:179
static bool DelSegnum(IndexblkPtr entry, const char *segnum, size_t len2)
Definition: gb_index.cpp:106
bool GenBankIndex(ParserPtr pp)
Definition: gb_index.cpp:337
static bool fta_check_mga_line(char *line, IndexblkPtr ibp)
Definition: gb_index.cpp:274
static void GetSegment(const char *str, IndexblkPtr entry)
Definition: gb_index.cpp:158
@ ParFlat_ACCESSION
Definition: genbank.h:43
@ ParFlat_FEATURES
Definition: genbank.h:51
@ ParFlat_SOURCE
Definition: genbank.h:48
@ ParFlat_DBLINK
Definition: genbank.h:63
@ ParFlat_SEGMENT
Definition: genbank.h:47
@ ParFlat_COMMENT
Definition: genbank.h:50
@ ParFlat_REFERENCE
Definition: genbank.h:49
@ ParFlat_VERSION
Definition: genbank.h:57
@ ParFlat_LOCUS
Definition: genbank.h:41
@ ParFlat_NCBI_GI
Definition: genbank.h:44
@ ParFlat_USER
Definition: genbank.h:58
@ ParFlat_PRIMARY
Definition: genbank.h:60
@ ParFlat_END
Definition: genbank.h:54
@ ParFlat_KEYWORDS
Definition: genbank.h:46
@ ParFlat_DEFINITION
Definition: genbank.h:42
@ ParFlat_CONTIG
Definition: genbank.h:56
@ ParFlat_MGA
Definition: genbank.h:61
@ ParFlat_ORIGIN
Definition: genbank.h:53
#define ParFlat_COL_DATA
Definition: genbank.h:37
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_FATAL
Definition: gicache.c:93
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
bool FindNextEntryBuf(bool end_of_file, FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:2162
IndexblkPtr InitialEntry(ParserPtr pp, FinfoBlk &finfo)
Definition: indx_blk.cpp:787
bool SkipTitleBuf(FileBuf &fbuf, FinfoBlk &finfo, const CTempString &keyword)
Definition: indx_blk.cpp:357
bool XReadFileBuf(FileBuf &fbuf, FinfoBlk &finfo)
Definition: indx_blk.cpp:313
bool CkLocusLinePos(char *offset, Parser::ESource source, LocusContPtr lcp, bool is_mga)
Definition: indx_blk.cpp:537
void MsgSkipTitleFail(const char *flatfile, FinfoBlk &finfo)
Definition: indx_blk.cpp:2154
#define ERR_VERSION_NonDigitVerNum
Definition: indx_err.h:83
#define ERR_VERSION_MissingVerNum
Definition: indx_err.h:82
#define ERR_ACCESSION_NoAccessNum
Definition: indx_err.h:68
#define ERR_SEGMENT_BadLocusName
Definition: indx_err.h:78
#define ERR_FORMAT_ContigInSegset
Definition: indx_err.h:47
#define ERR_SEGMENT_IncompSeg
Definition: indx_err.h:79
#define ERR_FORMAT_IncorrectMGALine
Definition: indx_err.h:56
#define ERR_VERSION_IncorrectGIInVersion
Definition: indx_err.h:86
#define ERR_VERSION_NonDigitGI
Definition: indx_err.h:87
#define ERR_VERSION_BadVersionLine
Definition: indx_err.h:85
#define ERR_VERSION_InvalidVersion
Definition: indx_err.h:88
#define ERR_FORMAT_MissingField
Definition: indx_err.h:42
#define ERR_ENTRY_InvalidLineType
Definition: indx_err.h:64
#define ERR_VERSION_AccessionsDontMatch
Definition: indx_err.h:84
@ e_not_set
int i
if(yy_accept[yy_current_state])
mdb_mode_t mode
Definition: lmdb++.h:38
const CharType(& source)[N]
Definition: pointer.h:1149
static const BitmapCharRec ch1
Definition: ncbi_10x20.c:1827
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
const CConstRef< CSeq_id > GetAccession(const CSeq_id_Handle &id_handle)
size_t get_offs() const
Char str[256]
Definition: indx_blk.h:42
Int4 line
Definition: indx_blk.h:43
Indexblk * ibp
Definition: indx_blk.h:56
IndBlkNode * next
Definition: indx_blk.h:57
Char acnum[200]
Definition: ftablock.h:166
bool assembly
Definition: ftablock.h:241
bool is_mga
Definition: ftablock.h:199
bool tsa_allowed
Definition: ftablock.h:211
Int4 wgs_and_gi
Definition: ftablock.h:231
Char blocusname[200]
Definition: ftablock.h:178
bool is_tpa_wgs_con
Definition: ftablock.h:209
Int2 vernum
Definition: ftablock.h:167
bool is_tpa
Definition: ftablock.h:206
bool is_wgs
Definition: ftablock.h:205
bool origin
Definition: ftablock.h:201
bool is_contig
Definition: ftablock.h:197
bool drop
Definition: ftablock.h:182
bool experimental
Definition: ftablock.h:247
size_t bases
Definition: ftablock.h:172
bool inferential
Definition: ftablock.h:245
Uint2 segtotal
Definition: ftablock.h:175
size_t linenum
Definition: ftablock.h:180
size_t len
Definition: ftablock.h:184
size_t offset
Definition: ftablock.h:168
bool specialist_db
Definition: ftablock.h:243
Uint2 segnum
Definition: ftablock.h:173
LocusCont lc
Definition: ftablock.h:212
vector< IndexblkPtr > entrylist
int SrchKeyword(const CTempString &ptr, const vector< string > &keywordList)
Definition: utilfun.cpp:897
bool CheckLineType(char *ptr, Int4 line, const vector< string > &keywordList, bool after_origin)
Definition: utilfun.cpp:910
void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry, bool tpa_check, bool &specialist_db, bool &inferential, bool &experimental, bool &assembly)
Definition: utilfun.cpp:1371
unique_ptr< TokenStatBlk > TokenString(const char *str, Char delimiter)
Definition: utilfun.cpp:445
ValNodePtr ConstructValNode(CSeq_id::E_Choice choice, const char *data)
Definition: utilfun.cpp:1437
#define ParFlat_UNKW
Definition: utilfun.h:44
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
Definition: valnode.cpp:53
ValNodePtr ValNodeFreeData(ValNodePtr vnp)
Definition: valnode.cpp:96
Modified on Fri Sep 20 14:58:21 2024 by modify_doxy.py rev. 669887