NCBI C++ ToolKit
gb_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gb_ascii.cpp 102382 2024-04-28 12:37:20Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: gb_ascii.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Parse gb from blocks to asn.
32  * Build GenBank format entry block.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <objects/seq/Seq_inst.hpp>
43 #include <objects/seq/Bioseq.hpp>
45 #include <serial/objostr.hpp>
46 #include <serial/serial.hpp>
47 #include <objects/seq/Seq_ext.hpp>
53 #include <objmgr/scope.hpp>
59 #include <objects/seq/Pubdesc.hpp>
60 #include <objects/seq/MolInfo.hpp>
61 
62 #include "index.h"
63 #include "genbank.h"
64 
67 #include "ftanet.h"
68 
69 #include "ftaerr.hpp"
70 #include "asci_blk.h"
71 #include "indx_blk.h"
72 #include "utilref.h"
73 #include "utilfeat.h"
74 #include "loadfeat.h"
75 #include "gb_ascii.h"
76 #include "add.h"
77 #include "nucprot.h"
78 #include "fta_qscore.h"
79 #include "citation.h"
80 #include "fcleanup.h"
81 #include "utilfun.h"
82 #include "entry.h"
83 #include "ref.h"
84 #include "xgbparint.h"
85 #include "xutils.h"
86 
87 
88 #ifdef THIS_FILE
89 # undef THIS_FILE
90 #endif
91 #define THIS_FILE "gb_ascii.cpp"
92 
95 
96 /**********************************************************/
97 static char* GBDivOffset(const DataBlk& entry, Int4 div_shift)
98 {
99  return (entry.mOffset + div_shift);
100 }
101 
102 /**********************************************************/
/* NOTE(review): the gutter jumps from 102 to 104, so the line that carried this
 * function's signature is missing from this listing; only the body is visible.
 *
 * The body classifies the current entry (ibp) by three facts -- whether its
 * division is "CON" (case-insensitive), whether a CONTIG block exists
 * (ibp->is_contig) and whether sequence data exists (ibp->origin) -- posts a
 * diagnostic for each inconsistent combination and sets ibp->drop on the
 * combinations that are rejected.
 */
104 {
105  bool condiv = (NStr::CompareNocase(ibp->division, "CON") == 0);
106 
    /* CON division inside a segmented set: drop immediately, nothing else is checked. */
107  if (condiv && ibp->segnum != 0) {
108  ErrPostEx(SEV_ERROR, ERR_DIVISION_ConDivInSegset, "Use of the CON division is not allowed for members of segmented set : %s|%s. Entry skipped.", ibp->locusname, ibp->acnum);
109  ibp->drop = true;
110  return;
111  }
112 
    /* Not CON, no CONTIG, no sequence, not an MGA record: nothing to build from. */
113  if (! condiv && ibp->is_contig == false && ibp->origin == false &&
114  ibp->is_mga == false) {
115  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingSequenceData, "Required sequence data is absent. Entry dropped.");
116  ibp->drop = true;
117  } else if (! condiv && ibp->is_contig && ibp->origin == false) {
    /* Only a warning: the entry is kept. */
118  ErrPostEx(SEV_WARNING, ERR_DIVISION_MappedtoCON, "Division [%s] mapped to CON based on the existence of CONTIG line.", ibp->division);
119  } else if (ibp->is_contig && ibp->origin) {
    /* NOTE(review): gutter line 120 is missing here. The `} else {` and the two
     * closing braces that follow imply it opened a nested `if (...) {` choosing
     * between the INFO (keep, ignore sequence) and REJECT (drop) outcomes; the
     * condition itself is not visible -- restore it from the repository.
     */
121  ErrPostEx(SEV_INFO, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data are both present. Ignoring sequence data.");
122  } else {
123  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data may not both be present in a sequence record.");
124  ibp->drop = true;
125  }
126  } else if (condiv && ibp->is_contig == false && ibp->origin == false) {
127  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingContigFeature, "No CONTIG data in GenBank format file, entry dropped.");
128  ibp->drop = true;
129  } else if (condiv && ibp->is_contig == false && ibp->origin) {
    /* CON division with sequence but no CONTIG: warn only. */
130  ErrPostEx(SEV_WARNING, ERR_DIVISION_ConDivLacksContig, "Division is CON, but CONTIG data have not been found.");
131  }
132 }
133 
134 /**********************************************************/
/* Converts the entry's CONTIG block into a delta extension on `bsp`.
 *
 * Returns true when there is no CONTIG block, when the location text does not
 * parse to a Seq-loc (loc is empty), or when processing completes; returns
 * false when the block holds no data past the data column or when the join()
 * contains an empty component.
 */
135 bool GetGenBankInstContig(const DataBlk& entry, CBioseq& bsp, ParserPtr pp)
136 {
137  DataBlkPtr dbp;
138 
139  char* p;
140  char* q;
141  char* r;
142 
143  bool locmap;
144 
145  bool allow_crossdb_featloc;
146  Int4 i;
147  int numerr;
148 
149  dbp = TrackNodeType(entry, ParFlat_CONTIG);
150  if (! dbp || ! dbp->mOffset)
151  return true;
152 
    /* Number of characters that follow the data column in the block. */
153  i = static_cast<Int4>(dbp->len) - ParFlat_COL_DATA;
154  if (i <= 0)
155  return false;
156 
    /* Private copy of the data; the last copied character is overwritten by the terminator. */
157  p = StringNew(i);
158  StringNCpy(p, &dbp->mOffset[ParFlat_COL_DATA], i);
159  p[i - 1] = '\0';
    /* Squeeze out newlines, tabs and spaces in place. */
160  for (q = p, r = p; *q != '\0'; q++)
161  if (*q != '\n' && *q != '\t' && *q != ' ')
162  *r++ = *q;
163  *r = '\0';
164 
    /* Look for an empty component: ",,", "(," or ",)". */
165  for (q = p; *q != '\0'; q++)
166  if ((q[0] == ',' && q[1] == ',') || (q[0] == '(' && q[1] == ',') ||
167  (q[0] == ',' && q[1] == ')'))
168  break;
169  if (*q != '\0') {
170  ErrPostEx(SEV_REJECT, ERR_LOCATION_ContigHasNull, "The join() statement for this record's contig line contains one or more comma-delimited components which are null.");
171  MemFree(p);
172  return false;
173  }
174 
175  pp->buf.reset();
176 
177  CRef<CSeq_loc> loc = xgbparseint_ver(p, locmap, numerr, bsp.GetId(), pp->accver);
    /* An unparsable location is not reported as a failure by this function. */
178  if (loc.Empty()) {
179  MemFree(p);
180  return true;
181  }
182 
    /* Cross-database locations are forced on while the ids are fixed, then restored. */
183  allow_crossdb_featloc = pp->allow_crossdb_featloc;
184  pp->allow_crossdb_featloc = true;
185 
186  TSeqLocList locs;
187  locs.push_back(loc);
188  i = fta_fix_seq_loc_id(locs, pp, p, nullptr, true);
189 
    /* NOTE(review): gutter line 191 is missing from this listing. As the text
     * stands, the `if` below would govern the restore statement on line 193,
     * which is almost certainly not the intent -- restore line 191 from the
     * repository before compiling.
     */
190  if (i > 999)
192 
193  pp->allow_crossdb_featloc = allow_crossdb_featloc;
194 
    /* A mix location becomes a delta sequence; anything else clears the extension. */
195  if (loc->IsMix()) {
196  XGappedSeqLocsToDeltaSeqs(loc->GetMix(), bsp.SetInst().SetExt().SetDelta().Set());
197  bsp.SetInst().SetRepr(CSeq_inst::eRepr_delta);
198  } else
199  bsp.SetInst().ResetExt();
200 
201  MemFree(p);
202  return true;
203 }
204 
205 /**********************************************************
206  *
207  * bool GetGenBankInst(pp, entry, dnaconv):
208  *
209  * Fills in Seq-inst for an entry. Assumes Bioseq
210  * already allocated.
211  *
212  * 3-30-93
213  *
214  **********************************************************/
/* Sets topology and strand from the LOCUS line (offsets taken from the entry's
 * index record), loads the sequence data from the ORIGIN block and, for
 * entries flagged is_contig, processes the CONTIG block.
 * Returns false if GetSeqData or GetGenBankInstContig fails, true otherwise.
 */
215 static bool GetGenBankInst(ParserPtr pp, const DataBlk& entry, unsigned char* dnaconv)
216 {
217  EntryBlkPtr ebp;
218  Int2 topology;
219  Int2 strand;
220  char* topstr;
221 
222  char* bptr = entry.mOffset;
223  IndexblkPtr ibp = pp->entrylist[pp->curindx];
224  LocusContPtr lcp = &ibp->lc;
225 
226  topstr = bptr + lcp->topology;
227 
228  ebp = static_cast<EntryBlk*>(entry.mpData);
229  CBioseq& bioseq = ebp->seq_entry->SetSeq();
230 
231  CSeq_inst& inst = bioseq.SetInst();
    /* NOTE(review): gutter line 232 is missing from this listing; whatever it
     * did to `inst` is not visible here -- restore it from the repository.
     */
233 
234  /* get linear, circular, tandem topology, blank is linear which = 1
235  */
236  topology = CheckTPG(topstr);
    /* Only values above 1 are stored; 1 (linear, per the comment above) is left unset. */
237  if (topology > 1)
238  inst.SetTopology(static_cast<CSeq_inst::ETopology>(topology));
239 
    /* A negative strand offset means there is no strand column; a blank is checked instead. */
240  strand = CheckSTRAND((lcp->strand >= 0) ? bptr + lcp->strand : " ");
241  if (strand > 0)
242  inst.SetStrand(static_cast<CSeq_inst::EStrand>(strand));
243 
    /* Protein entries are read with the amino-acid code, everything else as nucleotide. */
244  if (GetSeqData(pp, entry, bioseq, ParFlat_ORIGIN, dnaconv, (ibp->is_prot ? eSeq_code_type_iupacaa : eSeq_code_type_iupacna)) == false)
245  return false;
246 
247  if (ibp->is_contig && ! GetGenBankInstContig(entry, bioseq, pp))
248  return false;
249 
250  return true;
251 }
252 
253 /**********************************************************/
/* Produces a lineage string from the text in `sv`, with trailing spaces, tabs,
 * newlines, periods and semicolons removed. An empty input yields an empty string.
 */
254 static string GetGenBankLineage(string_view sv)
255 {
256  if (sv.empty())
257  return {};
258 
    /* NOTE(review): gutter line 259 is missing from this listing. It must have
     * declared `str` (used below) and initialised it from `sv`; how it derives
     * the text is not visible here -- restore it from the repository.
     */
260 
    /* Trim trailing whitespace and the '.' / ';' terminators. */
261  while (! str.empty()) {
262  char c = str.back();
263  if (c == ' ' || c == '\t' || c == '\n' || c == '.' || c == ';')
264  str.pop_back();
265  else
266  break;
267  }
268 
269  return str;
270 }
271 
272 /**********************************************************
273  *
274  * static GBBlockPtr GetGBBlock(pp, entry, mfp, biosp):
275  *
276  * 4-7-93
277  *
278  **********************************************************/
/* Builds the GB-block for the current entry: SOURCE text, keywords, ORIGIN
 * text, division code and extra accessions. Along the way the special
 * keywords are cross-checked against the index flags (is_tpa, is_tsa, is_tls,
 * is_mga, ...) and against the division, and mol_info's tech is adjusted.
 *
 * Returns an empty CRef (`ret`) as soon as any check rejects the entry;
 * otherwise returns the populated block. This function itself does not set
 * ibp->drop on those early returns.
 *
 * bio_src may be null; every use of it below is guarded.
 */
279 static CRef<CGB_block> GetGBBlock(ParserPtr pp, const DataBlk& entry, CMolInfo& mol_info, CBioSource* bio_src)
280 {
281  LocusContPtr lcp;
282 
283  CRef<CGB_block> gbb(new CGB_block),
284  ret;
285 
286  IndexblkPtr ibp;
287  char* bptr;
288  char* eptr;
289  char* ptr;
290  Char msg[4];
291  size_t len;
292  Int2 div;
293 
294  bool if_cds;
295  bool pat_ref = false;
296  bool est_kwd = false;
297  bool sts_kwd = false;
298  bool gss_kwd = false;
299  bool htc_kwd = false;
300  bool fli_kwd = false;
301  bool wgs_kwd = false;
302  bool tpa_kwd = false;
303  bool tsa_kwd = false;
304  bool tls_kwd = false;
305  bool env_kwd = false;
306  bool mga_kwd = false;
307 
308  bool cancelled;
309  bool drop;
310 
311  char* tempdiv;
312  char* p;
313  Int4 i;
314 
315  ibp = pp->entrylist[pp->curindx];
316  ibp->wgssec[0] = '\0';
317 
    /* SOURCE text: of a trailing "..", one period is removed; a single trailing '.' is kept. */
318  bptr = xSrchNodeType(entry, ParFlat_SOURCE, &len);
319  string str = GetBlkDataReplaceNewLine(string_view(bptr, len), ParFlat_COL_DATA);
320  if (! str.empty()) {
321  if (str.back() == '.') {
322  if (str.size() >= 2 && *(str.end() - 2) == '.')
323  str.pop_back();
324  }
325 
326  gbb->SetSource(std::move(str));
327  }
328 
    /* Keywords already collected in the index record are taken over; otherwise they are parsed from the entry. */
329  if (! ibp->keywords.empty()) {
330  gbb->SetKeywords().swap(ibp->keywords);
331  ibp->keywords.clear();
332  } else
333  GetSequenceOfKeywords(entry, ParFlat_KEYWORDS, ParFlat_COL_DATA, gbb->SetKeywords());
334 
335  if (ibp->is_mga && ! fta_check_mga_keywords(mol_info, gbb->GetKeywords())) {
336  return ret;
337  }
338 
339  if (ibp->is_tpa && ! fta_tpa_keywords_check(gbb->GetKeywords())) {
340  return ret;
341  }
342 
343  if (ibp->is_tsa && ! fta_tsa_keywords_check(gbb->GetKeywords(), pp->source)) {
344  return ret;
345  }
346 
347  if (ibp->is_tls && ! fta_tls_keywords_check(gbb->GetKeywords(), pp->source)) {
348  return ret;
349  }
350 
    /* Record which special keywords occur anywhere in the keyword list. */
351  for (const string& key : gbb->GetKeywords()) {
352  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
353  }
354 
355  if (ibp->env_sample_qual == false && env_kwd) {
356  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
357  return ret;
358  }
359 
    /* ORIGIN: keep whatever follows the first 6 characters on the first line, with leading whitespace skipped. */
360  bptr = xSrchNodeType(entry, ParFlat_ORIGIN, &len);
361  eptr = bptr + len;
362  ptr = SrchTheChar(bptr, eptr, '\n');
363  if (ptr) {
364  eptr = ptr;
365  bptr += 6;
366 
367  if (eptr != bptr) {
368  while (isspace(*bptr) != 0)
369  bptr++;
370  len = eptr - bptr;
371  if (len > 0) {
372  gbb->SetOrigin(string(bptr, eptr));
373  }
374  }
375  }
376 
377  lcp = &ibp->lc;
378 
379  bptr = GBDivOffset(entry, lcp->div);
380 
    /* Everything down to line 573 runs only when the division column is not blank. */
381  if (*bptr != ' ') {
382  if_cds = check_cds(entry, pp->format);
383  div = CheckDIV(bptr);
384  if (div != -1) {
385  string div_str(bptr, bptr + 3);
386  gbb->SetDiv(div_str);
387 
388  if (div == 16) /* "ORG" replaced by "UNA" */
389  gbb->SetDiv("UNA");
390 
391  /* preserve the division code for later use
392  */
393  const char* p_div = gbb->GetDiv().c_str();
394  StringCpy(ibp->division, p_div);
395 
396  if (ibp->psip.NotEmpty())
397  pat_ref = true;
398 
399  if (ibp->is_tpa &&
400  (StringEqu(p_div, "EST") || StringEqu(p_div, "GSS") ||
401  StringEqu(p_div, "PAT") || StringEqu(p_div, "HTG"))) {
402  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p_div);
403  return ret;
404  }
405 
406  if (ibp->is_tsa && ! StringEqu(p_div, "TSA")) {
407  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p_div);
408  return ret;
409  }
410 
411  cancelled = IsCancelled(gbb->GetKeywords());
412 
413  if (StringEqu(p_div, "HTG")) {
414  if (! HasHtg(gbb->GetKeywords())) {
415  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
416  return ret;
417  }
418  }
419 
    /* The division is copied first because the HTG keyword check below may reset it on gbb. */
420  tempdiv = StringSave(gbb->GetDiv());
421 
422  if (fta_check_htg_kwds(gbb->SetKeywords(), pp->entrylist[pp->curindx], mol_info))
423  gbb->ResetDiv();
424 
425  DefVsHTGKeywords(mol_info.GetTech(), entry, ParFlat_DEFINITION, ParFlat_ORIGIN, cancelled);
426 
427  CheckHTGDivision(tempdiv, mol_info.GetTech());
428  if (tempdiv)
429  MemFree(tempdiv);
430 
    /* i counts how many of the mutually exclusive special-keyword classes are present. */
431  i = 0;
432  if (est_kwd)
433  i++;
434  if (sts_kwd)
435  i++;
436  if (gss_kwd)
437  i++;
438  if (ibp->htg > 0)
439  i++;
440  if (htc_kwd)
441  i++;
442  if (fli_kwd)
443  i++;
444  if (wgs_kwd)
445  i++;
446  if (env_kwd)
447  i++;
448  if (mga_kwd) {
449  if (ibp->is_mga == false) {
450  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
451  return ret;
452  }
453  i++;
454  } else if (ibp->is_mga) {
    /* NOTE(review): unlike the TPA/TSA/TLS branches below, this one does not return even though the message says "Entry dropped" -- confirm intent. */
455  ErrPostEx(SEV_ERROR, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
456  }
457  if (tpa_kwd) {
458  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
459  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
460  return ret;
461  }
462  i++;
463  } else if (ibp->is_tpa) {
464  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
465  return ret;
466  }
467  if (tsa_kwd) {
468  if (ibp->is_tsa == false) {
469  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
470  return ret;
471  }
472  i++;
473  } else if (ibp->is_tsa) {
474  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
475  return ret;
476  }
477  if (tls_kwd) {
478  if (ibp->is_tls == false) {
479  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
480  return ret;
481  }
482  i++;
483  } else if (ibp->is_tls) {
484  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
485  return ret;
486  }
    /* More than one class present: a handful of pairings are tolerated, everything else is rejected. */
487  if (i > 1) {
488  if (i == 2 && ibp->htg > 0 && env_kwd)
489  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
490  else if ((i == 2 && wgs_kwd && tpa_kwd) ||
491  (i == 2 && tsa_kwd && tpa_kwd) ||
492  (i == 2 && pp->source == Parser::ESource::DDBJ &&
493  env_kwd && tpa_kwd)) {
    /* Accepted pairings: WGS+TPA, TSA+TPA and, for DDBJ, ENV+TPA. Intentionally empty. */
494  } else if (i != 2 || env_kwd == false ||
495  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
496  if (i != 2 || pp->source != Parser::ESource::DDBJ ||
497  ibp->is_tsa == false || env_kwd == false) {
498  if (pp->source != Parser::ESource::DDBJ || ibp->is_wgs == false ||
499  (env_kwd == false && tpa_kwd == false)) {
500  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
501  return ret;
502  }
503  }
504  }
505  }
506 
    /* WGS is not counted for the CON check that follows. */
507  if (wgs_kwd)
508  i--;
509 
510  if (ibp->is_contig && i > 0 &&
511  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
512  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
513  return ret;
514  }
515 
516  CMolInfo::TTech thtg = mol_info.GetTech();
517  if (thtg == CMolInfo::eTech_htgs_0 || thtg == CMolInfo::eTech_htgs_1 ||
518  thtg == CMolInfo::eTech_htgs_2 || thtg == CMolInfo::eTech_htgs_3) {
519  RemoveHtgPhase(gbb->SetKeywords());
520  }
521 
    /* Warn when the raw KEYWORDS text merely contains "EST"/"STS" without an official keyword having matched. */
522  bptr = xSrchNodeType(entry, ParFlat_KEYWORDS, &len);
523  if (bptr) {
524  string kw = GetBlkDataReplaceNewLine(string_view(bptr, len), ParFlat_COL_DATA);
525 
526  if (! est_kwd && kw.find("EST") != string::npos) {
527  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw.c_str());
528  }
529  if (! sts_kwd && kw.find("STS") != string::npos) {
530  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw.c_str());
531  }
532  }
533 
534  if (! ibp->is_contig) {
535  drop = false;
536  CMolInfo::TTech tech = mol_info.GetTech();
537  string p_div;
538  if (gbb->IsSetDiv())
539  p_div = gbb->GetDiv();
540 
    /* check_div receives p_div, tech (by pointer) and drop; all three are read back below. */
541  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, p_div, &tech, ibp->bases, pp->source, drop);
542 
543  if (tech != CMolInfo::eTech_unknown)
544  mol_info.SetTech(tech);
545  else
546  mol_info.ResetTech();
547 
548  if (! p_div.empty())
549  gbb->SetDiv(p_div);
550  else
551  gbb->ResetDiv();
552 
553  if (drop) {
554  return ret;
555  }
556  } else if (gbb->GetDiv() == "CON") {
557  gbb->ResetDiv();
558  }
559  } else if (pp->mode != Parser::EMode::Relaxed) {
    /* Unknown division code outside relaxed mode: report its first three characters and reject. */
560  MemCpy(msg, bptr, 3);
561  msg[3] = '\0';
562  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in GenBank flatfile. Record rejected.", msg);
563  return ret;
564  }
565 
    /* Accessions with a leading 'T' for which IsNewAccessFormat returns 0 are forced to EST tech. */
566  if (IsNewAccessFormat(ibp->acnum) == 0 && *ibp->acnum == 'T' &&
567  gbb->IsSetDiv() && gbb->GetDiv() != "EST") {
568  ErrPostStr(SEV_INFO, ERR_DIVISION_MappedtoEST, "Leading T in accession number.");
569 
570  mol_info.SetTech(CMolInfo::eTech_est);
571  gbb->ResetDiv();
572  }
573  }
574 
    /* HTC division and HTC keyword must appear together. */
575  bool is_htc_div = gbb->IsSetDiv() && gbb->GetDiv() == "HTC",
576  has_htc = HasHtc(gbb->GetKeywords());
577 
578  if (is_htc_div && ! has_htc) {
579  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
580  return ret;
581  }
582 
583  if (! is_htc_div && has_htc) {
584  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
585  return ret;
586  }
587 
    /* HTC records: after skipping an 'm'/'r', "pre-" or "transcribed " prefix, the molecule type must start with "RNA". */
588  if (is_htc_div) {
589  bptr = entry.mOffset;
590  p = bptr + lcp->molecule;
591  if (*p == 'm' || *p == 'r')
592  p++;
593  else if (StringEquN(p, "pre-", 4))
594  p += 4;
595  else if (StringEquN(p, "transcribed ", 12))
596  p += 12;
597 
598  if (! StringEquN(p, "RNA", 3)) {
599  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
600  return ret;
601  }
602  }
603 
    /* NOTE(review): gutter line 605 (the statement controlled by this `if`) is
     * missing from this listing. As the text stands, the `if` would swallow the
     * whole if/else chain that starts on line 609 -- restore line 605 from the
     * repository before compiling.
     */
604  if (fli_kwd)
606 
607  /* will be used in flat file database
608  */
    /* Divisions that are really techniques are moved to mol_info's tech and removed from gbb. */
609  if (gbb->IsSetDiv()) {
610  if (gbb->GetDiv() == "EST") {
611  ibp->EST = true;
612  mol_info.SetTech(CMolInfo::eTech_est);
613 
614  gbb->ResetDiv();
615  } else if (gbb->GetDiv() == "STS") {
616  ibp->STS = true;
617  mol_info.SetTech(CMolInfo::eTech_sts);
618 
619  gbb->ResetDiv();
620  } else if (gbb->GetDiv() == "GSS") {
621  ibp->GSS = true;
    /* NOTE(review): gutter line 622 is missing. The sibling EST/STS/HTC branches set mol_info's tech at this point; presumably this one did too -- confirm against the repository. */
623 
624  gbb->ResetDiv();
625  } else if (gbb->GetDiv() == "HTC") {
626  ibp->HTC = true;
627  mol_info.SetTech(CMolInfo::eTech_htc);
628 
629  gbb->ResetDiv();
630  } else if (gbb->GetDiv() == "SYN" && bio_src && bio_src->IsSetOrigin() &&
    /* NOTE(review): gutter line 631 (the rest of this condition and its opening brace) is missing from this listing. */
632  gbb->ResetDiv();
633  }
634  } else if (mol_info.IsSetTech()) {
    /* No division left: mirror the tech into the index flags. */
635  if (mol_info.GetTech() == CMolInfo::eTech_est)
636  ibp->EST = true;
637  if (mol_info.GetTech() == CMolInfo::eTech_sts)
638  ibp->STS = true;
639  if (mol_info.GetTech() == CMolInfo::eTech_survey)
640  ibp->GSS = true;
641  if (mol_info.GetTech() == CMolInfo::eTech_htc)
642  ibp->HTC = true;
643  }
644 
    /* Remove the keywords whose meaning is carried elsewhere in the record. */
645  if (mol_info.IsSetTech())
646  fta_remove_keywords(mol_info.GetTech(), gbb->SetKeywords());
647 
648  if (ibp->is_tpa)
649  fta_remove_tpa_keywords(gbb->SetKeywords());
650 
651  if (ibp->is_tsa)
652  fta_remove_tsa_keywords(gbb->SetKeywords(), pp->source);
653 
654  if (ibp->is_tls)
655  fta_remove_tls_keywords(gbb->SetKeywords(), pp->source);
656 
657  if (bio_src) {
658  if (bio_src->IsSetSubtype()) {
659  for (const auto& subtype : bio_src->GetSubtype()) {
660  if (subtype->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
661  fta_remove_env_keywords(gbb->SetKeywords());
662  break;
663  }
664  }
665  }
666  if (bio_src->IsSetOrg()) {
667  const COrg_ref& org_ref = bio_src->GetOrg();
668  if (org_ref.IsSetOrgname() && org_ref.GetOrgname().IsSetMod()) {
669  for (const auto& mod : org_ref.GetOrgname().GetMod()) {
670  if (! mod->IsSetSubtype())
671  continue;
672 
673  COrgMod::TSubtype stype = mod->GetSubtype();
    /* NOTE(review): gutter line 674 is missing; it presumably tested `stype` and opened the block closed on line 677 -- the tested value is not visible here. */
675  fta_remove_mag_keywords(gbb->SetKeywords());
676  break;
677  }
678  }
679  }
680  }
681  }
682 
    /* The division is dropped when the organism already carries one: always for DDBJ, otherwise only when the two are equal. */
683  if (pp->source == Parser::ESource::DDBJ && gbb->IsSetDiv() && bio_src &&
684  bio_src->IsSetOrg() && bio_src->GetOrg().IsSetOrgname() &&
685  bio_src->GetOrg().GetOrgname().IsSetDiv()) {
686  gbb->ResetDiv();
687  } else if (gbb->IsSetDiv() &&
688  bio_src &&
689  bio_src->IsSetOrg() &&
690  bio_src->GetOrg().IsSetOrgname() &&
691  bio_src->GetOrg().GetOrgname().IsSetDiv() &&
692  bio_src->GetOrg().GetOrgname().GetDiv() == gbb->GetDiv()) {
693  gbb->ResetDiv();
694  }
695 
696  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, gbb->SetExtra_accessions());
    /* Success: ownership moves from gbb to the returned reference. */
697  ret.Reset(gbb.Release());
698 
699  return ret;
700 }
701 
702 /**********************************************************
703  *
704  * static MolInfoPtr GetGenBankMolInfo(pp, entry, orp):
705  *
706  * Data from :
707  * LOCUS ... column 37, or column 53 if "EST"
708  *
709  **********************************************************/
710 static CRef<CMolInfo> GetGenBankMolInfo(ParserPtr pp, const DataBlk& entry, const COrg_ref* org_ref)
711 {
712  IndexblkPtr ibp;
713  char* bptr;
714  char* molstr = nullptr;
715 
716  CRef<CMolInfo> mol_info(new CMolInfo);
717 
718  bptr = entry.mOffset;
719  ibp = pp->entrylist[pp->curindx];
720 
721  molstr = bptr + ibp->lc.molecule;
722 
723  bptr = GBDivOffset(entry, ibp->lc.div);
724 
725  if (StringEquN(bptr, "EST", 3))
726  mol_info->SetTech(CMolInfo::eTech_est);
727  else if (StringEquN(bptr, "STS", 3))
728  mol_info->SetTech(CMolInfo::eTech_sts);
729  else if (StringEquN(bptr, "GSS", 3))
730  mol_info->SetTech(CMolInfo::eTech_survey);
731  else if (StringEquN(bptr, "HTG", 3))
732  mol_info->SetTech(CMolInfo::eTech_htgs_1);
733  else if (ibp->is_wgs) {
734  if (ibp->is_tsa)
735  mol_info->SetTech(CMolInfo::eTech_tsa);
736  else if (ibp->is_tls)
737  mol_info->SetTech(CMolInfo::eTech_targeted);
738  else
739  mol_info->SetTech(CMolInfo::eTech_wgs);
740  } else if (ibp->is_tsa)
741  mol_info->SetTech(CMolInfo::eTech_tsa);
742  else if (ibp->is_tls)
743  mol_info->SetTech(CMolInfo::eTech_targeted);
744  else if (ibp->is_mga) {
745  mol_info->SetTech(CMolInfo::eTech_other);
746  mol_info->SetTechexp("cage");
747  }
748 
749  GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), molstr, pp, entry, org_ref);
750  if (mol_info->GetBiomol() == CMolInfo::eBiomol_unknown) // not set
751  mol_info->ResetBiomol();
752 
753  return mol_info;
754 }
755 
756 /**********************************************************/
/* Builds a BioSource descriptor from the entry's organism text and appends it
 * to bioseq's descriptors: the taxname comes from the first line (plus
 * qualifying continuation lines), the lineage from what follows.
 * Posts a warning and adds nothing when no organism text is found; also adds
 * nothing when the organism text has no newline.
 *
 * The entry buffer is modified temporarily (terminators are written and then
 * restored), so it must be writable.
 */
757 static void FakeGenBankBioSources(const DataBlk& entry, CBioseq& bioseq)
758 {
759  char* bptr;
760  char* end;
761  char* ptr;
762 
763  Char ch;
764 
765  size_t len = 0;
    /* NOTE(review): gutter line 766 is missing from this listing. It must have
     * assigned `bptr` (and presumably `len`) from a lookup in `entry`; the call
     * is not visible here -- restore it from the repository.
     */
767 
768  if (! bptr) {
769  ErrPostStr(SEV_WARNING, ERR_ORGANISM_NoOrganism, "No Organism data in genbank format file");
770  return;
771  }
772 
    /* Terminate the block in place; the saved character is put back before returning. */
773  end = bptr + len;
774  ch = *end;
775  *end = '\0';
776 
777  CRef<CBioSource> bio_src(new CBioSource);
778  bptr += ParFlat_COL_DATA;
779 
    /* When a genome is recognised (other than plasmid), skip the first word and the spaces after it. */
780  if (GetGenomeInfo(*bio_src, bptr) && bio_src->GetGenome() != CBioSource::eGenome_plasmid) {
781  while (*bptr != ' ' && *bptr != '\0')
782  bptr++;
783  while (*bptr == ' ')
784  bptr++;
785  }
786 
787  ptr = StringChr(bptr, '\n');
788  if (! ptr) {
789  *end = ch;
790  return;
791  }
792 
793  COrg_ref& org_ref = bio_src->SetOrg();
794 
    /* First line (up to the newline) is the taxname. */
795  *ptr = '\0';
796  org_ref.SetTaxname(bptr);
797  *ptr = '\n';
798 
    /* Continuation lines are appended to the taxname until a line contains ';'
     * or is the last line of the block; bptr is left at the start of the first
     * line that was not appended.
     */
799  for (;;) {
800  bptr = ptr + 1;
    /* NOTE(review): ParFlat_COL_DATA characters are compared against this
     * literal, but whitespace runs appear to be collapsed in this listing; the
     * literal was presumably a run of ParFlat_COL_DATA blanks -- confirm
     * against the repository.
     */
801  if (! StringEquN(bptr, " ", ParFlat_COL_DATA))
802  break;
803 
804  ptr = StringChr(bptr, '\n');
805  if (! ptr)
806  break;
807 
808  *ptr = '\0';
809  if (StringChr(bptr, ';') || ! StringChr(ptr + 1, '\n')) {
810  *ptr = '\n';
811  break;
812  }
813 
814  bptr += ParFlat_COL_DATA;
815  string& taxname = org_ref.SetTaxname();
816  taxname += ' ';
817  taxname += bptr;
818 
819  *ptr = '\n';
820  }
821 
822  *end = ch;
823 
    /* "Unknown." loses its trailing period. */
824  if (org_ref.GetTaxname() == "Unknown.") {
825  string& taxname = org_ref.SetTaxname();
826  taxname.pop_back();
827  }
828 
    /* Whatever remains from bptr to the end of the block is handed over as lineage text. */
829  string s = GetGenBankLineage(string_view(bptr, end - bptr));
830  if (! s.empty()) {
831  org_ref.SetOrgname().SetLineage(std::move(s));
832  }
833 
834  CRef<CSeqdesc> descr(new CSeqdesc);
835  descr->SetSource(*bio_src);
836  bioseq.SetDescr().Set().push_back(descr);
837 }
838 
839 /**********************************************************/
840 static void fta_get_user_field(char* line, const Char* tag, CUser_object& user_obj)
841 {
842  char* p;
843  char* q;
844  char* res;
845  Char ch;
846 
847  p = StringStr(line, "USER ");
848  if (! p)
849  ch = '\0';
850  else {
851  ch = 'U';
852  *p = '\0';
853  }
854 
855  res = StringSave(line);
856  if (ch == 'U')
857  *p = 'U';
858 
859  for (q = res, p = res; *p != '\0'; p++)
860  if (*p != ' ')
861  *q++ = *p;
862  *q = '\0';
863 
864  CRef<CUser_field> root_field(new CUser_field);
865  root_field->SetLabel().SetStr(tag);
866 
867  for (q = res;;) {
868  q = StringStr(q, "\nACCESSION=");
869  if (! q)
870  break;
871 
872  q += 11;
873  for (p = q; *p != '\0' && *p != '\n' && *p != ';';)
874  p++;
875  ch = *p;
876  *p = '\0';
877 
878  CRef<CUser_field> cur_field(new CUser_field);
879  cur_field->SetLabel().SetStr("accession");
880  cur_field->SetString(q);
881 
882  *p = ch;
883 
884  CRef<CUser_field> field_set(new CUser_field);
885  field_set->SetData().SetFields().push_back(cur_field);
886 
887  if (StringEquN(p, ";gi=", 4)) {
888  p += 4;
889  for (q = p; *p >= '0' && *p <= '9';)
890  p++;
891  ch = *p;
892  *p = '\0';
893 
894  cur_field.Reset(new CUser_field);
895  cur_field->SetLabel().SetStr("gi");
896  cur_field->SetNum(atoi(q));
897  field_set->SetData().SetFields().push_back(cur_field);
898 
899  *p = ch;
900  }
901 
902  root_field->SetData().SetFields().push_back(cur_field);
903  }
904 
905  MemFree(res);
906 
907  if (! root_field->IsSetData())
908  return;
909 
910  user_obj.SetData().push_back(root_field);
911 }
912 
913 /**********************************************************/
914 static void fta_get_str_user_field(char* line, const Char* tag, CUser_object& user_obj)
915 {
916  char* p;
917  char* q;
918  char* r;
919  char* res;
920  Char ch;
921 
922  p = StringStr(line, "USER ");
923  if (! p)
924  ch = '\0';
925  else {
926  ch = 'U';
927  *p = '\0';
928  }
929 
930  res = StringNew(StringLen(line));
931  for (q = line; *q == ' ' || *q == '\n';)
932  q++;
933  for (r = res; *q != '\0';) {
934  if (*q != '\n') {
935  *r++ = *q++;
936  continue;
937  }
938  while (*q == ' ' || *q == '\n')
939  q++;
940  if (*q != '\0')
941  *r++ = ' ';
942  }
943  *r = '\0';
944  if (ch == 'U')
945  *p = 'U';
946 
947  if (*res == '\0') {
948  MemFree(res);
949  return;
950  }
951 
952  CRef<CUser_field> field(new CUser_field);
953  field->SetLabel().SetStr(tag);
954  field->SetString(res);
955 
956  MemFree(res);
957 
958  user_obj.SetData().push_back(field);
959 }
960 
961 /**********************************************************/
962 static void fta_get_user_object(CSeq_entry& seq_entry, const DataBlk& entry)
963 {
964  char* p;
965  char* q;
966  char* r;
967  size_t l;
968 
969  p = xSrchNodeType(entry, ParFlat_USER, &l);
970  if (l < ParFlat_COL_DATA)
971  return;
972 
973  q = StringSave(string_view(p, l - 1));
974 
975  CRef<CUser_object> user_obj(new CUser_object);
976  user_obj->SetType().SetStr("RefGeneTracking");
977 
978  for (p = q;;) {
979  p = StringStr(p, "USER ");
980  if (! p)
981  break;
982  for (p += 12; *p == ' ';)
983  p++;
984  for (r = p; *p != '\0' && *p != '\n' && *p != ' ';)
985  p++;
986  if (*p == '\0' || p == r)
987  break;
988  if (StringEquN(r, "Related", 7))
989  fta_get_user_field(p, "Related", *user_obj);
990  else if (StringEquN(r, "Assembly", 8))
991  fta_get_user_field(p, "Assembly", *user_obj);
992  else if (StringEquN(r, "Comment", 7))
993  fta_get_str_user_field(p, "Comment", *user_obj);
994  else
995  continue;
996  }
997 
998  MemFree(q);
999 
1000  if (! user_obj->IsSetData())
1001  return;
1002 
1003  CRef<CSeqdesc> descr(new CSeqdesc);
1004  descr->SetUser(*user_obj);
1005 
1006  if (seq_entry.IsSeq())
1007  seq_entry.SetSeq().SetDescr().Set().push_back(descr);
1008  else
1009  seq_entry.SetSet().SetDescr().Set().push_back(descr);
1010 }
1011 
1012 /**********************************************************/
/* Appends a "CAGE-Tag-List" user-object descriptor to `descrs`, carrying the
 * tag total (`len`) and the first/last accession split at the first '-' of
 * the text's first line. Does nothing when `offset` is null.
 */
1013 static void fta_get_mga_user_object(TSeqdescList& descrs, char* offset, size_t len)
1014 {
1015  char* str;
1016  char* p;
1017 
1018  if (! offset)
1019  return;
1020 
    /* NOTE(review): gutter lines 1020-1021 are missing from this listing. One
     * of them must have assigned `str` (it is freed with MemFree below, so
     * presumably a heap copy derived from `offset`) -- restore from the
     * repository.
     */
1022  p = StringChr(str, '\n');
1023  if (p)
1024  *p = '\0';
    /* Split "first-last" at the first '-'; p then points at the second half. */
1025  p = StringChr(str, '-');
1026  if (p)
1027  *p++ = '\0';
1028 
1029  CRef<CUser_object> user_obj(new CUser_object);
1030 
1031  CObject_id& id = user_obj->SetType();
1032  id.SetStr("CAGE-Tag-List");
1033 
1034  CRef<CUser_field> field(new CUser_field);
1035 
1036  field->SetLabel().SetStr("CAGE_tag_total");
1037  field->SetData().SetInt(static_cast<CUser_field::C_Data::TInt>(len));
1038  user_obj->SetData().push_back(field);
1039 
1040  field.Reset(new CUser_field);
1041 
1042  field->SetLabel().SetStr("CAGE_accession_first");
1043  field->SetData().SetStr(str);
1044  user_obj->SetData().push_back(field);
1045 
1046  field.Reset(new CUser_field);
1047 
1048  field->SetLabel().SetStr("CAGE_accession_last");
    /* NOTE(review): p is null here when the text contains no '-'; confirm that callers guarantee a range, or guard this call. */
1049  field->SetData().SetStr(p);
1050  user_obj->SetData().push_back(field);
1051 
1052  MemFree(str);
1053 
1054  CRef<CSeqdesc> descr(new CSeqdesc);
1055  descr->SetUser(*user_obj);
1056 
1057  descrs.push_back(descr);
1058 }
1059 
1060 /**********************************************************/
/*
 * GetGenBankDescr: fills in the descriptors of `bioseq` for the entry that
 * pp->curindx currently selects.
 *
 * Descriptors are appended to bioseq.SetDescr() in this order: title,
 * publications, MolInfo, GenBank block, user objects and comment text taken
 * from the comment string, and finally the update date. The DBLINK and MGA
 * helpers are also handed the descriptor list.
 *
 * pp     - parser state; curindx selects the index block, and source,
 *          taxserver, xml_comp, no_date and date are consulted here.
 * entry  - parsed entry whose blocks are looked up by type.
 * bioseq - receives the descriptors.
 *
 * Nothing is returned. On a rejecting condition ibp->drop is set to true and
 * the function returns at once; descriptors appended up to that point are
 * left on the bioseq.
 */
1061 static void GetGenBankDescr(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq)
1062 {
1063  IndexblkPtr ibp;
1064 
1065  DataBlkPtr dbp;
1066 
1067  char* offset;
1068  char* p;
1069  char* q;
1070 
1071  bool is_htg;
1072 
  // Index block of the entry currently being processed.
1073  ibp = pp->entrylist[pp->curindx];
1074 
1075  CBioSource* bio_src = nullptr;
1076  COrg_ref* org_ref = nullptr;
1077 
1078  /* ORGANISM
1079  */
1080 
  // Only the first Source descriptor already on the bioseq is used;
  // org_ref stays null when that source has no Org set.
1081  for (auto& descr : bioseq.SetDescr().Set()) {
1082  if (descr->IsSource()) {
1083  bio_src = &(descr->SetSource());
1084  if (bio_src->IsSetOrg())
1085  org_ref = &bio_src->SetOrg();
1086  break;
1087  }
1088  }
1089 
1090  /* MolInfo from LOCUS line
1091  */
1092  CRef<CMolInfo> mol_info = GetGenBankMolInfo(pp, entry, org_ref);
1093 
1094  /* DEFINITION data ==> descr_title
1095  */
1096  size_t len = 0;
  // NOTE(review): `offset` is tested below, but no statement assigning it is
  // present before this point here - confirm against the upstream file.
1098 
1099  string title;
1100  if (offset) {
1101  string str = GetBlkDataReplaceNewLine(string_view(offset, len), ParFlat_COL_DATA);
1102 
  // Drop leading blanks from the definition text.
1103  if (! str.empty() && str.front() == ' ') {
1104  size_t i = 0;
1105  for (char c : str) {
1106  if (c == ' ')
1107  ++i;
1108  else
1109  break;
1110  }
1111  str.erase(0, i);
1112  }
1113 
1114  title.swap(str);
1115 
1116  CRef<CSeqdesc> descr(new CSeqdesc);
1117  descr->SetTitle(title);
1118  bioseq.SetDescr().Set().push_back(descr);
1119 
  // A "TPA:", "TSA:" or "TLS:" prefix on a record that is not flagged as
  // such rejects the entry. The TPA check is skipped for EMBL sources.
1120  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL &&
1121  StringEquN(title.c_str(), "TPA:", 4)) {
1122  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA:\" prefix is present on its definition line. Entry dropped.");
1123  ibp->drop = true;
1124  return;
1125  }
1126  if (ibp->is_tsa == false && StringEquN(title.c_str(), "TSA:", 4)) {
1127  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA:\" prefix is present on its definition line. Entry dropped.");
1128  ibp->drop = true;
1129  return;
1130  }
1131  if (ibp->is_tls == false && StringEquN(title.c_str(), "TLS:", 4)) {
1132  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS:\" prefix is present on its definition line. Entry dropped.");
1133  ibp->drop = true;
1134  return;
1135  }
1136  }
1137 
  // A DBLINK block, when found, is handed to fta_get_dblink_user_object
  // together with dbuop; dbuop is examined again by fta_dblink_has_sra below.
1138  CRef<CUser_object> dbuop;
1139  offset = xSrchNodeType(entry, ParFlat_DBLINK, &len);
1140  if (offset)
1141  fta_get_dblink_user_object(bioseq.SetDescr().Set(), offset, len, pp->source, &ibp->drop, dbuop);
  // NOTE(review): the else-branch below looks incomplete here (an
  // `if (offset)` with no lookup before it and no statement after it) -
  // confirm against the upstream file.
1142  else {
1144  if (offset)
1146  }
1147 
1148  if (ibp->is_mga) {
1149  offset = xSrchNodeType(entry, ParFlat_MGA, &len);
1150  fta_get_mga_user_object(bioseq.SetDescr().Set(), offset, ibp->bases);
1151  }
  // Conversely, a record flagged as TPA, TSA or TLS must carry a matching
  // prefix on its definition line (several TPA_* variants are accepted for
  // TPA); an empty title also rejects the entry.
1152  if (ibp->is_tpa &&
1153  (title.empty() || (! StringEquN(title.c_str(), "TPA:", 4) &&
1154  ! StringEquN(title.c_str(), "TPA_exp:", 8) &&
1155  ! StringEquN(title.c_str(), "TPA_inf:", 8) &&
1156  ! StringEquN(title.c_str(), "TPA_asm:", 8) &&
1157  ! StringEquN(title.c_str(), "TPA_reasm:", 10)))) {
1158  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA:\" prefix on its definition line. Entry dropped.");
1159  ibp->drop = true;
1160  return;
1161  }
1162  if (ibp->is_tsa && ! ibp->is_tpa &&
1163  (title.empty() || ! StringEquN(title.c_str(), "TSA:", 4))) {
1164  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA:\" prefix on its definition line. Entry dropped.");
1165  ibp->drop = true;
1166  return;
1167  }
1168  if (ibp->is_tls && (title.empty() || ! StringEquN(title.c_str(), "TLS:", 4))) {
1169  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS:\" prefix on its definition line. Entry dropped.");
1170  ibp->drop = true;
1171  return;
1172  }
1173 
1174  /* REFERENCE
1175  */
1176  /* pub should be before GBblock because we need patent ref
1177  */
  // Walk the chain returned by TrackNodeType and convert only the blocks
  // whose type is ParFlat_REF_END; each non-empty Pubdesc becomes a descriptor.
1178  dbp = TrackNodeType(entry, ParFlat_REF_END);
1179  for (; dbp; dbp = dbp->mpNext) {
1180  if (dbp->mType != ParFlat_REF_END)
1181  continue;
1182 
1183  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA);
1184  if (pubdesc.NotEmpty()) {
1185  CRef<CSeqdesc> descr(new CSeqdesc);
1186  descr->SetPub(*pubdesc);
1187  bioseq.SetDescr().Set().push_back(descr);
1188  }
1189  }
1190 
  // Same treatment for blocks of type ParFlat_REF_NO_TARGET.
1191  dbp = TrackNodeType(entry, ParFlat_REF_NO_TARGET);
1192  for (; dbp; dbp = dbp->mpNext) {
1193  if (dbp->mType != ParFlat_REF_NO_TARGET)
1194  continue;
1195 
1196  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA);
1197  if (pubdesc.NotEmpty()) {
1198  CRef<CSeqdesc> descr(new CSeqdesc);
1199  descr->SetPub(*pubdesc);
1200  bioseq.SetDescr().Set().push_back(descr);
1201  }
1202  }
1203 
1204  /* GB-block
1205  */
1206  CRef<CGB_block> gbbp = GetGBBlock(pp, entry, *mol_info, bio_src);
1207 
  // For DDBJ/EMBL contig records whose tech is unset or unknown, take the
  // tech from fta_check_con_for_wgs; an unknown result clears the field.
1208  if ((pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL) &&
1209  ibp->is_contig && (! mol_info->IsSetTech() || mol_info->GetTech() == CMolInfo::eTech_unknown)) {
1210  CMolInfo::TTech tech = fta_check_con_for_wgs(bioseq);
1211  if (tech == CMolInfo::eTech_unknown)
1212  mol_info->ResetTech();
1213  else
1214  mol_info->SetTech(tech);
1215  }
1216 
  // The MolInfo descriptor is added only when it carries a biomol or a tech.
1217  if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
1218  CRef<CSeqdesc> descr(new CSeqdesc);
1219  descr->SetMolinfo(*mol_info);
1220  bioseq.SetDescr().Set().push_back(descr);
1221  }
1222 
  // Without a GB-block the entry is dropped (the MolInfo descriptor above may
  // already have been appended).
1223  if (gbbp.Empty()) {
1224  ibp->drop = true;
1225  return;
1226  }
1227 
1228  if (pp->taxserver == 1 && gbbp->IsSetDiv())
1229  fta_fix_orgref_div(bioseq.GetAnnot(), org_ref, *gbbp);
1230 
  // The two calls differ only in the boolean argument: true when the
  // division starts with "CON" (case-insensitive compare), false otherwise.
1231  if (StringEquNI(ibp->division, "CON", 3))
1232  fta_add_hist(pp, bioseq, gbbp->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, true, ibp->acnum);
1233  else
1234  fta_add_hist(pp, bioseq, gbbp->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, false, ibp->acnum);
1235 
1236  {
1237  CRef<CSeqdesc> descr(new CSeqdesc);
1238  descr->SetGenbank(*gbbp);
1239  bioseq.SetDescr().Set().push_back(descr);
1240  }
1241 
  // NOTE(review): the checks below presumably expect `offset`/`len` to
  // describe the PRIMARY block (per the error texts), but no lookup is
  // present before them here - confirm against the upstream file.
  // A non-WGS TPA record without that block is rejected, unless it is
  // inferential/experimental with SRA links in DBLINK, or belongs to a
  // specialist database.
1243  if (! offset && ibp->is_tpa && ibp->is_wgs == false) {
1244  if (ibp->inferential || ibp->experimental) {
1245  if (! fta_dblink_has_sra(dbuop)) {
1246  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA:%s record lacks both AH/PRIMARY linetype and Sequence Read Archive links. Entry dropped.", (ibp->inferential == false) ? "experimental" : "inferential");
1247  ibp->drop = true;
1248  return;
1249  }
1250  } else if (ibp->specialist_db == false) {
1251  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA record lacks required AH/PRIMARY linetype. Entry dropped.");
1252  ibp->drop = true;
1253  return;
1254  }
1255  }
1256 
  // A block that is present but fails fta_parse_tpa_tsa_block drops the entry.
1257  if (offset && len > 0 &&
1258  fta_parse_tpa_tsa_block(bioseq, offset, ibp->acnum, ibp->vernum, len, ParFlat_COL_DATA, ibp->is_tpa) == false) {
1259  ibp->drop = true;
1260  return;
1261  }
1262 
  // is_htg is true only for the HTGS phase 0, 1 and 2 techs.
1263  if (mol_info.NotEmpty() && mol_info->IsSetTech() &&
1264  (mol_info->GetTech() == CMolInfo::eTech_htgs_0 ||
1265  mol_info->GetTech() == CMolInfo::eTech_htgs_1 ||
1266  mol_info->GetTech() == CMolInfo::eTech_htgs_2))
1267  is_htg = true;
1268  else
1269  is_htg = false;
1270 
1271  /* COMMENT data
1272  */
  // NOTE(review): no lookup of the comment block is present before this
  // check here, so `offset`/`len` would still hold the previous values -
  // confirm against the upstream file.
1274  if (offset && len > 0) {
  // In xml_comp mode the HTG flag passed to GetDescrComment is forced to false.
1275  char* str = GetDescrComment(offset, len, ParFlat_COL_DATA, (pp->xml_comp ? false : is_htg), ibp->is_pat);
1276  if (str) {
1277  bool bad = false;
1278  TUserObjVector user_objs;
1279 
1280  fta_parse_structured_comment(str, bad, user_objs);
1281  if (bad) {
1282  ibp->drop = true;
1283  MemFree(str);
1284  return;
1285  }
1286 
1287  for (auto& user_obj : user_objs) {
1288  CRef<CSeqdesc> descr(new CSeqdesc);
1289  descr->SetUser(*user_obj);
1290  bioseq.SetDescr().Set().push_back(descr);
1291  }
1292 
  // xml_comp mode: rewrite str in place (q writes, p reads). A ';' that
  // is followed by ' ' or '~' is turned into a blank, and every run of
  // blanks and tildes is written out as a single space.
1293  if (pp->xml_comp) {
1294  for (q = str, p = q; *p != '\0';) {
1295  if (*p == ';' && (p[1] == ' ' || p[1] == '~'))
1296  *p = ' ';
1297  if (*p == '~' || *p == ' ') {
1298  *q++ = ' ';
1299  for (p++; *p == ' ' || *p == '~';)
1300  p++;
1301  } else
1302  *q++ = *p++;
1303  }
1304  *q = '\0';
1305  }
1306 
  // Whatever text is left becomes a comment descriptor; str is then released.
1307  if (str[0] != 0) {
1308  CRef<CSeqdesc> descr(new CSeqdesc);
1309  descr->SetComment(str);
1310  bioseq.SetDescr().Set().push_back(descr);
1311  }
1312  MemFree(str);
1313  }
1314  }
1315 
1316  /* DATE
1317  */
1318  if (pp->no_date) /* -N in command line means no date */
1319  return;
1320 
  // Either the current time (pp->date) or the date found at offset
  // ibp->lc.date inside the entry text; no descriptor is added when neither
  // yields a date.
1321  CRef<CDate> date;
1322  if (pp->date) /* -L in command line means replace date */
1323  {
1324  CTime time(CTime::eCurrent);
1325  date.Reset(new CDate);
1326  date->SetToTime(time);
1327  } else if (ibp->lc.date > 0) {
1328  CRef<CDate_std> std_date = GetUpdateDate(entry.mOffset + ibp->lc.date, pp->source);
1329  if (std_date.NotEmpty()) {
1330  date.Reset(new CDate);
1331  date->SetStd(*std_date);
1332  }
1333  }
1334 
1335  if (date.NotEmpty()) {
1336  CRef<CSeqdesc> descr(new CSeqdesc);
1337  descr->SetUpdate_date(*date);
1338  bioseq.SetDescr().Set().push_back(descr);
1339  }
1340 }
1341 
1342 /**********************************************************/
1343 static void GenBankGetDivision(char* division, Int4 div, const DataBlk& entry)
1344 {
1345  StringNCpy(division, GBDivOffset(entry, div), 3);
1346  division[3] = '\0';
1347 }
1348 
1349 static void xGenBankGetDivision(char* division, Int4 div, const string& locusText)
1350 {
1351  StringCpy(division, locusText.substr(64, 3).c_str());
1352 }
1353 
1354 /**********************************************************
1355  *
1356  * bool GenBankAscii(pp):
1357  *
1358  * Returns false if an entry block could not be loaded; returns true otherwise.
1359  *
1360  * 3-17-93
1361  *
1362  **********************************************************/
1364 {
1365  Int2 curkw;
1366  int imax;
1367  int segindx;
1368  int total = 0;
1369  int total_long = 0;
1370  int total_dropped = 0;
1371  char* ptr;
1372  char* eptr;
1373  char* div;
1374  unique_ptr<DataBlk, decltype(&xFreeEntry)> pEntry(nullptr, &xFreeEntry);
1375  EntryBlkPtr ebp;
1376 
1377  // unsigned char* dnaconv;
1378  // unsigned char* protconv;
1379  unsigned char* conv;
1380 
1381  TEntryList seq_entries;
1382 
1383  CSeq_loc locs;
1384 
1385  bool seq_long = false;
1386 
1387  IndexblkPtr ibp;
1388  IndexblkPtr tibp;
1389 
1390  auto dnaconv = GetDNAConv(); /* set up sequence alphabets */
1391  auto protconv = GetProteinConv(); /* set up sequence alphabets */
1392 
1393  segindx = -1;
1394 
1395  imax = pp->indx;
1396  for (int i = 0; i < imax; i++) {
1397  pp->curindx = i;
1398  ibp = pp->entrylist[i];
1399 
1400  err_install(ibp, pp->accver);
1401 
1402  if (ibp->segnum == 1)
1403  segindx = i;
1404 
1405  if (ibp->drop && ibp->segnum == 0) {
1406  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1407  total_dropped++;
1408  continue;
1409  }
1410 
1411  pEntry.reset(LoadEntry(pp, ibp->offset, ibp->len));
1412  if (! pEntry) {
1414  // dnaconv.reset();
1415  // protconv.reset();
1416  return false;
1417  }
1418 
1419  ebp = static_cast<EntryBlk*>(pEntry->mpData);
1420  ptr = pEntry->mOffset;
1421  eptr = ptr + pEntry->len;
1422  curkw = ParFlat_LOCUS;
1423  while (curkw != ParFlat_END && ptr < eptr) {
1424  ptr = GetGenBankBlock(&ebp->chain, ptr, &curkw, eptr);
1425  }
1426 
1427  auto ppCurrentEntry = pp->entrylist[pp->curindx];
1428  if (ppCurrentEntry->lc.div > -1) {
1429  GenBankGetDivision(ppCurrentEntry->division, ppCurrentEntry->lc.div, *pEntry);
1430  if (StringEqu(ibp->division, "TSA")) {
1431  if (ibp->tsa_allowed == false)
1432  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
1433  ibp->is_tsa = true;
1434  }
1435  }
1436 
1437  CheckContigEverywhere(ibp, pp->source);
1438  if (ibp->drop && ibp->segnum == 0) {
1439  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1440  total_dropped++;
1441  continue;
1442  }
1443 
1444  if (ptr >= eptr) {
1445  ibp->drop = true;
1446  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry. Entry dropped.");
1447  if (ibp->segnum == 0) {
1448  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1449  total_dropped++;
1450  continue;
1451  }
1452  }
1453  GetGenBankSubBlock(*pEntry, ibp->bases);
1454 
1455  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
1456  ebp->seq_entry.Reset(new CSeq_entry);
1457  ebp->seq_entry->SetSeq(*bioseq);
1458  GetScope().AddBioseq(*bioseq);
1459 
1460  AddNIDSeqId(*bioseq, *pEntry, ParFlat_NCBI_GI, ParFlat_COL_DATA, pp->source);
1461 
1462  if (StringEquN(pEntry->mOffset + ibp->lc.bp, "aa", 2)) {
1463  ibp->is_prot = true;
1464  conv = protconv.get();
1465  } else {
1466  ibp->is_prot = false;
1467  conv = dnaconv.get();
1468  }
1469 
1470 
1471  if (! GetGenBankInst(pp, *pEntry, conv)) {
1472  ibp->drop = true;
1473  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data. Entry dropped.");
1474  if (ibp->segnum == 0) {
1475  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1476  total_dropped++;
1477  continue;
1478  }
1479  }
1480 
1481  FakeGenBankBioSources(*pEntry, *bioseq);
1482  LoadFeat(pp, *pEntry, *bioseq);
1483 
1484  if (! bioseq->IsSetAnnot() && ibp->drop && ibp->segnum == 0) {
1485  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1486  total_dropped++;
1487  continue;
1488  }
1489 
1490  GetGenBankDescr(pp, *pEntry, *bioseq);
1491  if (ibp->drop && ibp->segnum == 0) {
1492  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1493  total_dropped++;
1494  continue;
1495  }
1496 
1497  fta_set_molinfo_completeness(*bioseq, ibp);
1498 
1499  if (ibp->is_tsa)
1500  fta_tsa_tls_comment_dblink_check(*bioseq, true);
1501 
1502  if (ibp->is_tls)
1503  fta_tsa_tls_comment_dblink_check(*bioseq, false);
1504 
1505  if (bioseq->GetInst().IsNa()) {
1506  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
1507  if (ibp->gaps)
1508  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1509  else if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
1510  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
1511  SeqToDelta(*bioseq, ibp->htg);
1512  } else if (ibp->gaps)
1513  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1514  }
1515 
1516  if (no_date(pp->format, bioseq->GetDescr().Get()) && pp->debug == false &&
1517  pp->no_date == false &&
1518  pp->mode != Parser::EMode::Relaxed) {
1519  ibp->drop = true;
1520  ErrPostStr(SEV_ERROR, ERR_DATE_IllegalDate, "Illegal create date. Entry dropped.");
1521  if (ibp->segnum == 0) {
1522  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1523  total_dropped++;
1524  continue;
1525  }
1526  }
1527 
1528  if (pEntry->mpQscore.empty() && pp->accver) {
1529  if (pp->ff_get_qscore)
1530  pEntry->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
1531  else if (pp->ff_get_qscore_pp)
1532  pEntry->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
1533  if (pp->qsfd && ibp->qslength > 0)
1534  pEntry->mpQscore = GetQSFromFile(pp->qsfd, ibp);
1535  }
1536 
1537  if (! QscoreToSeqAnnot(pEntry->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, true)) {
1538  if (pp->ign_bad_qs == false) {
1539  ibp->drop = true;
1540  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore. Entry dropped.");
1541  if (ibp->segnum == 0) {
1542  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1543  total_dropped++;
1544  continue;
1545  }
1546  } else {
1547  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore.");
1548  }
1549  }
1550 
1551  pEntry->mpQscore.clear();
1552 
1553  if (ibp->psip.NotEmpty()) {
1554  CRef<CSeq_id> id(new CSeq_id);
1555  id->SetPatent(*ibp->psip);
1556  bioseq->SetId().push_back(id);
1557  ibp->psip.Reset();
1558  }
1559 
1560  /* add PatentSeqId if patent is found in reference
1561  */
1562  if (pp->mode != Parser::EMode::Relaxed &&
1563  pp->debug == false &&
1564  ibp->wgs_and_gi != 3 &&
1565  no_reference(*bioseq)) {
1566  if (pp->source == Parser::ESource::Flybase) {
1567  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for entry from FlyBase. Continue anyway.");
1568  } else if (pp->source == Parser::ESource::Refseq &&
1569  StringEquN(ibp->acnum, "NW_", 3)) {
1570  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for RefSeq's NW_ entry. Continue anyway.");
1571  } else if (ibp->is_wgs) {
1572  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for WGS entry. Continue anyway.");
1573  } else {
1574  ibp->drop = true;
1575  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references. Entry dropped.");
1576  if (ibp->segnum == 0) {
1577  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1578  total_dropped++;
1579  continue;
1580  }
1581  }
1582  }
1583 
1584  if (ibp->segnum == ibp->segtotal) {
1585  seq_entries.push_back(ebp->seq_entry);
1586  ebp->seq_entry.Reset();
1587 
1588  if (ibp->segnum < 2) {
1589  if (ibp->segnum != 0) {
1590  ErrPostEx(SEV_WARNING, ERR_SEGMENT_OnlyOneMember, "Segmented set contains only one member.");
1591  }
1592  segindx = i;
1593  } else {
1594  GetSeqExt(pp, locs);
1595  // LCOV_EXCL_START
1596  // Excluded per Mark's request on 12/14/2016
1597  BuildBioSegHeader(pp, seq_entries, locs);
1598  // LCOV_EXCL_STOP
1599  }
1600 
1601  /* reject the whole set if any one entry was rejected
1602  */
1603  if (ibp->segnum != 0) {
1604  div = pp->entrylist[segindx]->division;
1605  int j = segindx;
1606  for (; j <= i; j++) {
1607  tibp = pp->entrylist[j];
1608  err_install(tibp, pp->accver);
1609  if (! StringEqu(div, tibp->division)) {
1610  ErrPostEx(SEV_WARNING, ERR_DIVISION_Mismatch, "Division different in segmented set: %s: %s", div, tibp->division);
1611  }
1612  if (tibp->drop) {
1613  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set");
1614  break;
1615  }
1616  }
1617  if (j <= i) {
1618  for (j = segindx; j <= i; j++) {
1619  tibp = pp->entrylist[j];
1620  err_install(tibp, pp->accver);
1621  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1622  total_dropped++;
1623  }
1624 
1625  seq_entries.clear();
1626  continue;
1627  }
1628  }
1629 
1630  DealWithGenes(seq_entries, pp);
1631 
1632  if (seq_entries.empty()) {
1633  if (ibp->segnum != 0) {
1634  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set.");
1635  int j = segindx;
1636  for (; j <= i; j++) {
1637  tibp = pp->entrylist[j];
1638  err_install(tibp, pp->accver);
1639  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1640  total_dropped++;
1641  }
1642  } else {
1643  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1644  total_dropped++;
1645  }
1646  continue;
1647  }
1648 
1649  if (pp->source == Parser::ESource::Flybase && ! seq_entries.empty())
1650  fta_get_user_object(*(*seq_entries.begin()), *pEntry);
1651 
1652  /* remove out all the features if their seqloc has
1653  * "join" or "order" among other segments, to the annot
1654  * which in class = parts
1655  */
1656  if (ibp->segnum != 0)
1657  // LCOV_EXCL_START
1658  // Excluded per Mark's request on 12/14/2016
1659  CheckFeatSeqLoc(seq_entries);
1660  // LCOV_EXCL_STOP
1661 
1662  fta_find_pub_explore(pp, seq_entries);
1663 
1664  /* change qual "citation" on features to SeqFeat.cit
1665  * find citation in the list by serial_number.
1666  * If serial number not found remove /citation
1667  */
1668  ProcessCitations(seq_entries);
1669 
1670  /* check for long sequences in each segment */
1671  if (pp->limit != 0) {
1672  if (ibp->segnum != 0) {
1673  int j = segindx;
1674  for (; j <= i; j++) {
1675  tibp = pp->entrylist[j];
1676  err_install(tibp, pp->accver);
1677  if (tibp->bases <= (size_t)pp->limit)
1678  continue;
1679 
1680  if (tibp->htg == 1 || tibp->htg == 2 || tibp->htg == 4) {
1681  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", tibp->locusname, tibp->acnum, pp->limit);
1682  } else {
1683  seq_long = true;
1684  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", tibp->locusname, tibp->acnum, pp->limit);
1685  }
1686  }
1687  } else if (ibp->bases > (size_t)pp->limit) {
1688  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4) {
1689  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
1690  } else {
1691  seq_long = true;
1692  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", ibp->locusname, ibp->acnum, pp->limit);
1693  }
1694  }
1695  }
1696  if (pp->mode == Parser::EMode::Relaxed) {
1697  for (auto pEntry : seq_entries) {
1698  auto pScope = Ref(new CScope(*CObjectManager::GetInstance()));
1699  g_InstantiateMissingProteins(pScope->AddTopLevelSeqEntry(*pEntry));
1700  }
1701  }
1702  if (pp->convert) {
1703  if (pp->cleanup <= 1) {
1704  FinalCleanup(seq_entries);
1705 
1706  if (pp->qamode && ! seq_entries.empty())
1707  fta_remove_cleanup_user_object(*seq_entries.front());
1708  }
1709 
1710  MaybeCutGbblockSource(seq_entries);
1711  }
1712 
1713  EntryCheckDivCode(seq_entries, pp);
1714 
1715  if (pp->xml_comp)
1716  fta_set_strandedness(seq_entries);
1717 
1718  if (fta_EntryCheckGBBlock(seq_entries)) {
1719  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty, "Attention: GBBlock is not empty");
1720  }
1721 
1722  /* check for identical features
1723  */
1724  if (pp->qamode) {
1725  fta_sort_descr(seq_entries);
1726  fta_sort_seqfeat_cit(seq_entries);
1727  }
1728 
1729  if (pp->citat) {
1730  StripSerialNumbers(seq_entries);
1731  }
1732 
1733  PackEntries(seq_entries);
1734  CheckDupDates(seq_entries);
1735 
1736  if (ibp->segnum != 0) {
1737  int j = segindx;
1738  for (; j <= i; j++)
1739  err_install(pp->entrylist[j], pp->accver);
1740  }
1741  if (seq_long) {
1742  seq_long = false;
1743  if (ibp->segnum != 0)
1744  total_long += (i - segindx + 1);
1745  else
1746  total_long++;
1747  } else {
1748  pp->entries.splice(pp->entries.end(), seq_entries);
1749 
1750  if (ibp->segnum != 0)
1751  total += (i - segindx + 1);
1752  else
1753  total++;
1754  }
1755 
1756  if (ibp->segnum != 0) {
1757  for (int j = segindx; j <= i; j++) {
1758  tibp = pp->entrylist[j];
1759  err_install(tibp, pp->accver);
1760  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", tibp->locusname, tibp->acnum);
1761  }
1762  } else {
1763  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
1764  }
1765 
1766  seq_entries.clear();
1767  } else {
1768  GetSeqExt(pp, locs);
1769 
1770  seq_entries.push_back(ebp->seq_entry);
1771  ebp->seq_entry.Reset();
1772  }
1773 
1774  } /* for, ascii block entries */
1775 
1777 
1778  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d (including: LONG ones = %d); SKIPPED = %d.", total, total_long, total_dropped);
1779  // MemFree(dnaconv);
1780  // MemFree(protconv);
1781 
1782  return true;
1783 }
1785 {
1786  int imax;
1787  int total = 0;
1788  int total_long = 0;
1789  int total_dropped = 0;
1790  unique_ptr<Entry> pEntry;
1791  unsigned char* conv;
1792 
1793  TEntryList seq_entries;
1794 
1795  CSeq_loc locs;
1796 
1797  IndexblkPtr ibp;
1798 
1799  auto dnaconv = GetDNAConv(); /* set up sequence alphabets */
1800  auto protconv = GetProteinConv(); /* set up sequence alphabets */
1801 
1802  imax = pp->indx;
1803  for (int i = 0; i < imax; i++) {
1804  pp->curindx = i;
1805  ibp = pp->entrylist[i];
1806 
1807  err_install(ibp, pp->accver);
1808 
1809  if (ibp->drop && ibp->segnum == 0) {
1810  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1811  total_dropped++;
1812  continue;
1813  }
1814 
1815  pEntry.reset(LoadEntryGenbank(pp, ibp->offset, ibp->len));
1816  if (! pEntry) {
1818  return false;
1819  }
1820 
1821  xGetGenBankBlocks(*pEntry);
1822 
1823  if (pp->entrylist[pp->curindx]->lc.div > -1) {
1824  xGenBankGetDivision(pp->entrylist[pp->curindx]->division, pp->entrylist[pp->curindx]->lc.div, pEntry->mBaseData);
1825  if (StringEqu(ibp->division, "TSA")) {
1826  if (ibp->tsa_allowed == false)
1827  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
1828  ibp->is_tsa = true;
1829  }
1830  }
1831 
1832  CheckContigEverywhere(ibp, pp->source);
1833  if (ibp->drop && ibp->segnum == 0) {
1834  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1835  total_dropped++;
1836  continue;
1837  }
1838 
1839  auto lastType = pEntry->mSections.back()->mType;
1840  if (lastType != ParFlat_END) {
1841  ibp->drop = true;
1842  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry. Entry dropped.");
1843  if (ibp->segnum == 0) {
1844  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1845  total_dropped++;
1846  continue;
1847  }
1848  }
1849  xGetGenBankSubBlocks(*pEntry, ibp->bases);
1850 
1851  CRef<CBioseq> pBioseq = CreateEntryBioseq(pp);
1852  pEntry->mSeqEntry.Reset(new CSeq_entry);
1853  pEntry->mSeqEntry->SetSeq(*pBioseq);
1854  GetScope().AddBioseq(*pBioseq);
1855  pEntry->xInitNidSeqId(*pBioseq, ParFlat_NCBI_GI, ParFlat_COL_DATA, pp->source);
1856 
1857  if (pEntry->IsAA()) {
1858  ibp->is_prot = true;
1859  conv = protconv.get();
1860  } else {
1861  ibp->is_prot = false;
1862  conv = dnaconv.get();
1863  }
1864 
1865  if (! pEntry->xInitSeqInst(conv)) {
1866  ibp->drop = true;
1867  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data. Entry dropped.");
1868  if (ibp->segnum == 0) {
1869  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1870  total_dropped++;
1871  continue;
1872  }
1873  }
1874  return false;
1875 
1876  /*FakeGenBankBioSources(*pEntry, *bioseq);
1877  LoadFeat(pp, *pEntry, *bioseq);
1878 
1879  if (! bioseq->IsSetAnnot() && ibp->drop && ibp->segnum == 0)
1880  {
1881  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1882  "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1883  total_dropped++;
1884  continue;
1885  }
1886 
1887  GetGenBankDescr(pp, *pEntry, *bioseq);
1888  if (ibp->drop && ibp->segnum == 0)
1889  {
1890  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1891  "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1892  total_dropped++;
1893  continue;
1894  }
1895 
1896  fta_set_molinfo_completeness(*bioseq, ibp);
1897 
1898  if (ibp->is_tsa)
1899  fta_tsa_tls_comment_dblink_check(*bioseq, true);
1900 
1901  if (ibp->is_tls)
1902  fta_tsa_tls_comment_dblink_check(*bioseq, false);
1903 
1904  if (bioseq->GetInst().IsNa())
1905  {
1906  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw)
1907  {
1908  if (ibp->gaps)
1909  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1910  else if(ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
1911  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
1912  SeqToDelta(*bioseq, ibp->htg);
1913  }
1914  else if (ibp->gaps)
1915  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1916  }
1917 
1918  if (no_date(pp->format, bioseq->GetDescr().Get()) && pp->debug == false &&
1919  pp->no_date == false &&
1920  pp->mode != Parser::EMode::Relaxed)
1921  {
1922  ibp->drop = true;
1923  ErrPostStr(SEV_ERROR, ERR_DATE_IllegalDate,
1924  "Illegal create date. Entry dropped.");
1925  if(ibp->segnum == 0)
1926  {
1927  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1928  "Entry skipped: \"%s|%s\".",
1929  ibp->locusname, ibp->acnum);
1930  total_dropped++;
1931  continue;
1932  }
1933  }
1934 
1935  if (! pEntry->mpQscore && pp->accver)
1936  {
1937  if (pp->ff_get_qscore)
1938  pEntry->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
1939  else if (pp->ff_get_qscore_pp)
1940  pEntry->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
1941  if (pp->qsfd && ibp->qslength > 0)
1942  pEntry->mpQscore = GetQSFromFile(pp->qsfd, ibp);
1943  }
1944 
1945  if (!QscoreToSeqAnnot(pEntry->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, true))
1946  {
1947  if(pp->ign_bad_qs == false)
1948  {
1949  ibp->drop = true;
1950  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse,
1951  "Error while parsing QScore. Entry dropped.");
1952  if(ibp->segnum == 0)
1953  {
1954  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1955  "Entry skipped: \"%s|%s\".",
1956  ibp->locusname, ibp->acnum);
1957  total_dropped++;
1958  continue;
1959  }
1960  }
1961  else
1962  {
1963  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse,
1964  "Error while parsing QScore.");
1965  }
1966  }
1967 
1968  if (pEntry->mpQscore)
1969  {
1970  MemFree(pEntry->mpQscore);
1971  pEntry->mpQscore = nullptr;
1972  }
1973 
1974  if (ibp->psip.NotEmpty())
1975  {
1976  CRef<CSeq_id> id(new CSeq_id);
1977  id->SetPatent(*ibp->psip);
1978  bioseq->SetId().push_back(id);
1979  ibp->psip.Reset();
1980  }
1981 
1982  // add PatentSeqId if patent is found in reference
1983  //
1984  if(pp->mode != Parser::EMode::Relaxed &&
1985  pp->debug == false &&
1986  ibp->wgs_and_gi != 3 &&
1987  no_reference(*bioseq))
1988  {
1989  if(pp->source == Parser::ESource::Flybase)
1990  {
1991  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
1992  "No references for entry from FlyBase. Continue anyway.");
1993  }
1994  else if(pp->source == Parser::ESource::Refseq &&
1995  StringEquN(ibp->acnum, "NW_", 3))
1996  {
1997  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
1998  "No references for RefSeq's NW_ entry. Continue anyway.");
1999  }
2000  else if(ibp->is_wgs)
2001  {
2002  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
2003  "No references for WGS entry. Continue anyway.");
2004  }
2005  else
2006  {
2007  ibp->drop = true;
2008  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
2009  "No references. Entry dropped.");
2010  if(ibp->segnum == 0)
2011  {
2012  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2013  "Entry skipped: \"%s|%s\".",
2014  ibp->locusname, ibp->acnum);
2015  total_dropped++;
2016  continue;
2017  }
2018  }
2019  }
2020 
2021  if (ibp->segnum == ibp->segtotal)
2022  {
2023  seq_entries.push_back(ebp->seq_entry);
2024  ebp->seq_entry.Reset();
2025 
2026  if (ibp->segnum < 2)
2027  {
2028  if(ibp->segnum != 0)
2029  {
2030  ErrPostEx(SEV_WARNING, ERR_SEGMENT_OnlyOneMember,
2031  "Segmented set contains only one member.");
2032  }
2033  segindx = i;
2034  }
2035  else
2036  {
2037  GetSeqExt(pp, locs);
2038 // LCOV_EXCL_START
2039 // Excluded per Mark's request on 12/14/2016
2040  BuildBioSegHeader(pp, seq_entries, locs);
2041 // LCOV_EXCL_STOP
2042  }
2043 
2044  // reject the whole set if any one entry was rejected
2045  //
2046  if(ibp->segnum != 0)
2047  {
2048  div = pp->entrylist[segindx]->division;
2049  int j = segindx;
2050  for(; j <= i; j++)
2051  {
2052  tibp = pp->entrylist[j];
2053  err_install(tibp, pp->accver);
2054  if (! StringEqu(div, tibp->division))
2055  {
2056  ErrPostEx(SEV_WARNING, ERR_DIVISION_Mismatch,
2057  "Division different in segmented set: %s: %s",
2058  div, tibp->division);
2059  }
2060  if (tibp->drop)
2061  {
2062  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected,
2063  "Reject the whole segmented set");
2064  break;
2065  }
2066  }
2067  if(j <= i)
2068  {
2069  for(j = segindx; j <= i; j++)
2070  {
2071  tibp = pp->entrylist[j];
2072  err_install(tibp, pp->accver);
2073  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2074  "Entry skipped: \"%s|%s\".",
2075  tibp->locusname, tibp->acnum);
2076  total_dropped++;
2077  }
2078 
2079  seq_entries.clear();
2080  continue;
2081  }
2082  }
2083 
2084  DealWithGenes(seq_entries, pp);
2085 
2086  if (seq_entries.empty())
2087  {
2088  if(ibp->segnum != 0)
2089  {
2090  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected,
2091  "Reject the whole segmented set.");
2092  int j = segindx;
2093  for(; j <= i; j++)
2094  {
2095  tibp = pp->entrylist[j];
2096  err_install(tibp, pp->accver);
2097  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2098  "Entry skipped: \"%s|%s\".",
2099  tibp->locusname, tibp->acnum);
2100  total_dropped++;
2101  }
2102  }
2103  else
2104  {
2105  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2106  "Entry skipped: \"%s|%s\".",
2107  ibp->locusname, ibp->acnum);
2108  total_dropped++;
2109  }
2110  continue;
2111  }
2112 
2113  if (pp->source == Parser::ESource::Flybase && !seq_entries.empty())
2114  fta_get_user_object(*(*seq_entries.begin()), *pEntry);
2115 
2116  // remove out all the features if their seqloc has
2117  // "join" or "order" among other segments, to the annot
2118  // which in class = parts
2119  //
2120  if(ibp->segnum != 0)
2121 // LCOV_EXCL_START
2122 // Excluded per Mark's request on 12/14/2016
2123  CheckFeatSeqLoc(seq_entries);
2124 // LCOV_EXCL_STOP
2125 
2126  fta_find_pub_explore(pp, seq_entries);
2127 
2128  // change qual "citation" on features to SeqFeat.cit
2129  // find citation in the list by serial_number.
2130  // If serial number not found remove /citation
2131  //
2132  ProcessCitations(seq_entries);
2133 
2134  // check for long sequences in each segment
2135  //
2136  if(pp->limit != 0)
2137  {
2138  if(ibp->segnum != 0)
2139  {
2140  int j = segindx;
2141  for(; j <= i; j++)
2142  {
2143  tibp = pp->entrylist[j];
2144  err_install(tibp, pp->accver);
2145  if(tibp->bases <= (size_t) pp->limit)
2146  continue;
2147 
2148  if(tibp->htg == 1 || tibp->htg == 2 || tibp->htg == 4)
2149  {
2150  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence,
2151  "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem",
2152  tibp->locusname, tibp->acnum, pp->limit);
2153  }
2154  else
2155  {
2156  seq_long = true;
2157  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence,
2158  "Sequence %s|%s is longer than limit %ld",
2159  tibp->locusname, tibp->acnum, pp->limit);
2160  }
2161  }
2162  }
2163  else if(ibp->bases > (size_t) pp->limit)
2164  {
2165  if(ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)
2166  {
2167  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence,
2168  "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem",
2169  ibp->locusname, ibp->acnum, pp->limit);
2170  }
2171  else
2172  {
2173  seq_long = true;
2174  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence,
2175  "Sequence %s|%s is longer than limit %ld",
2176  ibp->locusname, ibp->acnum, pp->limit);
2177  }
2178  }
2179  }
2180  if (pp->mode == Parser::EMode::Relaxed) {
2181  for(auto pEntry : seq_entries) {
2182  auto pScope = Ref(new CScope(*CObjectManager::GetInstance()));
2183  g_InstantiateMissingProteins(pScope->AddTopLevelSeqEntry(*pEntry));
2184  }
2185  }
2186  if (pp->convert)
2187  {
2188  if(pp->cleanup <= 1)
2189  {
2190  FinalCleanup(seq_entries);
2191 
2192  if (pp->qamode && !seq_entries.empty())
2193  fta_remove_cleanup_user_object(*seq_entries.front());
2194  }
2195 
2196  MaybeCutGbblockSource(seq_entries);
2197  }
2198 
2199  EntryCheckDivCode(seq_entries, pp);
2200 
2201  if(pp->xml_comp)
2202  fta_set_strandedness(seq_entries);
2203 
2204  if (fta_EntryCheckGBBlock(seq_entries))
2205  {
2206  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty,
2207  "Attention: GBBlock is not empty");
2208  }
2209 
2210  // check for identical features
2211  //
2212  if(pp->qamode)
2213  {
2214  fta_sort_descr(seq_entries);
2215  fta_sort_seqfeat_cit(seq_entries);
2216  }
2217 
2218  if (pp->citat)
2219  {
2220  StripSerialNumbers(seq_entries);
2221  }
2222 
2223  PackEntries(seq_entries);
2224  CheckDupDates(seq_entries);
2225 
2226  if(ibp->segnum != 0) {
2227  int j = segindx;
2228  for(; j <= i; j++)
2229  err_install(pp->entrylist[j], pp->accver);
2230  }
2231  if (seq_long)
2232  {
2233  seq_long = false;
2234  if(ibp->segnum != 0)
2235  total_long += (i - segindx + 1);
2236  else
2237  total_long++;
2238  }
2239  else
2240  {
2241  pp->entries.splice(pp->entries.end(), seq_entries);
2242 
2243  if(ibp->segnum != 0)
2244  total += (i - segindx + 1);
2245  else
2246  total++;
2247  }
2248 
2249  if(ibp->segnum != 0)
2250  {
2251  for(int j = segindx; j <= i; j++)
2252  {
2253  tibp = pp->entrylist[j];
2254  err_install(tibp, pp->accver);
2255  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed,
2256  "OK - entry parsed successfully: \"%s|%s\".",
2257  tibp->locusname, tibp->acnum);
2258  }
2259  }
2260  else
2261  {
2262  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed,
2263  "OK - entry parsed successfully: \"%s|%s\".",
2264  ibp->locusname, ibp->acnum);
2265  }
2266 
2267  seq_entries.clear();
2268  }
2269  else
2270  {
2271  GetSeqExt(pp, locs);
2272 
2273  seq_entries.push_back(ebp->seq_entry);
2274  ebp->seq_entry.Reset();
2275  }
2276 
2277  */} // for, ascii block entries
2278 
2280 
2281  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d (including: LONG ones = %d); SKIPPED = %d.", total, total_long, total_dropped);
2282  // MemFree(dnaconv);
2283  // MemFree(protconv);
2284 
2285  return false;
2286 }
2287 
2288 // LCOV_EXCL_START
2289 // Excluded per Mark's request on 12/14/2016
2290 /**********************************************************
2291  *
2292  * static void SrchFeatSeqLoc(sslbp, sfp):
2293  *
2294  * 9-14-93
2295  *
2296  **********************************************************/
2298 {
2299  for (CSeq_annot::C_Data::TFtable::iterator feat = feat_table.begin(); feat != feat_table.end();) {
2300  if ((*feat)->IsSetLocation() && (*feat)->GetLocation().GetId()) {
2301  ++feat;
2302  continue;
2303  }
2304 
2305  /* SeqLocId will return NULL if any one of seqid in the SeqLocPtr
2306  * is diffenent, so move out cursfp to sslbp
2307  */
2308 
2309  feats.push_back(*feat);
2310  feat = feat_table.erase(feat);
2311  }
2312 }
2313 
2314 /**********************************************************
2315  *
2316  * static void FindFeatSeqLoc(sep, data, index, indent):
2317  *
2318  * 9-14-93
2319  *
2320  **********************************************************/
2321 static void FindFeatSeqLoc(TEntryList& seq_entries, TSeqFeatList& feats)
2322 {
2323  for (auto& entry : seq_entries) {
2324  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2325  const CSeq_id& first_id = *(*bioseq->GetId().begin());
2326  if (IsSegBioseq(first_id) || ! bioseq->IsSetAnnot())
2327  continue;
2328 
2329  /* process this bioseq entry */
2330  CBioseq::TAnnot annots = bioseq->SetAnnot();
2331  for (CBioseq::TAnnot::iterator annot = annots.begin(); annot != annots.end();) {
2332  if (! (*annot)->IsSetData() || ! (*annot)->GetData().IsFtable()) {
2333  ++annot;
2334  continue;
2335  }
2336 
2337  CSeq_annot::C_Data::TFtable& feat_table = (*annot)->SetData().SetFtable();
2338  SrchFeatSeqLoc(feats, feat_table);
2339 
2340  if (! feat_table.empty()) {
2341  ++annot;
2342  continue;
2343  }
2344 
2345  annot = annots.erase(annot);
2346  }
2347  }
2348  }
2349 }
2350 
2351 /**********************************************************/
2352 static CBioseq_set* GetParts(TEntryList& seq_entries)
2353 {
2354  for (auto& entry : seq_entries) {
2355  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
2356  if (bio_set->IsSetClass() && bio_set->GetClass() == CBioseq_set::eClass_parts)
2357  return bio_set.operator->();
2358  }
2359  }
2360 
2361  return nullptr;
2362 }
2363 
2364 /**********************************************************
2365  *
2366  * void CheckFeatSeqLoc(sep):
2367  *
2368  * Remove out all the features which its seqloc has
2369  * "join" or "order" among other segments, then insert
2370  * into the annot which in the level of the class = parts.
2371  *
2372  * 9-14-93
2373  *
2374  **********************************************************/
2375 void CheckFeatSeqLoc(TEntryList& seq_entries)
2376 {
2377  TSeqFeatList feats_no_id;
2378  FindFeatSeqLoc(seq_entries, feats_no_id);
2379 
2380  CBioseq_set* parts = GetParts(seq_entries);
2381 
2382  if (! feats_no_id.empty() && parts) /* may need to delete duplicate
2383  one 9-14-93 */
2384  {
2385  for (auto& annot : parts->SetAnnot()) {
2386  if (! annot->IsFtable())
2387  continue;
2388 
2389  annot->SetData().SetFtable().splice(annot->SetData().SetFtable().end(), feats_no_id);
2390  break;
2391  }
2392 
2393  if (parts->GetAnnot().empty()) {
2394  CRef<CSeq_annot> new_annot(new CSeq_annot);
2395  new_annot->SetData().SetFtable().swap(feats_no_id);
2396  parts->SetAnnot().push_back(new_annot);
2397  }
2398  }
2399 }
2400 
2402 // LCOV_EXCL_STOP
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:220
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:505
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
Definition: add.cpp:2075
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:913
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2765
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:793
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:339
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1118
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2668
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
Definition: add.cpp:1610
bool check_cds(const DataBlk &entry, Parser::EFormat format)
Definition: add.cpp:258
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2790
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2720
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2687
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2831
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:387
void fta_get_dblink_user_object(TSeqdescList &descrs, char *offset, size_t len, Parser::ESource source, bool *drop, CRef< CUser_object > &dbuop)
Definition: add.cpp:1950
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:302
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
Definition: add.cpp:190
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2552
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, const char *location, const char *name, bool iscon)
Definition: add.cpp:2299
void GetGenBankSubBlock(const DataBlk &entry, size_t bases)
Definition: asci_blk.cpp:454
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3406
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
Definition: asci_blk.cpp:2726
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3270
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
Definition: asci_blk.cpp:2821
bool IsSegBioseq(const CSeq_id &id)
Definition: asci_blk.cpp:2540
void xGetGenBankSubBlocks(Entry &entry, size_t bases)
Definition: asci_blk.cpp:493
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3242
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3506
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3341
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2946
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1790
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1818
char * GetDescrComment(char *offset, size_t len, Uint2 col_data, bool is_htg, bool is_pat)
Definition: asci_blk.cpp:1159
void GetSequenceOfKeywords(const DataBlk &entry, int type, Uint2 col_data, TKeywordList &keywords)
Definition: asci_blk.cpp:1551
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2810
char * GetGenBankBlock(DataBlkPtr *chain, char *ptr, Int2 *retkw, char *eptr)
Definition: asci_blk.cpp:284
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2473
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1678
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3113
char * SrchNodeSubType(const DataBlk &entry, Int2 type, Int2 subtype, size_t *len)
Definition: asci_blk.cpp:1039
void xGetGenBankBlocks(Entry &entry)
Definition: asci_blk.cpp:256
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3185
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2498
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1321
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2569
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1074
void xFreeEntry(DataBlkPtr entry)
Definition: block.cpp:109
void ProcessCitations(TEntryList &seq_entries)
Definition: citation.cpp:307
Definition: Date.hpp:53
void SetToTime(const CTime &time, EPrecision prec=ePrecision_second)
Definition: Date.cpp:57
CScope –.
Definition: scope.hpp:92
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CTime –.
Definition: ncbitime.hpp:296
CUser_field & SetString(const char *value)
Definition: User_field.cpp:445
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
int mType
Definition: ftablock.h:327
EntryPtr LoadEntryGenbank(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:217
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
void g_InstantiateMissingProteins(CSeq_entry_Handle entryHandle)
Definition: fcleanup.cpp:335
void FinalCleanup(TEntryList &seq_entries)
Definition: fcleanup.cpp:377
#define ERR_SEQUENCE_BadData
Definition: flat2err.h:150
#define ERR_TPA_TpaSpansMissing
Definition: flat2err.h:593
#define ERR_ENTRY_LongSequence
Definition: flat2err.h:82
#define ERR_FORMAT_MissingContigFeature
Definition: flat2err.h:43
#define ERR_KEYWORD_ShouldNotBeTPA
Definition: flat2err.h:208
#define ERR_DIVISION_BadTSADivcode
Definition: flat2err.h:261
#define ERR_FORMAT_MissingSequenceData
Definition: flat2err.h:41
#define ERR_DIVISION_InvalidHTCKeyword
Definition: flat2err.h:254
#define ERR_KEYWORD_IllegalForCON
Definition: flat2err.h:210
#define ERR_DIVISION_MissingHTGKeywords
Definition: flat2err.h:249
#define ERR_QSCORE_FailedToParse
Definition: flat2err.h:577
#define ERR_ENTRY_LongHTGSSequence
Definition: flat2err.h:86
#define ERR_KEYWORD_MissingTSA
Definition: flat2err.h:216
#define ERR_DIVISION_BadTPADivcode
Definition: flat2err.h:257
#define ERR_REFERENCE_No_references
Definition: flat2err.h:289
#define ERR_KEYWORD_ShouldNotBeTLS
Definition: flat2err.h:218
#define ERR_ENTRY_GBBlock_not_Empty
Definition: flat2err.h:85
#define ERR_KEYWORD_HTGPlusENV
Definition: flat2err.h:217
#define ERR_DEFINITION_MissingTPA
Definition: flat2err.h:269
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_DEFINITION_MissingTLS
Definition: flat2err.h:273
#define ERR_KEYWORD_ESTSubstring
Definition: flat2err.h:204
#define ERR_KEYWORD_ConflictingKeywords
Definition: flat2err.h:207
#define ERR_DIVISION_ConDivLacksContig
Definition: flat2err.h:252
#define ERR_LOCATION_ContigHasNull
Definition: flat2err.h:397
#define ERR_SEGMENT_OnlyOneMember
Definition: flat2err.h:165
#define ERR_KEYWORD_ENV_NoMatchingQualifier
Definition: flat2err.h:214
#define ERR_KEYWORD_ShouldNotBeTSA
Definition: flat2err.h:215
#define ERR_KEYWORD_STSSubstring
Definition: flat2err.h:205
#define ERR_DIVISION_UnknownDivCode
Definition: flat2err.h:222
#define ERR_KEYWORD_MissingTLS
Definition: flat2err.h:219
#define ERR_DEFINITION_ShouldNotBeTSA
Definition: flat2err.h:270
#define ERR_SEGMENT_Rejected
Definition: flat2err.h:166
#define ERR_DIVISION_MissingHTCKeyword
Definition: flat2err.h:253
#define ERR_DIVISION_MappedtoCON
Definition: flat2err.h:248
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_FORMAT_ContigWithSequenceData
Definition: flat2err.h:42
#define ERR_KEYWORD_NoGeneExpressionKeywords
Definition: flat2err.h:213
#define ERR_DEFINITION_MissingTSA
Definition: flat2err.h:271
#define ERR_DEFINITION_ShouldNotBeTPA
Definition: flat2err.h:268
#define ERR_FORMAT_MissingEnd
Definition: flat2err.h:39
#define ERR_KEYWORD_MissingTPA
Definition: flat2err.h:209
#define ERR_DIVISION_ConDivInSegset
Definition: flat2err.h:251
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_DIVISION_Mismatch
Definition: flat2err.h:226
#define ERR_ORGANISM_NoOrganism
Definition: flat2err.h:184
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_DIVISION_HTCWrongMolType
Definition: flat2err.h:255
#define ERR_KEYWORD_ShouldNotBeCAGE
Definition: flat2err.h:211
#define ERR_DEFINITION_ShouldNotBeTLS
Definition: flat2err.h:272
#define ERR_TSA_UnexpectedPrimaryAccession
Definition: flat2err.h:609
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
std::list< CRef< objects::CSeq_feat > > TSeqFeatList
Definition: ftablock.h:56
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:61
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:62
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:89
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:346
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:753
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
static TDSICONV * conv
Definition: charconv.c:168
bool GetGenBankInstContig(const DataBlk &entry, CBioseq &bsp, ParserPtr pp)
Definition: gb_ascii.cpp:135
USING_SCOPE(objects)
static CRef< CGB_block > GetGBBlock(ParserPtr pp, const DataBlk &entry, CMolInfo &mol_info, CBioSource *bio_src)
Definition: gb_ascii.cpp:279
static string GetGenBankLineage(string_view sv)
Definition: gb_ascii.cpp:254
static void fta_get_str_user_field(char *line, const Char *tag, CUser_object &user_obj)
Definition: gb_ascii.cpp:914
void CheckFeatSeqLoc(TEntryList &seq_entries)
Definition: gb_ascii.cpp:2375
static CRef< CMolInfo > GetGenBankMolInfo(ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: gb_ascii.cpp:710
static void FindFeatSeqLoc(TEntryList &seq_entries, TSeqFeatList &feats)
Definition: gb_ascii.cpp:2321
static void FakeGenBankBioSources(const DataBlk &entry, CBioseq &bioseq)
Definition: gb_ascii.cpp:757
static void CheckContigEverywhere(IndexblkPtr ibp, Parser::ESource source)
Definition: gb_ascii.cpp:103
bool GenBankAsciiOrig(ParserPtr pp)
Definition: gb_ascii.cpp:1363
static void fta_get_user_object(CSeq_entry &seq_entry, const DataBlk &entry)
Definition: gb_ascii.cpp:962
bool GenBankAscii(ParserPtr pp)
Definition: gb_ascii.cpp:1784
static void GenBankGetDivision(char *division, Int4 div, const DataBlk &entry)
Definition: gb_ascii.cpp:1343
static void fta_get_user_field(char *line, const Char *tag, CUser_object &user_obj)
Definition: gb_ascii.cpp:840
static void SrchFeatSeqLoc(TSeqFeatList &feats, CSeq_annot::C_Data::TFtable &feat_table)
Definition: gb_ascii.cpp:2297
static CBioseq_set * GetParts(TEntryList &seq_entries)
Definition: gb_ascii.cpp:2352
static void fta_get_mga_user_object(TSeqdescList &descrs, char *offset, size_t len)
Definition: gb_ascii.cpp:1013
static char * GBDivOffset(const DataBlk &entry, Int4 div_shift)
Definition: gb_ascii.cpp:97
static bool GetGenBankInst(ParserPtr pp, const DataBlk &entry, unsigned char *dnaconv)
Definition: gb_ascii.cpp:215
static void xGenBankGetDivision(char *division, Int4 div, const string &locusText)
Definition: gb_ascii.cpp:1349
static void GetGenBankDescr(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: gb_ascii.cpp:1061
@ ParFlat_SOURCE
Definition: genbank.h:48
@ ParFlat_DBLINK
Definition: genbank.h:63
@ ParFlat_COMMENT
Definition: genbank.h:50
@ ParFlat_LOCUS
Definition: genbank.h:41
@ ParFlat_PROJECT
Definition: genbank.h:62
@ ParFlat_NCBI_GI
Definition: genbank.h:44
@ ParFlat_USER
Definition: genbank.h:58
@ ParFlat_PRIMARY
Definition: genbank.h:60
@ ParFlat_END
Definition: genbank.h:54
@ ParFlat_ORGANISM
Definition: genbank.h:66
@ ParFlat_KEYWORDS
Definition: genbank.h:46
@ ParFlat_DEFINITION
Definition: genbank.h:42
@ ParFlat_CONTIG
Definition: genbank.h:56
@ ParFlat_MGA
Definition: genbank.h:61
@ ParFlat_ORIGIN
Definition: genbank.h:53
#define ParFlat_COL_DATA
Definition: genbank.h:37
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
Definition: genref.cpp:2957
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eOrigin_synthetic
purely synthetic
Definition: BioSource_.hpp:134
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
const TAnnot & GetAnnot(void) const
Get the Annot member data.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
@ eClass_parts
parts for 2 or 3
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetTech(void)
Reset Tech data member.
Definition: MolInfo_.hpp:484
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
ETopology
topology of molecule
Definition: Seq_inst_.hpp:121
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
EStrand
strandedness in living organism
Definition: Seq_inst_.hpp:133
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_other
use Source.techexp
Definition: MolInfo_.hpp:148
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_REF_END
Definition: index.h:60
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:609
int CheckTPG(const string &str)
Definition: indx_blk.cpp:500
int CheckSTRAND(const string &str)
Definition: indx_blk.cpp:465
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:991
Int2 CheckDIV(const char *str)
Definition: indx_blk.cpp:530
int i
int len
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: loadfeat.cpp:4966
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: loadfeat.cpp:4660
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
int isspace(Uchar c)
Definition: ncbictype.hpp:69
std::list< SeqLoc > TSeqLocList
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2661
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Uint2 col_data)
Definition: ref.cpp:2426
DataBlkPtr chain
Definition: ftablock.h:341
CRef< objects::CSeq_entry > seq_entry
Definition: ftablock.h:343
Char acnum[200]
Definition: ftablock.h:166
CRef< objects::CPatent_seq_id > psip
Definition: ftablock.h:190
Char division[4]
Definition: ftablock.h:171
bool is_mga
Definition: ftablock.h:199
bool tsa_allowed
Definition: ftablock.h:211
Int4 wgs_and_gi
Definition: ftablock.h:231
Int2 htg
Definition: ftablock.h:196
bool is_tls
Definition: ftablock.h:208
Int2 vernum
Definition: ftablock.h:167
bool is_tpa
Definition: ftablock.h:206
TKeywordList keywords
Definition: ftablock.h:240
bool is_prot
Definition: ftablock.h:222
bool is_wgs
Definition: ftablock.h:205
bool origin
Definition: ftablock.h:201
bool is_contig
Definition: ftablock.h:197
bool STS
Definition: ftablock.h:193
bool is_pat
Definition: ftablock.h:202
bool HTC
Definition: ftablock.h:195
bool drop
Definition: ftablock.h:182
bool experimental
Definition: ftablock.h:247
size_t bases
Definition: ftablock.h:172
bool inferential
Definition: ftablock.h:245
Uint2 segtotal
Definition: ftablock.h:175
bool is_tsa
Definition: ftablock.h:207
bool EST
Definition: ftablock.h:192
size_t len
Definition: ftablock.h:184
GapFeatsPtr gaps
Definition: ftablock.h:214
string wgssec
Definition: ftablock.h:236
size_t offset
Definition: ftablock.h:168
bool specialist_db
Definition: ftablock.h:243
Uint2 segnum
Definition: ftablock.h:173
Char locusname[200]
Definition: ftablock.h:170
bool env_sample_qual
Definition: ftablock.h:219
size_t qslength
Definition: ftablock.h:230
LocusCont lc
Definition: ftablock.h:212
bool GSS
Definition: ftablock.h:194
Int4 molecule
Definition: ftablock.h:109
Int4 strand
Definition: ftablock.h:108
Int4 topology
Definition: ftablock.h:110
Int4 date
Definition: ftablock.h:112
Int4 bp
Definition: ftablock.h:107
Int4 div
Definition: ftablock.h:111
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
optional< string > buf
char *(* ff_get_qscore)(const char *accession, Int2 v)
TEntryList entries
CScope & GetScope()
bool GetGenomeInfo(CBioSource &bsp, string_view bptr)
Definition: utilfeat.cpp:225
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:435
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1575
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1604
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:759
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1137
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1589
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Definition: utilfun.cpp:644
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1278
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1264
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1233
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1293
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:964
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1196
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:995
void fta_remove_mag_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1322
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1564
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1103
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1308
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1021
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1448
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1466
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)
Definition: xutils.cpp:91
Modified on Thu May 23 12:33:53 2024 by modify_doxy.py rev. 669887