NCBI C++ ToolKit
gb_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gb_ascii.cpp 99284 2023-03-06 16:28:57Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: gb_ascii.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Parse gb from blocks to asn.
32  * Build GenBank format entry block.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <objects/seq/Seq_inst.hpp>
43 #include <objects/seq/Bioseq.hpp>
45 #include <serial/objostr.hpp>
46 #include <serial/serial.hpp>
47 #include <objects/seq/Seq_ext.hpp>
53 #include <objmgr/scope.hpp>
59 #include <objects/seq/Pubdesc.hpp>
60 #include <objects/seq/MolInfo.hpp>
61 
62 #include "index.h"
63 #include "genbank.h"
64 
67 #include "ftanet.h"
68 
69 #include "ftaerr.hpp"
70 #include "asci_blk.h"
71 #include "indx_blk.h"
72 #include "utilref.h"
73 #include "utilfeat.h"
74 #include "loadfeat.h"
75 #include "gb_ascii.h"
76 #include "add.h"
77 #include "nucprot.h"
78 #include "fta_qscore.h"
79 #include "citation.h"
80 #include "fcleanup.h"
81 #include "utilfun.h"
82 #include "entry.h"
83 #include "ref.h"
84 #include "xgbparint.h"
85 #include "xutils.h"
86 
87 
88 #ifdef THIS_FILE
89 # undef THIS_FILE
90 #endif
91 #define THIS_FILE "gb_ascii.cpp"
92 
95 
96 /**********************************************************/
97 static char* GBDivOffset(const DataBlk& entry, Int4 div_shift)
98 {
99  return (entry.mOffset + div_shift);
100 }
101 
102 /**********************************************************/
/* NOTE(review): the signature of this function (listing line 103) is absent
 * from this view, so only the body is shown. The body reads and updates the
 * index block `ibp` -- presumably a parameter; confirm against the repository.
 *
 * Cross-checks the CON division code of the entry against its is_contig,
 * origin, is_mga and segnum indicators. Each inconsistent combination is
 * reported through ErrPostEx(); the combinations whose branch sets
 * ibp->drop = true mark the entry to be dropped.
 */
104 {
    /* True when the stored division code equals "CON", ignoring case. */
105  bool condiv = (NStr::CompareNocase(ibp->division, "CON") == 0);
106 
    /* A CON record with a non-zero segment number is flagged and dropped. */
107  if (condiv && ibp->segnum != 0) {
108  ErrPostEx(SEV_ERROR, ERR_DIVISION_ConDivInSegset, "Use of the CON division is not allowed for members of segmented set : %s|%s. Entry skipped.", ibp->locusname, ibp->acnum);
109  ibp->drop = true;
110  return;
111  }
112 
    /* Non-CON record with neither CONTIG nor sequence data (and not MGA). */
113  if (! condiv && ibp->is_contig == false && ibp->origin == false &&
114  ibp->is_mga == false) {
115  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingSequenceData, "Required sequence data is absent. Entry dropped.");
116  ibp->drop = true;
117  } else if (! condiv && ibp->is_contig && ibp->origin == false) {
    /* Only a warning is posted here; nothing is modified in this branch. */
118  ErrPostEx(SEV_WARNING, ERR_DIVISION_MappedtoCON, "Division [%s] mapped to CON based on the existence of CONTIG line.", ibp->division);
119  } else if (ibp->is_contig && ibp->origin) {
    /* NOTE(review): listing line 120 is absent here. The braces below only
     * balance if that line opened a nested `if (...) {` that selects between
     * the INFO report and the REJECT report; its condition cannot be
     * determined from this view. */
121  ErrPostEx(SEV_INFO, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data are both present. Ignoring sequence data.");
122  } else {
123  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data may not both be present in a sequence record.");
124  ibp->drop = true;
125  }
126  } else if (condiv && ibp->is_contig == false && ibp->origin == false) {
    /* CON record that has neither CONTIG data nor sequence data. */
127  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingContigFeature, "No CONTIG data in GenBank format file, entry dropped.");
128  ibp->drop = true;
129  } else if (condiv && ibp->is_contig == false && ibp->origin) {
    /* CON record that carries sequence data only: warned, not dropped. */
130  ErrPostEx(SEV_WARNING, ERR_DIVISION_ConDivLacksContig, "Division is CON, but CONTIG data have not been found.");
131  }
132 }
133 
134 /**********************************************************/
/* Converts the CONTIG block of `entry` into the extension of `bsp`.
 *
 * The block text past column ParFlat_COL_DATA is copied, stripped of
 * newlines, tabs and blanks, checked for empty join() components, parsed
 * with xgbparseint_ver() and passed through fta_fix_seq_loc_id(). A "mix"
 * location is turned into a delta extension (repr set to eRepr_delta); any
 * other location resets the extension.
 *
 * Returns true when there is no CONTIG block (or it has no text), when the
 * parsed location is empty, and after normal processing. Returns false when
 * the block has no data past ParFlat_COL_DATA or when an empty join()
 * component is found.
 *
 * Side effect: pp->buf is freed and reset to nullptr before parsing.
 */
135 bool GetGenBankInstContig(const DataBlk& entry, CBioseq& bsp, ParserPtr pp)
136 {
137  DataBlkPtr dbp;
138 
139  char* p;
140  char* q;
141  char* r;
142 
143  bool locmap;
144 
145  bool allow_crossdb_featloc;
146  Int4 i;
147  int numerr;
148 
149  dbp = TrackNodeType(entry, ParFlat_CONTIG);
150  if (! dbp || ! dbp->mOffset)
151  return true;
152 
    /* Number of characters that follow the keyword column. */
153  i = static_cast<Int4>(dbp->len) - ParFlat_COL_DATA;
154  if (i <= 0)
155  return false;
156 
157  p = MemNew(i + 1);
158  StringNCpy(p, &dbp->mOffset[ParFlat_COL_DATA], i);
    /* Terminates the copy one character early, i.e. the last copied
     * character is discarded (presumably the trailing newline -- confirm). */
159  p[i - 1] = '\0';
    /* Compact the copy in place: drop every newline, tab and blank. */
160  for (q = p, r = p; *q != '\0'; q++)
161  if (*q != '\n' && *q != '\t' && *q != ' ')
162  *r++ = *q;
163  *r = '\0';
164 
    /* Look for ",,", "(," or ",)" -- each denotes an empty list component. */
165  for (q = p; *q != '\0'; q++)
166  if ((q[0] == ',' && q[1] == ',') || (q[0] == '(' && q[1] == ',') ||
167  (q[0] == ',' && q[1] == ')'))
168  break;
169  if (*q != '\0') {
170  ErrPostEx(SEV_REJECT, ERR_LOCATION_ContigHasNull, "The join() statement for this record's contig line contains one or more comma-delimited components which are null.");
171  MemFree(p);
172  return false;
173  }
174 
175  if (pp->buf)
176  MemFree(pp->buf);
177  pp->buf = nullptr;
178 
    /* locmap and numerr are filled by the parser but not examined below. */
179  CRef<CSeq_loc> loc = xgbparseint_ver(p, locmap, numerr, bsp.GetId(), pp->accver);
180  if (loc.Empty()) {
181  MemFree(p);
182  return true;
183  }
184 
    /* Cross-database locations are allowed while the ids are being fixed;
     * the caller's setting is saved here. */
185  allow_crossdb_featloc = pp->allow_crossdb_featloc;
186  pp->allow_crossdb_featloc = true;
187 
188  TSeqLocList locs;
189  locs.push_back(loc);
190  i = fta_fix_seq_loc_id(locs, pp, p, nullptr, true);
191 
192  if (i > 999)
    /* NOTE(review): listing line 193 (the statement controlled by the `if`
     * above) is absent from this view. As shown, the restore of
     * pp->allow_crossdb_featloc below would become the body of that `if`,
     * which is presumably not what the real file does -- confirm against
     * the repository. */
194 
195  pp->allow_crossdb_featloc = allow_crossdb_featloc;
196 
197  if (loc->IsMix()) {
198  XGappedSeqLocsToDeltaSeqs(loc->GetMix(), bsp.SetInst().SetExt().SetDelta().Set());
199  bsp.SetInst().SetRepr(CSeq_inst::eRepr_delta);
200  } else
201  bsp.SetInst().ResetExt();
202 
203  MemFree(p);
204  return true;
205 }
206 
207 /**********************************************************
208  *
209  * bool GetGenBankInst(pp, entry, dnaconv):
210  *
211  * Fills in Seq-inst for an entry. Assumes Bioseq
212  * already allocated.
213  *
214  * 3-30-93
215  *
216  **********************************************************/
/* Fills the Seq-inst of the Bioseq held by the entry's EntryBlk.
 *
 * pp      - parser state; the current index block is pp->entrylist[pp->curindx].
 * entry   - entry block; mOffset is the entry text, mpData the EntryBlk.
 * dnaconv - conversion table forwarded unchanged to GetSeqData().
 *
 * Returns false when GetSeqData() fails, or when the entry is a contig and
 * GetGenBankInstContig() fails; true otherwise.
 */
217 static bool GetGenBankInst(ParserPtr pp, const DataBlk& entry, unsigned char* dnaconv)
218 {
219  EntryBlkPtr ebp;
220  Int2 topology;
221  Int2 strand;
222  char* topstr;
223 
224  char* bptr = entry.mOffset;
225  IndexblkPtr ibp = pp->entrylist[pp->curindx];
226  LocusContPtr lcp = &ibp->lc;
227 
    /* Position of the topology token, using the stored LOCUS column offset. */
228  topstr = bptr + lcp->topology;
229 
230  ebp = static_cast<EntryBlk*>(entry.mpData);
231  CBioseq& bioseq = ebp->seq_entry->SetSeq();
232 
233  CSeq_inst& inst = bioseq.SetInst();
    /* NOTE(review): listing line 234 is absent from this view; whatever it
     * did to `inst` is not shown here. */
235 
236  /* get linear, circular, tandem topology, blank is linear which = 1
237  */
238  topology = CheckTPG(topstr);
    /* Only values above 1 are stored; 1 (linear) is left unset. */
239  if (topology > 1)
240  inst.SetTopology(static_cast<CSeq_inst::ETopology>(topology));
241 
    /* A negative strand offset means "no strand column": a blank is checked
     * instead. */
242  strand = CheckSTRAND((lcp->strand >= 0) ? bptr + lcp->strand : " ");
243  if (strand > 0)
244  inst.SetStrand(static_cast<CSeq_inst::EStrand>(strand));
245 
    /* Protein entries use the IUPAC amino-acid code, all others IUPAC
     * nucleotide. */
246  if (GetSeqData(pp, entry, bioseq, ParFlat_ORIGIN, dnaconv, (ibp->is_prot ? eSeq_code_type_iupacaa : eSeq_code_type_iupacna)) == false)
247  return false;
248 
249  if (ibp->is_contig && ! GetGenBankInstContig(entry, bioseq, pp))
250  return false;
251 
252  return true;
253 }
254 
255 /**********************************************************/
/* Returns a heap string holding lineage text with trailing blanks, tabs,
 * newlines, periods and semicolons removed, or nullptr when start >= end,
 * when no text was obtained, or when nothing remains after trimming.
 * The caller releases the result with MemFree().
 */
256 static char* GetGenBankLineage(char* start, char* end)
257 {
258  char* p;
259  char* str;
260 
261  if (start >= end)
262  return nullptr;
263 
    /* NOTE(review): listing line 264 is absent from this view. It presumably
     * assigns `str` from the [start, end) range; as shown, `str` is read
     * below without ever being set -- confirm against the repository. */
265  if (! str)
266  return nullptr;
267 
    /* Advance p to the terminating NUL. */
268  for (p = str; *p != '\0';)
269  p++;
270  if (p == str) {
271  MemFree(str);
272  return nullptr;
273  }
    /* Walk backwards over trailing separators; on the first other character
     * step one forward so that p marks the new end of the string. */
274  for (p--;; p--) {
275  if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '.' && *p != ';') {
276  p++;
277  break;
278  }
279  if (p == str)
280  break;
281  }
    /* p == str here means every character was a separator. */
282  if (p == str) {
283  MemFree(str);
284  return nullptr;
285  }
286  *p = '\0';
287  return (str);
288 }
289 
290 /**********************************************************
291  *
292  * static GBBlockPtr GetGBBlock(pp, entry, mfp, biosp):
293  *
294  * 4-7-93
295  *
296  **********************************************************/
/* Builds the CGB_block (source text, keywords, origin, division, extra
 * accessions) for the current entry and validates its keywords and division
 * against the record-type flags of the index block.
 *
 * pp       - parser state; the index block is pp->entrylist[pp->curindx].
 * entry    - entry block searched for SOURCE, KEYWORDS and ORIGIN data.
 * mol_info - read and updated: its tech value may be set or reset here.
 * bio_src  - may be nullptr (it is tested before every use below).
 *
 * Returns the populated block, or an empty CRef (`ret` is left unset) on
 * every rejection path.
 *
 * Side effects on the index block visible here: wgssec[0] is cleared,
 * keywords are moved out, division is overwritten with the parsed code, and
 * the EST/STS/GSS/HTC flags may be raised.
 *
 * NOTE(review): listing lines 340, 629, 646, 655 and 698 are absent from
 * this view; each gap is marked where it occurs.
 */
297 static CRef<CGB_block> GetGBBlock(ParserPtr pp, const DataBlk& entry, CMolInfo& mol_info, CBioSource* bio_src)
298 {
299  LocusContPtr lcp;
300 
301  CRef<CGB_block> gbb(new CGB_block),
302  ret;
303 
304  IndexblkPtr ibp;
305  char* bptr;
306  char* eptr;
307  char* ptr;
308  char* str;
309  Char msg[4];
310  char* kw;
311  char* kwp;
312  size_t len;
313  Int2 div;
314 
315  bool if_cds;
316  bool pat_ref = false;
317  bool est_kwd = false;
318  bool sts_kwd = false;
319  bool gss_kwd = false;
320  bool htc_kwd = false;
321  bool fli_kwd = false;
322  bool wgs_kwd = false;
323  bool tpa_kwd = false;
324  bool tsa_kwd = false;
325  bool tls_kwd = false;
326  bool env_kwd = false;
327  bool mga_kwd = false;
328 
329  bool cancelled;
330  bool drop;
331 
332  char* tempdiv;
333  char* p;
334  Int4 i;
335 
336  ibp = pp->entrylist[pp->curindx];
337  ibp->wgssec[0] = '\0';
338 
    /* ---- SOURCE text ---- */
339  bptr = xSrchNodeType(entry, ParFlat_SOURCE, &len);
    /* NOTE(review): listing line 340 is absent from this view. It presumably
     * derives `str` from bptr/len; as shown, `str` is tested below without
     * being assigned. */
341  if (str) {
    /* A final '.' is removed only when it is directly preceded by another
     * '.', i.e. a doubled period at the very end. */
342  p = StringRChr(str, '.');
343  if (p && p > str && p[1] == '\0' && *(p - 1) == '.')
344  *p = '\0';
345 
346  gbb->SetSource(str);
347  MemFree(str);
348  }
349 
    /* ---- KEYWORDS: taken from the index block when already collected there,
     * otherwise parsed from the entry ---- */
350  if (! ibp->keywords.empty()) {
351  gbb->SetKeywords().swap(ibp->keywords);
352  ibp->keywords.clear();
353  } else
354  GetSequenceOfKeywords(entry, ParFlat_KEYWORDS, ParFlat_COL_DATA, gbb->SetKeywords());
355 
    /* Record-type specific keyword checks; a failed check returns the empty
     * `ret`. */
356  if (ibp->is_mga && ! fta_check_mga_keywords(mol_info, gbb->GetKeywords())) {
357  return ret;
358  }
359 
360  if (ibp->is_tpa && ! fta_tpa_keywords_check(gbb->GetKeywords())) {
361  return ret;
362  }
363 
364  if (ibp->is_tsa && ! fta_tsa_keywords_check(gbb->GetKeywords(), pp->source)) {
365  return ret;
366  }
367 
368  if (ibp->is_tls && ! fta_tls_keywords_check(gbb->GetKeywords(), pp->source)) {
369  return ret;
370  }
371 
    /* Each keyword is passed to fta_keywords_check(), which receives the
     * addresses of the *_kwd flags declared above. */
372  for (const string& key : gbb->GetKeywords()) {
373  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
374  }
375 
376  if (ibp->env_sample_qual == false && env_kwd) {
377  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
378  return ret;
379  }
380 
    /* ---- ORIGIN: keep the text that follows the keyword on its first line
     * ---- */
381  bptr = xSrchNodeType(entry, ParFlat_ORIGIN, &len);
382  eptr = bptr + len;
383  ptr = SrchTheChar(bptr, eptr, '\n');
384  if (ptr) {
385  eptr = ptr;
    /* Skips 6 characters, the length of the word "ORIGIN". */
386  bptr += 6;
387 
388  if (eptr != bptr) {
389  while (isspace(*bptr) != 0)
390  bptr++;
391  len = eptr - bptr;
392  if (len > 0) {
393  gbb->SetOrigin(string(bptr, eptr));
394  }
395  }
396  }
397 
398  lcp = &ibp->lc;
399 
    /* ---- Division code from the LOCUS line ---- */
400  bptr = GBDivOffset(entry, lcp->div);
401 
402  if (*bptr != ' ') {
403  if_cds = check_cds(entry, pp->format);
404  div = CheckDIV(bptr);
405  if (div != -1) {
406  string div_str(bptr, bptr + 3);
407  gbb->SetDiv(div_str);
408 
409  if (div == 16) /* "ORG" replaced by "UNA" */
410  gbb->SetDiv("UNA");
411 
412  /* preserve the division code for later use
413  */
414  const char* p_div = gbb->GetDiv().c_str();
415  StringCpy(ibp->division, p_div);
416 
417  if (ibp->psip.NotEmpty())
418  pat_ref = true;
419 
420  if (ibp->is_tpa &&
421  (StringEqu(p_div, "EST") || StringEqu(p_div, "GSS") ||
422  StringEqu(p_div, "PAT") || StringEqu(p_div, "HTG"))) {
423  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p_div);
424  return ret;
425  }
426 
427  if (ibp->is_tsa && ! StringEqu(p_div, "TSA")) {
428  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p_div);
429  return ret;
430  }
431 
432  cancelled = IsCancelled(gbb->GetKeywords());
433 
434  if (StringEqu(p_div, "HTG")) {
435  if (! HasHtg(gbb->GetKeywords())) {
436  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
437  return ret;
438  }
439  }
440 
    /* The division is copied first because ResetDiv() may clear it before
     * CheckHTGDivision() needs it. */
441  tempdiv = StringSave(gbb->GetDiv().c_str());
442 
443  if (fta_check_htg_kwds(gbb->SetKeywords(), pp->entrylist[pp->curindx], mol_info))
444  gbb->ResetDiv();
445 
446  DefVsHTGKeywords(mol_info.GetTech(), entry, ParFlat_DEFINITION, ParFlat_ORIGIN, cancelled);
447 
448  CheckHTGDivision(tempdiv, mol_info.GetTech());
449  if (tempdiv)
450  MemFree(tempdiv);
451 
    /* `i` counts how many special keyword categories this record uses. */
452  i = 0;
453  if (est_kwd)
454  i++;
455  if (sts_kwd)
456  i++;
457  if (gss_kwd)
458  i++;
459  if (ibp->htg > 0)
460  i++;
461  if (htc_kwd)
462  i++;
463  if (fli_kwd)
464  i++;
465  if (wgs_kwd)
466  i++;
467  if (env_kwd)
468  i++;
469  if (mga_kwd) {
470  if (ibp->is_mga == false) {
471  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
472  return ret;
473  }
474  i++;
475  } else if (ibp->is_mga) {
    /* NOTE(review): the message says "Entry dropped." but, unlike the
     * TPA/TSA/TLS branches below, this branch does not return -- confirm
     * whether that is intended. */
476  ErrPostEx(SEV_ERROR, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
477  }
478  if (tpa_kwd) {
479  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
480  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
481  return ret;
482  }
483  i++;
484  } else if (ibp->is_tpa) {
485  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
486  return ret;
487  }
488  if (tsa_kwd) {
489  if (ibp->is_tsa == false) {
490  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
491  return ret;
492  }
493  i++;
494  } else if (ibp->is_tsa) {
495  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
496  return ret;
497  }
498  if (tls_kwd) {
499  if (ibp->is_tls == false) {
500  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
501  return ret;
502  }
503  i++;
504  } else if (ibp->is_tls) {
505  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
506  return ret;
507  }
    /* More than one category: a few specific pairs are tolerated (the first
     * with a warning, the second group silently); everything else that
     * reaches the innermost test is rejected as conflicting. */
508  if (i > 1) {
509  if (i == 2 && ibp->htg > 0 && env_kwd)
510  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
511  else if ((i == 2 && wgs_kwd && tpa_kwd) ||
512  (i == 2 && tsa_kwd && tpa_kwd) ||
513  (i == 2 && pp->source == Parser::ESource::DDBJ &&
514  env_kwd && tpa_kwd)) {
515  } else if (i != 2 || env_kwd == false ||
516  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
517  if (i != 2 || pp->source != Parser::ESource::DDBJ ||
518  ibp->is_tsa == false || env_kwd == false) {
519  if (pp->source != Parser::ESource::DDBJ || ibp->is_wgs == false ||
520  (env_kwd == false && tpa_kwd == false)) {
521  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
522  return ret;
523  }
524  }
525  }
526  }
527 
    /* WGS is taken out of the count before the CON check below. */
528  if (wgs_kwd)
529  i--;
530 
531  if (ibp->is_contig && i > 0 &&
532  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
533  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
534  return ret;
535  }
536 
537  CMolInfo::TTech thtg = mol_info.GetTech();
538  if (thtg == CMolInfo::eTech_htgs_0 || thtg == CMolInfo::eTech_htgs_1 ||
539  thtg == CMolInfo::eTech_htgs_2 || thtg == CMolInfo::eTech_htgs_3) {
540  RemoveHtgPhase(gbb->SetKeywords());
541  }
542 
    /* Warn when the raw KEYWORDS text contains "EST"/"STS" as a substring
     * although no official keyword of that kind was recognised. */
543  bptr = xSrchNodeType(entry, ParFlat_KEYWORDS, &len);
544  if (bptr) {
545  kw = GetBlkDataReplaceNewLine(bptr, bptr + len, ParFlat_COL_DATA);
546 
547  kwp = StringStr(kw, "EST");
548  if (kwp && est_kwd == false) {
549  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw);
550  }
551  kwp = StringStr(kw, "STS");
552  if (kwp && sts_kwd == false) {
553  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw);
554  }
555  MemFree(kw);
556  }
557 
    /* Non-contig records: check_div() receives local copies of the division
     * and tech (tech by address, plus `drop`); both are written back to
     * gbb / mol_info afterwards. */
558  if (! ibp->is_contig) {
559  drop = false;
560  CMolInfo::TTech tech = mol_info.GetTech();
561  string p_div;
562  if (gbb->IsSetDiv())
563  p_div = gbb->GetDiv();
564 
565  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, p_div, &tech, ibp->bases, pp->source, drop);
566 
567  if (tech != CMolInfo::eTech_unknown)
568  mol_info.SetTech(tech);
569  else
570  mol_info.ResetTech();
571 
572  if (! p_div.empty())
573  gbb->SetDiv(p_div);
574  else
575  gbb->ResetDiv();
576 
577  if (drop) {
578  return ret;
579  }
580  } else if (gbb->GetDiv() == "CON") {
581  gbb->ResetDiv();
582  }
583  } else if (pp->mode != Parser::EMode::Relaxed) {
    /* CheckDIV() returned -1: reject unless the parser runs in Relaxed
     * mode. */
584  MemCpy(msg, bptr, 3);
585  msg[3] = '\0';
586  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in GenBank flatfile. Record rejected.", msg);
587  return ret;
588  }
589 
    /* Old-format accession starting with 'T' in a division other than EST:
     * tech is set to EST and the division is cleared. */
590  if (IsNewAccessFormat(ibp->acnum) == 0 && *ibp->acnum == 'T' &&
591  gbb->IsSetDiv() && gbb->GetDiv() != "EST") {
592  ErrPostStr(SEV_INFO, ERR_DIVISION_MappedtoEST, "Leading T in accession number.");
593 
594  mol_info.SetTech(CMolInfo::eTech_est);
595  gbb->ResetDiv();
596  }
597  }
598 
    /* ---- HTC division and HTC keyword must come together ---- */
599  bool is_htc_div = gbb->IsSetDiv() && gbb->GetDiv() == "HTC",
600  has_htc = HasHtc(gbb->GetKeywords());
601 
602  if (is_htc_div && ! has_htc) {
603  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
604  return ret;
605  }
606 
607  if (! is_htc_div && has_htc) {
608  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
609  return ret;
610  }
611 
612  if (is_htc_div) {
    /* Skip an optional 'm'/'r', "pre-" or "transcribed " prefix of the
     * molecule token; what follows must start with "RNA". */
613  bptr = entry.mOffset;
614  p = bptr + lcp->molecule;
615  if (*p == 'm' || *p == 'r')
616  p++;
617  else if (StringEquN(p, "pre-", 4))
618  p += 4;
619  else if (StringEquN(p, "transcribed ", 12))
620  p += 12;
621 
622  if (! StringEquN(p, "RNA", 3)) {
623  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
624  return ret;
625  }
626  }
627 
628  if (fli_kwd)
    /* NOTE(review): listing line 629 (the statement controlled by the `if`
     * above) is absent from this view. As shown, the following
     * `if (gbb->IsSetDiv())` chain would become its body -- confirm against
     * the repository. */
630 
631  /* will be used in flat file database
632  */
    /* EST/STS/GSS/HTC divisions are converted into index-block flags and
     * the division itself is then cleared. */
633  if (gbb->IsSetDiv()) {
634  if (gbb->GetDiv() == "EST") {
635  ibp->EST = true;
636  mol_info.SetTech(CMolInfo::eTech_est);
637 
638  gbb->ResetDiv();
639  } else if (gbb->GetDiv() == "STS") {
640  ibp->STS = true;
641  mol_info.SetTech(CMolInfo::eTech_sts);
642 
643  gbb->ResetDiv();
644  } else if (gbb->GetDiv() == "GSS") {
645  ibp->GSS = true;
    /* NOTE(review): listing line 646 is absent from this view; the sibling
     * branches set a tech value at this position. */
647 
648  gbb->ResetDiv();
649  } else if (gbb->GetDiv() == "HTC") {
650  ibp->HTC = true;
651  mol_info.SetTech(CMolInfo::eTech_htc);
652 
653  gbb->ResetDiv();
654  } else if (gbb->GetDiv() == "SYN" && bio_src && bio_src->IsSetOrigin() &&
    /* NOTE(review): listing line 655 (the rest of this condition and its
     * opening brace) is absent from this view. */
656  gbb->ResetDiv();
657  }
658  } else if (mol_info.IsSetTech()) {
    /* No division: derive the same flags from the tech value instead. */
659  if (mol_info.GetTech() == CMolInfo::eTech_est)
660  ibp->EST = true;
661  if (mol_info.GetTech() == CMolInfo::eTech_sts)
662  ibp->STS = true;
663  if (mol_info.GetTech() == CMolInfo::eTech_survey)
664  ibp->GSS = true;
665  if (mol_info.GetTech() == CMolInfo::eTech_htc)
666  ibp->HTC = true;
667  }
668 
    /* ---- Remove keywords already represented by tech / record type /
     * source qualifiers ---- */
669  if (mol_info.IsSetTech())
670  fta_remove_keywords(mol_info.GetTech(), gbb->SetKeywords());
671 
672  if (ibp->is_tpa)
673  fta_remove_tpa_keywords(gbb->SetKeywords());
674 
675  if (ibp->is_tsa)
676  fta_remove_tsa_keywords(gbb->SetKeywords(), pp->source);
677 
678  if (ibp->is_tls)
679  fta_remove_tls_keywords(gbb->SetKeywords(), pp->source);
680 
681  if (bio_src) {
682  if (bio_src->IsSetSubtype()) {
683  for (const auto& subtype : bio_src->GetSubtype()) {
684  if (subtype->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
685  fta_remove_env_keywords(gbb->SetKeywords());
686  break;
687  }
688  }
689  }
690  if (bio_src->IsSetOrg()) {
691  const COrg_ref& org_ref = bio_src->GetOrg();
692  if (org_ref.IsSetOrgname() && org_ref.GetOrgname().IsSetMod()) {
693  for (const auto& mod : org_ref.GetOrgname().GetMod()) {
694  if (! mod->IsSetSubtype())
695  continue;
696 
697  COrgMod::TSubtype stype = mod->GetSubtype();
    /* NOTE(review): listing line 698 is absent from this view; it presumably
     * opens an `if` on `stype` that guards the two statements below (the
     * closing brace after `break;` has no visible opener). */
699  fta_remove_mag_keywords(gbb->SetKeywords());
700  break;
701  }
702  }
703  }
704  }
705  }
706 
    /* The GenBank division is dropped when the organism already carries a
     * division: always for DDBJ, otherwise only when both values are equal. */
707  if (pp->source == Parser::ESource::DDBJ && gbb->IsSetDiv() && bio_src &&
708  bio_src->IsSetOrg() && bio_src->GetOrg().IsSetOrgname() &&
709  bio_src->GetOrg().GetOrgname().IsSetDiv()) {
710  gbb->ResetDiv();
711  } else if (gbb->IsSetDiv() &&
712  bio_src &&
713  bio_src->IsSetOrg() &&
714  bio_src->GetOrg().IsSetOrgname() &&
715  bio_src->GetOrg().GetOrgname().IsSetDiv() &&
716  bio_src->GetOrg().GetOrgname().GetDiv() == gbb->GetDiv()) {
717  gbb->ResetDiv();
718  }
719 
720  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, gbb->SetExtra_accessions());
    /* Success: ownership of the block moves from `gbb` to `ret`. */
721  ret.Reset(gbb.Release());
722 
723  return ret;
724 }
725 
726 /**********************************************************
727  *
728  * static MolInfoPtr GetGenBankMolInfo(pp, entry, orp):
729  *
730  * Data from :
731  * LOCUS ... column 37, or column 53 if "EST"
732  *
733  **********************************************************/
734 static CRef<CMolInfo> GetGenBankMolInfo(ParserPtr pp, const DataBlk& entry, const COrg_ref* org_ref)
735 {
736  IndexblkPtr ibp;
737  char* bptr;
738  char* molstr = nullptr;
739 
740  CRef<CMolInfo> mol_info(new CMolInfo);
741 
742  bptr = entry.mOffset;
743  ibp = pp->entrylist[pp->curindx];
744 
745  molstr = bptr + ibp->lc.molecule;
746 
747  bptr = GBDivOffset(entry, ibp->lc.div);
748 
749  if (StringEquN(bptr, "EST", 3))
750  mol_info->SetTech(CMolInfo::eTech_est);
751  else if (StringEquN(bptr, "STS", 3))
752  mol_info->SetTech(CMolInfo::eTech_sts);
753  else if (StringEquN(bptr, "GSS", 3))
754  mol_info->SetTech(CMolInfo::eTech_survey);
755  else if (StringEquN(bptr, "HTG", 3))
756  mol_info->SetTech(CMolInfo::eTech_htgs_1);
757  else if (ibp->is_wgs) {
758  if (ibp->is_tsa)
759  mol_info->SetTech(CMolInfo::eTech_tsa);
760  else if (ibp->is_tls)
761  mol_info->SetTech(CMolInfo::eTech_targeted);
762  else
763  mol_info->SetTech(CMolInfo::eTech_wgs);
764  } else if (ibp->is_tsa)
765  mol_info->SetTech(CMolInfo::eTech_tsa);
766  else if (ibp->is_tls)
767  mol_info->SetTech(CMolInfo::eTech_targeted);
768  else if (ibp->is_mga) {
769  mol_info->SetTech(CMolInfo::eTech_other);
770  mol_info->SetTechexp("cage");
771  }
772 
773  GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), molstr, pp, entry, org_ref);
774  if (mol_info->GetBiomol() == CMolInfo::eBiomol_unknown) // not set
775  mol_info->ResetBiomol();
776 
777  return mol_info;
778 }
779 
780 /**********************************************************/
/* Builds a CBioSource from entry text (taxname, optional lineage) and
 * appends it to the descriptors of `bioseq` as a source Seqdesc.
 *
 * The text buffer is temporarily NUL-terminated at `end` (the original
 * character is kept in `ch`) and, inside the loops, at line ends; every
 * temporary terminator is restored before the function leaves.
 */
781 static void FakeGenBankBioSources(const DataBlk& entry, CBioseq& bioseq)
782 {
783  char* bptr;
784  char* end;
785  char* ptr;
786 
787  Char ch;
788 
789  size_t len = 0;
    /* NOTE(review): listing line 790 is absent from this view. It presumably
     * locates the block in `entry` and sets bptr/len; as shown, `bptr` is
     * read below without being assigned. */
791 
792  if (! bptr) {
793  ErrPostStr(SEV_WARNING, ERR_ORGANISM_NoOrganism, "No Organism data in genbank format file");
794  return;
795  }
796 
    /* Temporarily terminate the block so that string functions stop at its
     * end. */
797  end = bptr + len;
798  ch = *end;
799  *end = '\0';
800 
801  CRef<CBioSource> bio_src(new CBioSource);
802  bptr += ParFlat_COL_DATA;
803 
    /* When a genome value other than 9 was recognised, skip the first word
     * and the blanks after it. */
804  if (GetGenomeInfo(*bio_src, bptr) && bio_src->GetGenome() != 9) /* ! Plasmid */
805  {
806  while (*bptr != ' ' && *bptr != '\0')
807  bptr++;
808  while (*bptr == ' ')
809  bptr++;
810  }
811 
812  ptr = StringChr(bptr, '\n');
813  if (! ptr) {
814  *end = ch;
815  return;
816  }
817 
818  COrg_ref& org_ref = bio_src->SetOrg();
819 
    /* The rest of the first line becomes the taxname. */
820  *ptr = '\0';
821  org_ref.SetTaxname(bptr);
822  *ptr = '\n';
823 
    /* Append continuation lines to the taxname. The loop stops at a line
     * that does not begin with the expected prefix, at a line containing
     * ';', or when no further newline follows the current line. */
824  for (;;) {
825  bptr = ptr + 1;
    /* NOTE(review): the literal below is compared over ParFlat_COL_DATA
     * characters but shows a single blank; runs of blanks appear to be
     * collapsed in this view -- confirm the literal against the repository. */
826  if (! StringEquN(bptr, " ", ParFlat_COL_DATA))
827  break;
828 
829  ptr = StringChr(bptr, '\n');
830  if (! ptr)
831  break;
832 
833  *ptr = '\0';
834  if (StringChr(bptr, ';') || ! StringChr(ptr + 1, '\n')) {
835  *ptr = '\n';
836  break;
837  }
838 
839  bptr += ParFlat_COL_DATA;
840  string& taxname = org_ref.SetTaxname();
841  taxname += ' ';
842  taxname += bptr;
843 
844  *ptr = '\n';
845  }
846 
847  *end = ch;
848 
    /* "Unknown." loses its trailing period. */
849  if (org_ref.GetTaxname() == "Unknown.") {
850  string& taxname = org_ref.SetTaxname();
851  taxname = taxname.substr(0, taxname.size() - 1);
852  }
853 
    /* Whatever lies between the stopping position of the loop above and the
     * end of the block is treated as lineage. */
854  ptr = GetGenBankLineage(bptr, end);
855  if (ptr) {
856  org_ref.SetOrgname().SetLineage(ptr);
857  MemFree(ptr);
858  }
859 
860  CRef<CSeqdesc> descr(new CSeqdesc);
861  descr->SetSource(*bio_src);
862  bioseq.SetDescr().Set().push_back(descr);
863 }
864 
865 /**********************************************************/
866 static void fta_get_user_field(char* line, const Char* tag, CUser_object& user_obj)
867 {
868  char* p;
869  char* q;
870  char* res;
871  Char ch;
872 
873  p = StringStr(line, "USER ");
874  if (! p)
875  ch = '\0';
876  else {
877  ch = 'U';
878  *p = '\0';
879  }
880 
881  res = StringSave(line);
882  if (ch == 'U')
883  *p = 'U';
884 
885  for (q = res, p = res; *p != '\0'; p++)
886  if (*p != ' ')
887  *q++ = *p;
888  *q = '\0';
889 
890  CRef<CUser_field> root_field(new CUser_field);
891  root_field->SetLabel().SetStr(tag);
892 
893  for (q = res;;) {
894  q = StringStr(q, "\nACCESSION=");
895  if (! q)
896  break;
897 
898  q += 11;
899  for (p = q; *p != '\0' && *p != '\n' && *p != ';';)
900  p++;
901  ch = *p;
902  *p = '\0';
903 
904  CRef<CUser_field> cur_field(new CUser_field);
905  cur_field->SetLabel().SetStr("accession");
906  cur_field->SetString(q);
907 
908  *p = ch;
909 
910  CRef<CUser_field> field_set(new CUser_field);
911  field_set->SetData().SetFields().push_back(cur_field);
912 
913  if (StringEquN(p, ";gi=", 4)) {
914  p += 4;
915  for (q = p; *p >= '0' && *p <= '9';)
916  p++;
917  ch = *p;
918  *p = '\0';
919 
920  cur_field.Reset(new CUser_field);
921  cur_field->SetLabel().SetStr("gi");
922  cur_field->SetNum(atoi(q));
923  field_set->SetData().SetFields().push_back(cur_field);
924 
925  *p = ch;
926  }
927 
928  root_field->SetData().SetFields().push_back(cur_field);
929  }
930 
931  MemFree(res);
932 
933  if (! root_field->IsSetData())
934  return;
935 
936  user_obj.SetData().push_back(root_field);
937 }
938 
939 /**********************************************************/
940 static void fta_get_str_user_field(char* line, const Char* tag, CUser_object& user_obj)
941 {
942  char* p;
943  char* q;
944  char* r;
945  char* res;
946  Char ch;
947 
948  p = StringStr(line, "USER ");
949  if (! p)
950  ch = '\0';
951  else {
952  ch = 'U';
953  *p = '\0';
954  }
955 
956  res = MemNew(StringLen(line) + 1);
957  for (q = line; *q == ' ' || *q == '\n';)
958  q++;
959  for (r = res; *q != '\0';) {
960  if (*q != '\n') {
961  *r++ = *q++;
962  continue;
963  }
964  while (*q == ' ' || *q == '\n')
965  q++;
966  if (*q != '\0')
967  *r++ = ' ';
968  }
969  *r = '\0';
970  if (ch == 'U')
971  *p = 'U';
972 
973  if (*res == '\0') {
974  MemFree(res);
975  return;
976  }
977 
978  CRef<CUser_field> field(new CUser_field);
979  field->SetLabel().SetStr(tag);
980  field->SetString(res);
981 
982  MemFree(res);
983 
984  user_obj.SetData().push_back(field);
985 }
986 
987 /**********************************************************/
988 static void fta_get_user_object(CSeq_entry& seq_entry, const DataBlk& entry)
989 {
990  char* p;
991  char* q;
992  char* r;
993  Char ch;
994  size_t l;
995 
996  p = xSrchNodeType(entry, ParFlat_USER, &l);
997  if (l < ParFlat_COL_DATA)
998  return;
999 
1000  ch = p[l - 1];
1001  p[l - 1] = '\0';
1002  q = StringSave(p);
1003  p[l - 1] = ch;
1004 
1005  CRef<CUser_object> user_obj(new CUser_object);
1006  user_obj->SetType().SetStr("RefGeneTracking");
1007 
1008  for (p = q;;) {
1009  p = StringStr(p, "USER ");
1010  if (! p)
1011  break;
1012  for (p += 12; *p == ' ';)
1013  p++;
1014  for (r = p; *p != '\0' && *p != '\n' && *p != ' ';)
1015  p++;
1016  if (*p == '\0' || p == r)
1017  break;
1018  if (StringEquN(r, "Related", 7))
1019  fta_get_user_field(p, "Related", *user_obj);
1020  else if (StringEquN(r, "Assembly", 8))
1021  fta_get_user_field(p, "Assembly", *user_obj);
1022  else if (StringEquN(r, "Comment", 7))
1023  fta_get_str_user_field(p, "Comment", *user_obj);
1024  else
1025  continue;
1026  }
1027 
1028  MemFree(q);
1029 
1030  if (! user_obj->IsSetData())
1031  return;
1032 
1033  CRef<CSeqdesc> descr(new CSeqdesc);
1034  descr->SetUser(*user_obj);
1035 
1036  if (seq_entry.IsSeq())
1037  seq_entry.SetSeq().SetDescr().Set().push_back(descr);
1038  else
1039  seq_entry.SetSet().SetDescr().Set().push_back(descr);
1040 }
1041 
1042 /**********************************************************/
/* Appends a "CAGE-Tag-List" user-object descriptor to `descrs`.
 *
 * descrs - descriptor list that receives the new Seqdesc.
 * offset - text of the source block; nothing is done when it is nullptr.
 * len    - stored as the integer field "CAGE_tag_total".
 *
 * The first line of the text is split at the first '-': the left part is
 * stored as "CAGE_accession_first", the right part as "CAGE_accession_last".
 */
1043 static void fta_get_mga_user_object(TSeqdescList& descrs, char* offset, size_t len)
1044 {
1045  char* str;
1046  char* p;
1047 
1048  if (! offset)
1049  return;
1050 
    /* NOTE(review): listing line 1051 is absent from this view. It presumably
     * copies text from `offset` into `str`; as shown, `str` is read below
     * without being assigned. */
    /* Keep the first line only. */
1052  p = StringChr(str, '\n');
1053  if (p)
1054  *p = '\0';
    /* Split "first-last" at the dash; p then points at the second part. */
1055  p = StringChr(str, '-');
1056  if (p)
1057  *p++ = '\0';
1058 
1059  CRef<CUser_object> user_obj(new CUser_object);
1060 
1061  CObject_id& id = user_obj->SetType();
1062  id.SetStr("CAGE-Tag-List");
1063 
1064  CRef<CUser_field> field(new CUser_field);
1065 
1066  field->SetLabel().SetStr("CAGE_tag_total");
1067  field->SetData().SetInt(static_cast<CUser_field::C_Data::TInt>(len));
1068  user_obj->SetData().push_back(field);
1069 
1070  field.Reset(new CUser_field);
1071 
1072  field->SetLabel().SetStr("CAGE_accession_first");
1073  field->SetData().SetStr(str);
1074  user_obj->SetData().push_back(field);
1075 
1076  field.Reset(new CUser_field);
1077 
1078  field->SetLabel().SetStr("CAGE_accession_last");
    /* NOTE(review): p is nullptr here when the line contains no '-'; it is
     * passed to SetStr() unchecked -- confirm that callers guarantee a dash. */
1079  field->SetData().SetStr(p);
1080  user_obj->SetData().push_back(field);
1081 
1082  MemFree(str);
1083 
1084  CRef<CSeqdesc> descr(new CSeqdesc);
1085  descr->SetUser(*user_obj);
1086 
1087  descrs.push_back(descr);
1088 }
1089 
1090 /**********************************************************/
/* Appends descriptors to bioseq for the entry at pp->curindx: title,
 * DBLINK and MGA user objects, publications, MolInfo, GB-block, user
 * objects from structured comments, comment and update date. It also calls
 * fta_add_hist() and fta_parse_tpa_tsa_block() on bioseq.
 *
 * Whenever a check fails the function sets ibp->drop = true and returns
 * immediately; descriptors appended before that point stay in bioseq.
 *
 *   pp     - parser state (current entry index, source, option flags)
 *   entry  - data block of the entry being converted
 *   bioseq - Bioseq whose descriptor list is extended
 *
 * NOTE(review): this listing lacks original lines 1129, 1133, 1171, 1173,
 * 1270 and 1301, so several assignments to offset / str are not visible
 * below. The notes at each gap mark where; confirm against the upstream
 * file before changing the logic.
 */
1091 static void GetGenBankDescr(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq)
1092 {
1093  IndexblkPtr ibp;
1094 
1095  DataBlkPtr dbp;
1096 
1097  char* offset;
1098  char* str;
1099  char* p;
1100  char* q;
1101 
1102  bool is_htg;
1103 
1104  ibp = pp->entrylist[pp->curindx];
1105 
1106  CBioSource* bio_src = nullptr;
1107  COrg_ref* org_ref = nullptr;
1108 
1109  /* ORGANISM
1110  */
1111 
      /* Use the first Source descriptor already present on the Bioseq;
       * org_ref stays null when that source has no Org set. */
1112  for (auto& descr : bioseq.SetDescr().Set()) {
1113  if (descr->IsSource()) {
1114  bio_src = &(descr->SetSource());
1115  if (bio_src->IsSetOrg())
1116  org_ref = &bio_src->SetOrg();
1117  break;
1118  }
1119  }
1120 
1121  /* MolInfo from LOCUS line
1122  */
1123  CRef<CMolInfo> mol_info = GetGenBankMolInfo(pp, entry, org_ref);
1124 
1125  /* DEFINITION data ==> descr_title
1126  */
1127  str = nullptr;
1128  size_t len = 0;
      /* NOTE(review): original line 1129 is missing from this listing;
       * presumably it assigns offset from the DEFINITION block. */
1130 
1131  string title;
1132  if (offset) {
      /* NOTE(review): original line 1133 is missing from this listing;
       * presumably it allocates str from the block text. */
1134 
      /* Strip leading blanks from the title text. */
1135  for (p = str; *p == ' ';)
1136  p++;
1137  if (p > str)
1138  fta_StringCpy(str, p);
1139 
1140  title = str;
1141  MemFree(str);
1142  str = nullptr;
1143 
1144  CRef<CSeqdesc> descr(new CSeqdesc);
1145  descr->SetTitle(title);
1146  bioseq.SetDescr().Set().push_back(descr);
1147 
      /* A "TPA:", "TSA:" or "TLS:" prefix on a record not flagged as such
       * rejects the entry (the TPA check is skipped for EMBL). */
1148  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL &&
1149  StringEquN(title.c_str(), "TPA:", 4)) {
1150  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA:\" prefix is present on its definition line. Entry dropped.");
1151  ibp->drop = true;
1152  return;
1153  }
1154  if (ibp->is_tsa == false && StringEquN(title.c_str(), "TSA:", 4)) {
1155  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA:\" prefix is present on its definition line. Entry dropped.");
1156  ibp->drop = true;
1157  return;
1158  }
1159  if (ibp->is_tls == false && StringEquN(title.c_str(), "TLS:", 4)) {
1160  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS:\" prefix is present on its definition line. Entry dropped.");
1161  ibp->drop = true;
1162  return;
1163  }
1164  }
1165 
      /* DBLINK block; dbuop is passed to fta_dblink_has_sra() further down. */
1166  CRef<CUser_object> dbuop;
1167  offset = xSrchNodeType(entry, ParFlat_DBLINK, &len);
1168  if (offset)
1169  fta_get_dblink_user_object(bioseq.SetDescr().Set(), offset, len, pp->source, &ibp->drop, dbuop);
1170  else {
      /* NOTE(review): original lines 1171 and 1173 are missing from this
       * listing, so the fallback taken when there is no DBLINK block is not
       * visible here. */
1172  if (offset)
1174  }
1175 
      /* For MGA records ibp->bases (not the block length) is passed as the
       * len argument of fta_get_mga_user_object(). */
1176  if (ibp->is_mga) {
1177  offset = xSrchNodeType(entry, ParFlat_MGA, &len);
1178  fta_get_mga_user_object(bioseq.SetDescr().Set(), offset, ibp->bases);
1179  }
      /* Records flagged TPA / TSA / TLS must carry the matching prefix on
       * the title; an empty title counts as a missing prefix. */
1180  if (ibp->is_tpa &&
1181  (title.empty() || (! StringEquN(title.c_str(), "TPA:", 4) &&
1182  ! StringEquN(title.c_str(), "TPA_exp:", 8) &&
1183  ! StringEquN(title.c_str(), "TPA_inf:", 8) &&
1184  ! StringEquN(title.c_str(), "TPA_asm:", 8) &&
1185  ! StringEquN(title.c_str(), "TPA_reasm:", 10)))) {
1186  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA:\" prefix on its definition line. Entry dropped.");
1187  ibp->drop = true;
1188  return;
1189  }
1190  if (ibp->is_tsa && ! ibp->is_tpa &&
1191  (title.empty() || ! StringEquN(title.c_str(), "TSA:", 4))) {
1192  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA:\" prefix on its definition line. Entry dropped.");
1193  ibp->drop = true;
1194  return;
1195  }
1196  if (ibp->is_tls && (title.empty() || ! StringEquN(title.c_str(), "TLS:", 4))) {
1197  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS:\" prefix on its definition line. Entry dropped.");
1198  ibp->drop = true;
1199  return;
1200  }
1201 
1202  /* REFERENCE
1203  */
1204  /* pub should be before GBblock because we need patent ref
1205  */
      /* One Pub descriptor per ParFlat_REF_END node for which DescrRefs()
       * returns a non-empty result. */
1206  dbp = TrackNodeType(entry, ParFlat_REF_END);
1207  for (; dbp; dbp = dbp->mpNext) {
1208  if (dbp->mType != ParFlat_REF_END)
1209  continue;
1210 
1211  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA);
1212  if (pubdesc.NotEmpty()) {
1213  CRef<CSeqdesc> descr(new CSeqdesc);
1214  descr->SetPub(*pubdesc);
1215  bioseq.SetDescr().Set().push_back(descr);
1216  }
1217  }
1218 
      /* Same treatment for ParFlat_REF_NO_TARGET nodes. */
1219  dbp = TrackNodeType(entry, ParFlat_REF_NO_TARGET);
1220  for (; dbp; dbp = dbp->mpNext) {
1221  if (dbp->mType != ParFlat_REF_NO_TARGET)
1222  continue;
1223 
1224  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA);
1225  if (pubdesc.NotEmpty()) {
1226  CRef<CSeqdesc> descr(new CSeqdesc);
1227  descr->SetPub(*pubdesc);
1228  bioseq.SetDescr().Set().push_back(descr);
1229  }
1230  }
1231 
1232  /* GB-block
1233  */
1234  CRef<CGB_block> gbbp = GetGBBlock(pp, entry, *mol_info, bio_src);
1235 
      /* DDBJ / EMBL contig records without a known tech: take the tech
       * reported by fta_check_con_for_wgs(), or clear it when that is
       * unknown as well. */
1236  if ((pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL) &&
1237  ibp->is_contig && (! mol_info->IsSetTech() || mol_info->GetTech() == CMolInfo::eTech_unknown)) {
1238  CMolInfo::TTech tech = fta_check_con_for_wgs(bioseq);
1239  if (tech == CMolInfo::eTech_unknown)
1240  mol_info->ResetTech();
1241  else
1242  mol_info->SetTech(tech);
1243  }
1244 
1245  if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
1246  CRef<CSeqdesc> descr(new CSeqdesc);
1247  descr->SetMolinfo(*mol_info);
1248  bioseq.SetDescr().Set().push_back(descr);
1249  }
1250 
      /* No GB-block means the entry is dropped; the MolInfo descriptor
       * above has already been appended at this point. */
1251  if (gbbp.Empty()) {
1252  ibp->drop = true;
1253  return;
1254  }
1255 
1256  if (pp->taxserver == 1 && gbbp->IsSetDiv())
1257  fta_fix_orgref_div(bioseq.GetAnnot(), org_ref, *gbbp);
1258 
      /* The two calls differ only in the boolean argument, which is true
       * when the division starts with "CON" (case-insensitive). */
1259  if (StringEquNI(ibp->division, "CON", 3))
1260  fta_add_hist(pp, bioseq, gbbp->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, true, ibp->acnum);
1261  else
1262  fta_add_hist(pp, bioseq, gbbp->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, false, ibp->acnum);
1263 
1264  {
1265  CRef<CSeqdesc> descr(new CSeqdesc);
1266  descr->SetGenbank(*gbbp);
1267  bioseq.SetDescr().Set().push_back(descr);
1268  }
1269 
      /* NOTE(review): original line 1270 is missing from this listing;
       * presumably it re-assigns offset (and len) from the PRIMARY block,
       * which is what the checks below refer to. */
      /* TPA, non-WGS record without that block: inferential / experimental
       * records are accepted only if fta_dblink_has_sra(dbuop) holds; other
       * records only if ibp->specialist_db is set. */
1271  if (! offset && ibp->is_tpa && ibp->is_wgs == false) {
1272  if (ibp->inferential || ibp->experimental) {
1273  if (! fta_dblink_has_sra(dbuop)) {
1274  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA:%s record lacks both AH/PRIMARY linetype and Sequence Read Archive links. Entry dropped.", (ibp->inferential == false) ? "experimental" : "inferential");
1275  ibp->drop = true;
1276  return;
1277  }
1278  } else if (ibp->specialist_db == false) {
1279  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA record lacks required AH/PRIMARY linetype. Entry dropped.");
1280  ibp->drop = true;
1281  return;
1282  }
1283  }
1284 
1285  if (offset && len > 0 &&
1286  fta_parse_tpa_tsa_block(bioseq, offset, ibp->acnum, ibp->vernum, len, ParFlat_COL_DATA, ibp->is_tpa) == false) {
1287  ibp->drop = true;
1288  return;
1289  }
1290 
      /* is_htg is true for tech htgs_0, htgs_1 or htgs_2. */
1291  if (mol_info.NotEmpty() && mol_info->IsSetTech() &&
1292  (mol_info->GetTech() == CMolInfo::eTech_htgs_0 ||
1293  mol_info->GetTech() == CMolInfo::eTech_htgs_1 ||
1294  mol_info->GetTech() == CMolInfo::eTech_htgs_2))
1295  is_htg = true;
1296  else
1297  is_htg = false;
1298 
1299  /* COMMENT data
1300  */
      /* NOTE(review): original line 1301 is missing from this listing;
       * presumably it assigns offset (and len) from the COMMENT block. */
1302  if (offset && len > 0) {
      /* With pp->xml_comp set, false is passed instead of is_htg. */
1303  str = GetDescrComment(offset, len, ParFlat_COL_DATA, (pp->xml_comp ? false : is_htg), ibp->is_pat);
1304  if (str) {
1305  bool bad = false;
1306  TUserObjVector user_objs;
1307 
      /* str is passed to the structured-comment parser first; one User
       * descriptor is added per returned object, and a bad result drops
       * the entry. */
1308  fta_parse_structured_comment(str, bad, user_objs);
1309  if (bad) {
1310  ibp->drop = true;
1311  MemFree(str);
1312  return;
1313  }
1314 
1315  for (auto& user_obj : user_objs) {
1316  CRef<CSeqdesc> descr(new CSeqdesc);
1317  descr->SetUser(*user_obj);
1318  bioseq.SetDescr().Set().push_back(descr);
1319  }
1320 
      /* xml_comp mode: in-place squeeze of str. A ';' directly followed by
       * ' ' or '~' is treated as a blank, and every run of blanks and
       * tildes is collapsed into one space. */
1321  if (pp->xml_comp) {
1322  for (q = str, p = q; *p != '\0';) {
1323  if (*p == ';' && (p[1] == ' ' || p[1] == '~'))
1324  *p = ' ';
1325  if (*p == '~' || *p == ' ') {
1326  *q++ = ' ';
1327  for (p++; *p == ' ' || *p == '~';)
1328  p++;
1329  } else
1330  *q++ = *p++;
1331  }
1332  *q = '\0';
1333  }
1334 
      /* Whatever text remains in str becomes the Comment descriptor. */
1335  if (str[0] != 0) {
1336  CRef<CSeqdesc> descr(new CSeqdesc);
1337  descr->SetComment(str);
1338  bioseq.SetDescr().Set().push_back(descr);
1339  }
1340  MemFree(str);
1341  }
1342  }
1343 
1344  /* DATE
1345  */
1346  if (pp->no_date) /* -N in command line means no date */
1347  return;
1348 
      /* With pp->date the current time is used; otherwise the date is read
       * at entry.mOffset + ibp->lc.date when that position is positive. */
1349  CRef<CDate> date;
1350  if (pp->date) /* -L in command line means replace date */
1351  {
1352  CTime time(CTime::eCurrent);
1353  date.Reset(new CDate);
1354  date->SetToTime(time);
1355  } else if (ibp->lc.date > 0) {
1356  CRef<CDate_std> std_date = GetUpdateDate(entry.mOffset + ibp->lc.date, pp->source);
1357  if (std_date.NotEmpty()) {
1358  date.Reset(new CDate);
1359  date->SetStd(*std_date);
1360  }
1361  }
1362 
1363  if (date.NotEmpty()) {
1364  CRef<CSeqdesc> descr(new CSeqdesc);
1365  descr->SetUpdate_date(*date);
1366  bioseq.SetDescr().Set().push_back(descr);
1367  }
1368 }
1369 
1370 /**********************************************************/
1371 static void GenBankGetDivision(char* division, Int4 div, const DataBlk& entry)
1372 {
1373  StringNCpy(division, GBDivOffset(entry, div), 3);
1374  division[3] = '\0';
1375 }
1376 
1377 static void xGenBankGetDivision(char* division, Int4 div, const string& locusText)
1378 {
1379  StringCpy(division, locusText.substr(64, 3).c_str());
1380 }
1381 
1382 /**********************************************************
1383  *
1384  * bool GenBankAscii(pp):
1385  *
1386  * Returns false if an entry block could not be loaded.
1387  *
1388  * 3-17-93
1389  *
1390  **********************************************************/
/* Body of the conversion loop over all indexed entries (pp->entrylist,
 * pp->indx items). For each entry it loads the text, splits it into blocks,
 * builds a Bioseq with instance data, features and descriptors, and - once
 * the last segment of a set (or a stand-alone entry) has been reached -
 * post-processes the collected Seq-entries and moves them to pp->entries.
 * Returns false when LoadEntry() yields no entry, true otherwise.
 *
 * NOTE(review): original lines 1391 (the function signature), 1441 and 1804
 * are missing from this listing; confirm against the upstream file.
 */
1392 {
1393  Int2 curkw;
1394  int imax;
1395  int segindx;
1396  int total = 0;
1397  int total_long = 0;
1398  int total_dropped = 0;
1399  char* ptr;
1400  char* eptr;
1401  char* div;
1402  unique_ptr<DataBlk, decltype(&xFreeEntry)> pEntry(nullptr, &xFreeEntry);
1403  EntryBlkPtr ebp;
1404 
1405  // unsigned char* dnaconv;
1406  // unsigned char* protconv;
1407  unsigned char* conv;
1408 
1409  TEntryList seq_entries;
1410 
1411  CSeq_loc locs;
1412 
1413  bool seq_long = false;
1414 
1415  IndexblkPtr ibp;
1416  IndexblkPtr tibp;
1417 
1418  auto dnaconv = GetDNAConv(); /* set up sequence alphabets */
1419  auto protconv = GetProteinConv(); /* set up sequence alphabets */
1420 
      /* segindx: index of the first member of the segmented set currently
       * being collected (entry with segnum == 1). */
1421  segindx = -1;
1422 
1423  imax = pp->indx;
1424  for (int i = 0; i < imax; i++) {
1425  pp->curindx = i;
1426  ibp = pp->entrylist[i];
1427 
1428  err_install(ibp, pp->accver);
1429 
1430  if (ibp->segnum == 1)
1431  segindx = i;
1432 
      /* Throughout the loop a dropped entry is skipped right away only
       * when segnum == 0; members of a segmented set are rejected as a
       * whole further down. */
1433  if (ibp->drop && ibp->segnum == 0) {
1434  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1435  total_dropped++;
1436  continue;
1437  }
1438 
1439  pEntry.reset(LoadEntry(pp, ibp->offset, ibp->len));
1440  if (! pEntry) {
      /* NOTE(review): original line 1441 is missing from this listing. */
1442  // dnaconv.reset();
1443  // protconv.reset();
1444  return false;
1445  }
1446 
      /* Split the entry text into blocks until the END keyword is reached
       * or the buffer is exhausted. */
1447  ebp = static_cast<EntryBlk*>(pEntry->mpData);
1448  ptr = pEntry->mOffset;
1449  eptr = ptr + pEntry->len;
1450  curkw = ParFlat_LOCUS;
1451  while (curkw != ParFlat_END && ptr < eptr) {
1452  ptr = GetGenBankBlock(&ebp->chain, ptr, &curkw, eptr);
1453  }
1454 
      /* ppCurrentEntry is the same element as ibp (pp->curindx == i). */
1455  auto ppCurrentEntry = pp->entrylist[pp->curindx];
1456  if (ppCurrentEntry->lc.div > -1) {
1457  GenBankGetDivision(ppCurrentEntry->division, ppCurrentEntry->lc.div, *pEntry);
1458  if (StringEqu(ibp->division, "TSA")) {
1459  if (ibp->tsa_allowed == false)
1460  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
1461  ibp->is_tsa = true;
1462  }
1463  }
1464 
1465  CheckContigEverywhere(ibp, pp->source);
1466  if (ibp->drop && ibp->segnum == 0) {
1467  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1468  total_dropped++;
1469  continue;
1470  }
1471 
      /* The block scan above ran to (or past) the end of the buffer. */
1472  if (ptr >= eptr) {
1473  ibp->drop = true;
1474  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry. Entry dropped.");
1475  if (ibp->segnum == 0) {
1476  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1477  total_dropped++;
1478  continue;
1479  }
1480  }
1481  GetGenBankSubBlock(*pEntry, ibp->bases);
1482 
1483  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
1484  ebp->seq_entry.Reset(new CSeq_entry);
1485  ebp->seq_entry->SetSeq(*bioseq);
1486  GetScope().AddBioseq(*bioseq);
1487 
1488  AddNIDSeqId(*bioseq, *pEntry, ParFlat_NCBI_GI, ParFlat_COL_DATA, pp->source);
1489 
      /* "aa" at position lc.bp of the entry text selects the protein
       * conversion table, anything else the DNA table. */
1490  if (StringEquN(pEntry->mOffset + ibp->lc.bp, "aa", 2)) {
1491  ibp->is_prot = true;
1492  conv = protconv.get();
1493  } else {
1494  ibp->is_prot = false;
1495  conv = dnaconv.get();
1496  }
1497 
1498 
1499  if (! GetGenBankInst(pp, *pEntry, conv)) {
1500  ibp->drop = true;
1501  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data. Entry dropped.");
1502  if (ibp->segnum == 0) {
1503  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1504  total_dropped++;
1505  continue;
1506  }
1507  }
1508 
1509  FakeGenBankBioSources(*pEntry, *bioseq);
1510  LoadFeat(pp, *pEntry, *bioseq);
1511 
1512  if (! bioseq->IsSetAnnot() && ibp->drop && ibp->segnum == 0) {
1513  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1514  total_dropped++;
1515  continue;
1516  }
1517 
1518  GetGenBankDescr(pp, *pEntry, *bioseq);
1519  if (ibp->drop && ibp->segnum == 0) {
1520  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1521  total_dropped++;
1522  continue;
1523  }
1524 
1525  fta_set_molinfo_completeness(*bioseq, ibp);
1526 
1527  if (ibp->is_tsa)
1528  fta_tsa_tls_comment_dblink_check(*bioseq, true);
1529 
1530  if (ibp->is_tls)
1531  fta_tsa_tls_comment_dblink_check(*bioseq, false);
1532 
      /* Nucleotide sequences: raw sequences with gaps go through
       * GapsToDelta(); raw sequences with htg 1, 2 or 4 (or DDBJ patents)
       * through SeqToDelta(); non-raw sequences with gaps through
       * AssemblyGapsToDelta(). */
1533  if (bioseq->GetInst().IsNa()) {
1534  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
1535  if (ibp->gaps)
1536  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1537  else if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
1538  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
1539  SeqToDelta(*bioseq, ibp->htg);
1540  } else if (ibp->gaps)
1541  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1542  }
1543 
      /* A missing date is fatal only outside debug mode, when dates were
       * not switched off and the mode is not Relaxed. */
1544  if (no_date(pp->format, bioseq->GetDescr().Get()) && pp->debug == false &&
1545  pp->no_date == false &&
1546  pp->mode != Parser::EMode::Relaxed) {
1547  ibp->drop = true;
1548  ErrPostStr(SEV_ERROR, ERR_DATE_IllegalDate, "Illegal create date. Entry dropped.");
1549  if (ibp->segnum == 0) {
1550  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1551  total_dropped++;
1552  continue;
1553  }
1554  }
1555 
      /* Quality scores: taken from one of the callbacks; the qsfd lookup is
       * a separate if (not an else-if), so it overwrites a callback result
       * when both apply. */
1556  if (pEntry->mpQscore.empty() && pp->accver) {
1557  if (pp->ff_get_qscore)
1558  pEntry->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
1559  else if (pp->ff_get_qscore_pp)
1560  pEntry->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
1561  if (pp->qsfd && ibp->qslength > 0)
1562  pEntry->mpQscore = GetQSFromFile(pp->qsfd, ibp);
1563  }
1564 
      /* A failed score conversion drops the entry unless pp->ign_bad_qs is
       * set, in which case only the error is reported. */
1565  if (! QscoreToSeqAnnot(pEntry->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, true)) {
1566  if (pp->ign_bad_qs == false) {
1567  ibp->drop = true;
1568  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore. Entry dropped.");
1569  if (ibp->segnum == 0) {
1570  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1571  total_dropped++;
1572  continue;
1573  }
1574  } else {
1575  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore.");
1576  }
1577  }
1578 
1579  pEntry->mpQscore.clear();
1580 
      /* Move a pending patent id from the index block onto the Bioseq. */
1581  if (ibp->psip.NotEmpty()) {
1582  CRef<CSeq_id> id(new CSeq_id);
1583  id->SetPatent(*ibp->psip);
1584  bioseq->SetId().push_back(id);
1585  ibp->psip.Reset();
1586  }
1587 
1588  /* add PatentSeqId if patent is found in reference
1589  */
      /* Entries without references are tolerated for FlyBase, RefSeq "NW_"
       * accessions and WGS records; all others are dropped. */
1590  if (pp->mode != Parser::EMode::Relaxed &&
1591  pp->debug == false &&
1592  ibp->wgs_and_gi != 3 &&
1593  no_reference(*bioseq)) {
1594  if (pp->source == Parser::ESource::Flybase) {
1595  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for entry from FlyBase. Continue anyway.");
1596  } else if (pp->source == Parser::ESource::Refseq &&
1597  StringEquN(ibp->acnum, "NW_", 3)) {
1598  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for RefSeq's NW_ entry. Continue anyway.");
1599  } else if (ibp->is_wgs) {
1600  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for WGS entry. Continue anyway.");
1601  } else {
1602  ibp->drop = true;
1603  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references. Entry dropped.");
1604  if (ibp->segnum == 0) {
1605  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1606  total_dropped++;
1607  continue;
1608  }
1609  }
1610  }
1611 
      /* segnum == segtotal: this entry completes the current group
       * (presumably both are 0 for a non-segmented entry - confirm), so the
       * collected Seq-entries are finalized here. Otherwise (else branch at
       * the bottom) the entry is only added to seq_entries. */
1612  if (ibp->segnum == ibp->segtotal) {
1613  seq_entries.push_back(ebp->seq_entry);
1614  ebp->seq_entry.Reset();
1615 
1616  if (ibp->segnum < 2) {
1617  if (ibp->segnum != 0) {
1618  ErrPostEx(SEV_WARNING, ERR_SEGMENT_OnlyOneMember, "Segmented set contains only one member.");
1619  }
1620  segindx = i;
1621  } else {
1622  GetSeqExt(pp, locs);
1623  // LCOV_EXCL_START
1624  // Excluded per Mark's request on 12/14/2016
1625  BuildBioSegHeader(pp, seq_entries, locs);
1626  // LCOV_EXCL_STOP
1627  }
1628 
1629  /* reject the whole set if any one entry was rejected
1630  */
1631  if (ibp->segnum != 0) {
1632  div = pp->entrylist[segindx]->division;
1633  int j = segindx;
      /* The scan stops at the first dropped member, so j <= i afterwards
       * means that at least one member was dropped. */
1634  for (; j <= i; j++) {
1635  tibp = pp->entrylist[j];
1636  err_install(tibp, pp->accver);
1637  if (! StringEqu(div, tibp->division)) {
1638  ErrPostEx(SEV_WARNING, ERR_DIVISION_Mismatch, "Division different in segmented set: %s: %s", div, tibp->division);
1639  }
1640  if (tibp->drop) {
1641  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set");
1642  break;
1643  }
1644  }
1645  if (j <= i) {
1646  for (j = segindx; j <= i; j++) {
1647  tibp = pp->entrylist[j];
1648  err_install(tibp, pp->accver);
1649  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1650  total_dropped++;
1651  }
1652 
1653  seq_entries.clear();
1654  continue;
1655  }
1656  }
1657 
1658  DealWithGenes(seq_entries, pp);
1659 
      /* seq_entries being empty after DealWithGenes() is treated as a
       * rejection of the entry / the whole set. */
1660  if (seq_entries.empty()) {
1661  if (ibp->segnum != 0) {
1662  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set.");
1663  int j = segindx;
1664  for (; j <= i; j++) {
1665  tibp = pp->entrylist[j];
1666  err_install(tibp, pp->accver);
1667  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1668  total_dropped++;
1669  }
1670  } else {
1671  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1672  total_dropped++;
1673  }
1674  continue;
1675  }
1676 
1677  if (pp->source == Parser::ESource::Flybase && ! seq_entries.empty())
1678  fta_get_user_object(*(*seq_entries.begin()), *pEntry);
1679 
1680  /* remove out all the features if their seqloc has
1681  * "join" or "order" among other segments, to the annot
1682  * which in class = parts
1683  */
1684  if (ibp->segnum != 0)
1685  // LCOV_EXCL_START
1686  // Excluded per Mark's request on 12/14/2016
1687  CheckFeatSeqLoc(seq_entries);
1688  // LCOV_EXCL_STOP
1689 
1690  fta_find_pub_explore(pp, seq_entries);
1691 
1692  /* change qual "citation" on features to SeqFeat.cit
1693  * find citation in the list by serial_number.
1694  * If serial number not found remove /citation
1695  */
1696  ProcessCitations(seq_entries);
1697 
1698  /* check for long sequences in each segment */
      /* Over-long sequences with htg 1, 2 or 4 only get a warning; any
       * other over-long sequence sets seq_long. */
1699  if (pp->limit != 0) {
1700  if (ibp->segnum != 0) {
1701  int j = segindx;
1702  for (; j <= i; j++) {
1703  tibp = pp->entrylist[j];
1704  err_install(tibp, pp->accver);
1705  if (tibp->bases <= (size_t)pp->limit)
1706  continue;
1707 
1708  if (tibp->htg == 1 || tibp->htg == 2 || tibp->htg == 4) {
1709  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", tibp->locusname, tibp->acnum, pp->limit);
1710  } else {
1711  seq_long = true;
1712  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", tibp->locusname, tibp->acnum, pp->limit);
1713  }
1714  }
1715  } else if (ibp->bases > (size_t)pp->limit) {
1716  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4) {
1717  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
1718  } else {
1719  seq_long = true;
1720  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", ibp->locusname, ibp->acnum, pp->limit);
1721  }
1722  }
1723  }
      /* Note: the loop variable pEntry below shadows the outer pEntry for
       * the duration of the loop. */
1724  if (pp->mode == Parser::EMode::Relaxed) {
1725  for (auto pEntry : seq_entries) {
1726  auto pScope = Ref(new CScope(*CObjectManager::GetInstance()));
1727  g_InstantiateMissingProteins(pScope->AddTopLevelSeqEntry(*pEntry));
1728  }
1729  }
1730  if (pp->convert) {
1731  if (pp->cleanup <= 1) {
1732  FinalCleanup(seq_entries);
1733 
1734  if (pp->qamode && ! seq_entries.empty())
1735  fta_remove_cleanup_user_object(*seq_entries.front());
1736  }
1737 
1738  MaybeCutGbblockSource(seq_entries);
1739  }
1740 
1741  EntryCheckDivCode(seq_entries, pp);
1742 
1743  if (pp->xml_comp)
1744  fta_set_strandedness(seq_entries);
1745 
1746  if (fta_EntryCheckGBBlock(seq_entries)) {
1747  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty, "Attention: GBBlock is not empty");
1748  }
1749 
1750  /* check for identical features
1751  */
1752  if (pp->qamode) {
1753  fta_sort_descr(seq_entries);
1754  fta_sort_seqfeat_cit(seq_entries);
1755  }
1756 
1757  if (pp->citat) {
1758  StripSerialNumbers(seq_entries);
1759  }
1760 
1761  PackEntries(seq_entries);
1762  CheckDupDates(seq_entries);
1763 
1764  if (ibp->segnum != 0) {
1765  int j = segindx;
1766  for (; j <= i; j++)
1767  err_install(pp->entrylist[j], pp->accver);
1768  }
      /* Over-long entries are only counted in total_long and are not moved
       * to pp->entries; everything else is spliced onto pp->entries and
       * counted in total.
       * NOTE(review): the summary message at the end reports total_long as
       * "including" in total, although total is not incremented on this
       * path - confirm which is intended. */
1769  if (seq_long) {
1770  seq_long = false;
1771  if (ibp->segnum != 0)
1772  total_long += (i - segindx + 1);
1773  else
1774  total_long++;
1775  } else {
1776  pp->entries.splice(pp->entries.end(), seq_entries);
1777 
1778  if (ibp->segnum != 0)
1779  total += (i - segindx + 1);
1780  else
1781  total++;
1782  }
1783 
      /* The "parsed successfully" message is posted on both paths above,
       * i.e. also for entries that were counted as long. */
1784  if (ibp->segnum != 0) {
1785  for (int j = segindx; j <= i; j++) {
1786  tibp = pp->entrylist[j];
1787  err_install(tibp, pp->accver);
1788  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", tibp->locusname, tibp->acnum);
1789  }
1790  } else {
1791  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
1792  }
1793 
1794  seq_entries.clear();
1795  } else {
      /* Not the last member of its set yet: keep collecting. */
1796  GetSeqExt(pp, locs);
1797 
1798  seq_entries.push_back(ebp->seq_entry);
1799  ebp->seq_entry.Reset();
1800  }
1801 
1802  } /* for, ascii block entries */
1803 
      /* NOTE(review): original line 1804 is missing from this listing. */
1805 
1806  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d (including: LONG ones = %d); SKIPPED = %d.", total, total_long, total_dropped);
1807  // MemFree(dnaconv);
1808  // MemFree(protconv);
1809 
1810  return true;
1811 }
1813 {
1814  int imax;
1815  int total = 0;
1816  int total_long = 0;
1817  int total_dropped = 0;
1818  unique_ptr<Entry> pEntry;
1819  unsigned char* conv;
1820 
1821  TEntryList seq_entries;
1822 
1823  CSeq_loc locs;
1824 
1825  IndexblkPtr ibp;
1826 
1827  auto dnaconv = GetDNAConv(); /* set up sequence alphabets */
1828  auto protconv = GetProteinConv(); /* set up sequence alphabets */
1829 
1830  imax = pp->indx;
1831  for (int i = 0; i < imax; i++) {
1832  pp->curindx = i;
1833  ibp = pp->entrylist[i];
1834 
1835  err_install(ibp, pp->accver);
1836 
1837  if (ibp->drop && ibp->segnum == 0) {
1838  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1839  total_dropped++;
1840  continue;
1841  }
1842 
1843  pEntry.reset(LoadEntryGenbank(pp, ibp->offset, ibp->len));
1844  if (! pEntry) {
1846  return false;
1847  }
1848 
1849  xGetGenBankBlocks(*pEntry);
1850 
1851  if (pp->entrylist[pp->curindx]->lc.div > -1) {
1852  xGenBankGetDivision(pp->entrylist[pp->curindx]->division, pp->entrylist[pp->curindx]->lc.div, pEntry->mBaseData);
1853  if (StringEqu(ibp->division, "TSA")) {
1854  if (ibp->tsa_allowed == false)
1855  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
1856  ibp->is_tsa = true;
1857  }
1858  }
1859 
1860  CheckContigEverywhere(ibp, pp->source);
1861  if (ibp->drop && ibp->segnum == 0) {
1862  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1863  total_dropped++;
1864  continue;
1865  }
1866 
1867  auto lastType = pEntry->mSections.back()->mType;
1868  if (lastType != ParFlat_END) {
1869  ibp->drop = true;
1870  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry. Entry dropped.");
1871  if (ibp->segnum == 0) {
1872  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1873  total_dropped++;
1874  continue;
1875  }
1876  }
1877  xGetGenBankSubBlocks(*pEntry, ibp->bases);
1878 
1879  CRef<CBioseq> pBioseq = CreateEntryBioseq(pp);
1880  pEntry->mSeqEntry.Reset(new CSeq_entry);
1881  pEntry->mSeqEntry->SetSeq(*pBioseq);
1882  GetScope().AddBioseq(*pBioseq);
1883  pEntry->xInitNidSeqId(*pBioseq, ParFlat_NCBI_GI, ParFlat_COL_DATA, pp->source);
1884 
1885  if (pEntry->IsAA()) {
1886  ibp->is_prot = true;
1887  conv = protconv.get();
1888  } else {
1889  ibp->is_prot = false;
1890  conv = dnaconv.get();
1891  }
1892 
1893  if (! pEntry->xInitSeqInst(conv)) {
1894  ibp->drop = true;
1895  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data. Entry dropped.");
1896  if (ibp->segnum == 0) {
1897  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1898  total_dropped++;
1899  continue;
1900  }
1901  }
1902  return false;
1903 
1904  /*FakeGenBankBioSources(*pEntry, *bioseq);
1905  LoadFeat(pp, *pEntry, *bioseq);
1906 
1907  if (! bioseq->IsSetAnnot() && ibp->drop && ibp->segnum == 0)
1908  {
1909  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1910  "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1911  total_dropped++;
1912  continue;
1913  }
1914 
1915  GetGenBankDescr(pp, *pEntry, *bioseq);
1916  if (ibp->drop && ibp->segnum == 0)
1917  {
1918  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1919  "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1920  total_dropped++;
1921  continue;
1922  }
1923 
1924  fta_set_molinfo_completeness(*bioseq, ibp);
1925 
1926  if (ibp->is_tsa)
1927  fta_tsa_tls_comment_dblink_check(*bioseq, true);
1928 
1929  if (ibp->is_tls)
1930  fta_tsa_tls_comment_dblink_check(*bioseq, false);
1931 
1932  if (bioseq->GetInst().IsNa())
1933  {
1934  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw)
1935  {
1936  if (ibp->gaps)
1937  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1938  else if(ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
1939  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
1940  SeqToDelta(*bioseq, ibp->htg);
1941  }
1942  else if (ibp->gaps)
1943  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1944  }
1945 
1946  if (no_date(pp->format, bioseq->GetDescr().Get()) && pp->debug == false &&
1947  pp->no_date == false &&
1948  pp->mode != Parser::EMode::Relaxed)
1949  {
1950  ibp->drop = true;
1951  ErrPostStr(SEV_ERROR, ERR_DATE_IllegalDate,
1952  "Illegal create date. Entry dropped.");
1953  if(ibp->segnum == 0)
1954  {
1955  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1956  "Entry skipped: \"%s|%s\".",
1957  ibp->locusname, ibp->acnum);
1958  total_dropped++;
1959  continue;
1960  }
1961  }
1962 
1963  if (! pEntry->mpQscore && pp->accver)
1964  {
1965  if (pp->ff_get_qscore)
1966  pEntry->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
1967  else if (pp->ff_get_qscore_pp)
1968  pEntry->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
1969  if (pp->qsfd && ibp->qslength > 0)
1970  pEntry->mpQscore = GetQSFromFile(pp->qsfd, ibp);
1971  }
1972 
1973  if (!QscoreToSeqAnnot(pEntry->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, true))
1974  {
1975  if(pp->ign_bad_qs == false)
1976  {
1977  ibp->drop = true;
1978  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse,
1979  "Error while parsing QScore. Entry dropped.");
1980  if(ibp->segnum == 0)
1981  {
1982  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1983  "Entry skipped: \"%s|%s\".",
1984  ibp->locusname, ibp->acnum);
1985  total_dropped++;
1986  continue;
1987  }
1988  }
1989  else
1990  {
1991  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse,
1992  "Error while parsing QScore.");
1993  }
1994  }
1995 
1996  if (pEntry->mpQscore)
1997  {
1998  MemFree(pEntry->mpQscore);
1999  pEntry->mpQscore = nullptr;
2000  }
2001 
2002  if (ibp->psip.NotEmpty())
2003  {
2004  CRef<CSeq_id> id(new CSeq_id);
2005  id->SetPatent(*ibp->psip);
2006  bioseq->SetId().push_back(id);
2007  ibp->psip.Reset();
2008  }
2009 
2010  // add PatentSeqId if patent is found in reference
2011  //
2012  if(pp->mode != Parser::EMode::Relaxed &&
2013  pp->debug == false &&
2014  ibp->wgs_and_gi != 3 &&
2015  no_reference(*bioseq))
2016  {
2017  if(pp->source == Parser::ESource::Flybase)
2018  {
2019  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
2020  "No references for entry from FlyBase. Continue anyway.");
2021  }
2022  else if(pp->source == Parser::ESource::Refseq &&
2023  StringEquN(ibp->acnum, "NW_", 3))
2024  {
2025  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
2026  "No references for RefSeq's NW_ entry. Continue anyway.");
2027  }
2028  else if(ibp->is_wgs)
2029  {
2030  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
2031  "No references for WGS entry. Continue anyway.");
2032  }
2033  else
2034  {
2035  ibp->drop = true;
2036  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references,
2037  "No references. Entry dropped.");
2038  if(ibp->segnum == 0)
2039  {
2040  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2041  "Entry skipped: \"%s|%s\".",
2042  ibp->locusname, ibp->acnum);
2043  total_dropped++;
2044  continue;
2045  }
2046  }
2047  }
2048 
2049  if (ibp->segnum == ibp->segtotal)
2050  {
2051  seq_entries.push_back(ebp->seq_entry);
2052  ebp->seq_entry.Reset();
2053 
2054  if (ibp->segnum < 2)
2055  {
2056  if(ibp->segnum != 0)
2057  {
2058  ErrPostEx(SEV_WARNING, ERR_SEGMENT_OnlyOneMember,
2059  "Segmented set contains only one member.");
2060  }
2061  segindx = i;
2062  }
2063  else
2064  {
2065  GetSeqExt(pp, locs);
2066 // LCOV_EXCL_START
2067 // Excluded per Mark's request on 12/14/2016
2068  BuildBioSegHeader(pp, seq_entries, locs);
2069 // LCOV_EXCL_STOP
2070  }
2071 
2072  // reject the whole set if any one entry was rejected
2073  //
2074  if(ibp->segnum != 0)
2075  {
2076  div = pp->entrylist[segindx]->division;
2077  int j = segindx;
2078  for(; j <= i; j++)
2079  {
2080  tibp = pp->entrylist[j];
2081  err_install(tibp, pp->accver);
2082  if (! StringEqu(div, tibp->division))
2083  {
2084  ErrPostEx(SEV_WARNING, ERR_DIVISION_Mismatch,
2085  "Division different in segmented set: %s: %s",
2086  div, tibp->division);
2087  }
2088  if (tibp->drop)
2089  {
2090  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected,
2091  "Reject the whole segmented set");
2092  break;
2093  }
2094  }
2095  if(j <= i)
2096  {
2097  for(j = segindx; j <= i; j++)
2098  {
2099  tibp = pp->entrylist[j];
2100  err_install(tibp, pp->accver);
2101  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2102  "Entry skipped: \"%s|%s\".",
2103  tibp->locusname, tibp->acnum);
2104  total_dropped++;
2105  }
2106 
2107  seq_entries.clear();
2108  continue;
2109  }
2110  }
2111 
2112  DealWithGenes(seq_entries, pp);
2113 
2114  if (seq_entries.empty())
2115  {
2116  if(ibp->segnum != 0)
2117  {
2118  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected,
2119  "Reject the whole segmented set.");
2120  int j = segindx;
2121  for(; j <= i; j++)
2122  {
2123  tibp = pp->entrylist[j];
2124  err_install(tibp, pp->accver);
2125  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2126  "Entry skipped: \"%s|%s\".",
2127  tibp->locusname, tibp->acnum);
2128  total_dropped++;
2129  }
2130  }
2131  else
2132  {
2133  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
2134  "Entry skipped: \"%s|%s\".",
2135  ibp->locusname, ibp->acnum);
2136  total_dropped++;
2137  }
2138  continue;
2139  }
2140 
2141  if (pp->source == Parser::ESource::Flybase && !seq_entries.empty())
2142  fta_get_user_object(*(*seq_entries.begin()), *pEntry);
2143 
2144  // remove out all the features if their seqloc has
2145  // "join" or "order" among other segments, to the annot
2146  // which in class = parts
2147  //
2148  if(ibp->segnum != 0)
2149 // LCOV_EXCL_START
2150 // Excluded per Mark's request on 12/14/2016
2151  CheckFeatSeqLoc(seq_entries);
2152 // LCOV_EXCL_STOP
2153 
2154  fta_find_pub_explore(pp, seq_entries);
2155 
2156  // change qual "citation" on features to SeqFeat.cit
2157  // find citation in the list by serial_number.
2158  // If serial number not found remove /citation
2159  //
2160  ProcessCitations(seq_entries);
2161 
2162  // check for long sequences in each segment
2163  //
2164  if(pp->limit != 0)
2165  {
2166  if(ibp->segnum != 0)
2167  {
2168  int j = segindx;
2169  for(; j <= i; j++)
2170  {
2171  tibp = pp->entrylist[j];
2172  err_install(tibp, pp->accver);
2173  if(tibp->bases <= (size_t) pp->limit)
2174  continue;
2175 
2176  if(tibp->htg == 1 || tibp->htg == 2 || tibp->htg == 4)
2177  {
2178  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence,
2179  "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem",
2180  tibp->locusname, tibp->acnum, pp->limit);
2181  }
2182  else
2183  {
2184  seq_long = true;
2185  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence,
2186  "Sequence %s|%s is longer than limit %ld",
2187  tibp->locusname, tibp->acnum, pp->limit);
2188  }
2189  }
2190  }
2191  else if(ibp->bases > (size_t) pp->limit)
2192  {
2193  if(ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)
2194  {
2195  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence,
2196  "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem",
2197  ibp->locusname, ibp->acnum, pp->limit);
2198  }
2199  else
2200  {
2201  seq_long = true;
2202  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence,
2203  "Sequence %s|%s is longer than limit %ld",
2204  ibp->locusname, ibp->acnum, pp->limit);
2205  }
2206  }
2207  }
2208  if (pp->mode == Parser::EMode::Relaxed) {
2209  for(auto pEntry : seq_entries) {
2210  auto pScope = Ref(new CScope(*CObjectManager::GetInstance()));
2211  g_InstantiateMissingProteins(pScope->AddTopLevelSeqEntry(*pEntry));
2212  }
2213  }
2214  if (pp->convert)
2215  {
2216  if(pp->cleanup <= 1)
2217  {
2218  FinalCleanup(seq_entries);
2219 
2220  if (pp->qamode && !seq_entries.empty())
2221  fta_remove_cleanup_user_object(*seq_entries.front());
2222  }
2223 
2224  MaybeCutGbblockSource(seq_entries);
2225  }
2226 
2227  EntryCheckDivCode(seq_entries, pp);
2228 
2229  if(pp->xml_comp)
2230  fta_set_strandedness(seq_entries);
2231 
2232  if (fta_EntryCheckGBBlock(seq_entries))
2233  {
2234  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty,
2235  "Attention: GBBlock is not empty");
2236  }
2237 
2238  // check for identical features
2239  //
2240  if(pp->qamode)
2241  {
2242  fta_sort_descr(seq_entries);
2243  fta_sort_seqfeat_cit(seq_entries);
2244  }
2245 
2246  if (pp->citat)
2247  {
2248  StripSerialNumbers(seq_entries);
2249  }
2250 
2251  PackEntries(seq_entries);
2252  CheckDupDates(seq_entries);
2253 
2254  if(ibp->segnum != 0) {
2255  int j = segindx;
2256  for(; j <= i; j++)
2257  err_install(pp->entrylist[j], pp->accver);
2258  }
2259  if (seq_long)
2260  {
2261  seq_long = false;
2262  if(ibp->segnum != 0)
2263  total_long += (i - segindx + 1);
2264  else
2265  total_long++;
2266  }
2267  else
2268  {
2269  pp->entries.splice(pp->entries.end(), seq_entries);
2270 
2271  if(ibp->segnum != 0)
2272  total += (i - segindx + 1);
2273  else
2274  total++;
2275  }
2276 
2277  if(ibp->segnum != 0)
2278  {
2279  for(int j = segindx; j <= i; j++)
2280  {
2281  tibp = pp->entrylist[j];
2282  err_install(tibp, pp->accver);
2283  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed,
2284  "OK - entry parsed successfully: \"%s|%s\".",
2285  tibp->locusname, tibp->acnum);
2286  }
2287  }
2288  else
2289  {
2290  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed,
2291  "OK - entry parsed successfully: \"%s|%s\".",
2292  ibp->locusname, ibp->acnum);
2293  }
2294 
2295  seq_entries.clear();
2296  }
2297  else
2298  {
2299  GetSeqExt(pp, locs);
2300 
2301  seq_entries.push_back(ebp->seq_entry);
2302  ebp->seq_entry.Reset();
2303  }
2304 
2305  */} // for, ascii block entries
2306 
2308 
2309  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d (including: LONG ones = %d); SKIPPED = %d.", total, total_long, total_dropped);
2310  // MemFree(dnaconv);
2311  // MemFree(protconv);
2312 
2313  return false;
2314 }
2315 
2316 // LCOV_EXCL_START
2317 // Excluded per Mark's request on 12/14/2016
2318 /**********************************************************
2319  *
2320  * static void SrchFeatSeqLoc(sslbp, sfp):
2321  *
2322  * 9-14-93
2323  *
2324  **********************************************************/
2326 {
2327  for (CSeq_annot::C_Data::TFtable::iterator feat = feat_table.begin(); feat != feat_table.end();) {
2328  if ((*feat)->IsSetLocation() && (*feat)->GetLocation().GetId()) {
2329  ++feat;
2330  continue;
2331  }
2332 
2333  /* SeqLocId will return NULL if any one of seqid in the SeqLocPtr
2334  * is diffenent, so move out cursfp to sslbp
2335  */
2336 
2337  feats.push_back(*feat);
2338  feat = feat_table.erase(feat);
2339  }
2340 }
2341 
2342 /**********************************************************
2343  *
2344  * static void FindFeatSeqLoc(sep, data, index, indent):
2345  *
2346  * 9-14-93
2347  *
2348  **********************************************************/
2349 static void FindFeatSeqLoc(TEntryList& seq_entries, TSeqFeatList& feats)
2350 {
2351  for (auto& entry : seq_entries) {
2352  for (CTypeIterator<CBioseq> bioseq(Begin(*entry)); bioseq; ++bioseq) {
2353  const CSeq_id& first_id = *(*bioseq->GetId().begin());
2354  if (IsSegBioseq(first_id) || ! bioseq->IsSetAnnot())
2355  continue;
2356 
2357  /* process this bioseq entry */
2358  CBioseq::TAnnot annots = bioseq->SetAnnot();
2359  for (CBioseq::TAnnot::iterator annot = annots.begin(); annot != annots.end();) {
2360  if (! (*annot)->IsSetData() || ! (*annot)->GetData().IsFtable()) {
2361  ++annot;
2362  continue;
2363  }
2364 
2365  CSeq_annot::C_Data::TFtable& feat_table = (*annot)->SetData().SetFtable();
2366  SrchFeatSeqLoc(feats, feat_table);
2367 
2368  if (! feat_table.empty()) {
2369  ++annot;
2370  continue;
2371  }
2372 
2373  annot = annots.erase(annot);
2374  }
2375  }
2376  }
2377 }
2378 
2379 /**********************************************************/
2380 static CBioseq_set* GetParts(TEntryList& seq_entries)
2381 {
2382  for (auto& entry : seq_entries) {
2383  for (CTypeIterator<CBioseq_set> bio_set(Begin(*entry)); bio_set; ++bio_set) {
2384  if (bio_set->IsSetClass() && bio_set->GetClass() == CBioseq_set::eClass_parts)
2385  return bio_set.operator->();
2386  }
2387  }
2388 
2389  return nullptr;
2390 }
2391 
2392 /**********************************************************
2393  *
2394  * void CheckFeatSeqLoc(sep):
2395  *
2396  * Remove out all the features which its seqloc has
2397  * "join" or "order" among other segments, then insert
2398  * into the annot which in the level of the class = parts.
2399  *
2400  * 9-14-93
2401  *
2402  **********************************************************/
2403 void CheckFeatSeqLoc(TEntryList& seq_entries)
2404 {
2405  TSeqFeatList feats_no_id;
2406  FindFeatSeqLoc(seq_entries, feats_no_id);
2407 
2408  CBioseq_set* parts = GetParts(seq_entries);
2409 
2410  if (! feats_no_id.empty() && parts) /* may need to delete duplicate
2411  one 9-14-93 */
2412  {
2413  for (auto& annot : parts->SetAnnot()) {
2414  if (! annot->IsFtable())
2415  continue;
2416 
2417  annot->SetData().SetFtable().splice(annot->SetData().SetFtable().end(), feats_no_id);
2418  break;
2419  }
2420 
2421  if (parts->GetAnnot().empty()) {
2422  CRef<CSeq_annot> new_annot(new CSeq_annot);
2423  new_annot->SetData().SetFtable().swap(feats_no_id);
2424  parts->SetAnnot().push_back(new_annot);
2425  }
2426  }
2427 }
2428 
2430 // LCOV_EXCL_STOP
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:208
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:493
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
Definition: add.cpp:2047
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:885
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2770
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:781
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:327
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1090
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2673
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
Definition: add.cpp:1582
bool check_cds(const DataBlk &entry, Parser::EFormat format)
Definition: add.cpp:246
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2795
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2725
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2692
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2864
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:375
void fta_get_dblink_user_object(TSeqdescList &descrs, char *offset, size_t len, Parser::ESource source, bool *drop, CRef< CUser_object > &dbuop)
Definition: add.cpp:1922
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:290
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, char *location, const char *name, bool iscon)
Definition: add.cpp:2293
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
Definition: add.cpp:178
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2555
void GetGenBankSubBlock(const DataBlk &entry, size_t bases)
Definition: asci_blk.cpp:400
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3377
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
Definition: asci_blk.cpp:2692
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3241
char * GetDescrComment(char *offset, size_t len, Int2 col_data, bool is_htg, bool is_pat)
Definition: asci_blk.cpp:1105
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
Definition: asci_blk.cpp:2787
bool IsSegBioseq(const CSeq_id &id)
Definition: asci_blk.cpp:2506
void xGetGenBankSubBlocks(Entry &entry, size_t bases)
Definition: asci_blk.cpp:439
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3213
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3477
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3312
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2917
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1744
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1772
void GetSequenceOfKeywords(const DataBlk &entry, int type, int col_data, TKeywordList &keywords)
Definition: asci_blk.cpp:1505
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2776
char * GetGenBankBlock(DataBlkPtr *chain, char *ptr, Int2 *retkw, char *eptr)
Definition: asci_blk.cpp:230
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2439
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1632
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3084
char * SrchNodeSubType(const DataBlk &entry, Int2 type, Int2 subtype, size_t *len)
Definition: asci_blk.cpp:985
void xGetGenBankBlocks(Entry &entry)
Definition: asci_blk.cpp:202
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3156
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2464
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1274
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2535
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1020
void xFreeEntry(DataBlkPtr entry)
Definition: block.cpp:109
static TDSICONV * conv
Definition: charconv.c:168
void ProcessCitations(TEntryList &seq_entries)
Definition: citation.cpp:307
Definition: Date.hpp:53
void SetToTime(const CTime &time, EPrecision prec=ePrecision_second)
Definition: Date.cpp:57
CScope –.
Definition: scope.hpp:92
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CTime –.
Definition: ncbitime.hpp:296
CUser_field & SetString(const char *value)
Definition: User_field.cpp:445
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
int mType
Definition: ftablock.h:330
EntryPtr LoadEntryGenbank(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:217
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
void g_InstantiateMissingProteins(CSeq_entry_Handle entryHandle)
Definition: fcleanup.cpp:335
void FinalCleanup(TEntryList &seq_entries)
Definition: fcleanup.cpp:377
#define ERR_SEQUENCE_BadData
Definition: flat2err.h:150
#define ERR_TPA_TpaSpansMissing
Definition: flat2err.h:593
#define ERR_ENTRY_LongSequence
Definition: flat2err.h:82
#define ERR_FORMAT_MissingContigFeature
Definition: flat2err.h:43
#define ERR_KEYWORD_ShouldNotBeTPA
Definition: flat2err.h:208
#define ERR_DIVISION_BadTSADivcode
Definition: flat2err.h:261
#define ERR_FORMAT_MissingSequenceData
Definition: flat2err.h:41
#define ERR_DIVISION_InvalidHTCKeyword
Definition: flat2err.h:254
#define ERR_KEYWORD_IllegalForCON
Definition: flat2err.h:210
#define ERR_DIVISION_MissingHTGKeywords
Definition: flat2err.h:249
#define ERR_QSCORE_FailedToParse
Definition: flat2err.h:577
#define ERR_ENTRY_LongHTGSSequence
Definition: flat2err.h:86
#define ERR_KEYWORD_MissingTSA
Definition: flat2err.h:216
#define ERR_DIVISION_BadTPADivcode
Definition: flat2err.h:257
#define ERR_REFERENCE_No_references
Definition: flat2err.h:289
#define ERR_KEYWORD_ShouldNotBeTLS
Definition: flat2err.h:218
#define ERR_ENTRY_GBBlock_not_Empty
Definition: flat2err.h:85
#define ERR_KEYWORD_HTGPlusENV
Definition: flat2err.h:217
#define ERR_DEFINITION_MissingTPA
Definition: flat2err.h:269
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_DEFINITION_MissingTLS
Definition: flat2err.h:273
#define ERR_KEYWORD_ESTSubstring
Definition: flat2err.h:204
#define ERR_KEYWORD_ConflictingKeywords
Definition: flat2err.h:207
#define ERR_DIVISION_ConDivLacksContig
Definition: flat2err.h:252
#define ERR_LOCATION_ContigHasNull
Definition: flat2err.h:397
#define ERR_SEGMENT_OnlyOneMember
Definition: flat2err.h:165
#define ERR_KEYWORD_ENV_NoMatchingQualifier
Definition: flat2err.h:214
#define ERR_KEYWORD_ShouldNotBeTSA
Definition: flat2err.h:215
#define ERR_KEYWORD_STSSubstring
Definition: flat2err.h:205
#define ERR_DIVISION_UnknownDivCode
Definition: flat2err.h:222
#define ERR_KEYWORD_MissingTLS
Definition: flat2err.h:219
#define ERR_DEFINITION_ShouldNotBeTSA
Definition: flat2err.h:270
#define ERR_SEGMENT_Rejected
Definition: flat2err.h:166
#define ERR_DIVISION_MissingHTCKeyword
Definition: flat2err.h:253
#define ERR_DIVISION_MappedtoCON
Definition: flat2err.h:248
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_FORMAT_ContigWithSequenceData
Definition: flat2err.h:42
#define ERR_KEYWORD_NoGeneExpressionKeywords
Definition: flat2err.h:213
#define ERR_DEFINITION_MissingTSA
Definition: flat2err.h:271
#define ERR_DEFINITION_ShouldNotBeTPA
Definition: flat2err.h:268
#define ERR_FORMAT_MissingEnd
Definition: flat2err.h:39
#define ERR_KEYWORD_MissingTPA
Definition: flat2err.h:209
#define ERR_DIVISION_ConDivInSegset
Definition: flat2err.h:251
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_DIVISION_Mismatch
Definition: flat2err.h:226
#define ERR_ORGANISM_NoOrganism
Definition: flat2err.h:184
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_DIVISION_HTCWrongMolType
Definition: flat2err.h:255
#define ERR_KEYWORD_ShouldNotBeCAGE
Definition: flat2err.h:211
#define ERR_DEFINITION_ShouldNotBeTLS
Definition: flat2err.h:272
#define ERR_TSA_UnexpectedPrimaryAccession
Definition: flat2err.h:609
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
std::list< CRef< objects::CSeq_feat > > TSeqFeatList
Definition: ftablock.h:55
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:60
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:61
char * StringSave(const char *s)
Definition: ftacpp.hpp:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:116
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:106
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:96
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:74
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:75
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * MemNew(size_t sz)
Definition: ftacpp.hpp:43
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:78
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:344
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:762
bool GetGenBankInstContig(const DataBlk &entry, CBioseq &bsp, ParserPtr pp)
Definition: gb_ascii.cpp:135
USING_SCOPE(objects)
static CRef< CGB_block > GetGBBlock(ParserPtr pp, const DataBlk &entry, CMolInfo &mol_info, CBioSource *bio_src)
Definition: gb_ascii.cpp:297
static void fta_get_str_user_field(char *line, const Char *tag, CUser_object &user_obj)
Definition: gb_ascii.cpp:940
void CheckFeatSeqLoc(TEntryList &seq_entries)
Definition: gb_ascii.cpp:2403
static CRef< CMolInfo > GetGenBankMolInfo(ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: gb_ascii.cpp:734
static void FindFeatSeqLoc(TEntryList &seq_entries, TSeqFeatList &feats)
Definition: gb_ascii.cpp:2349
static void FakeGenBankBioSources(const DataBlk &entry, CBioseq &bioseq)
Definition: gb_ascii.cpp:781
static void CheckContigEverywhere(IndexblkPtr ibp, Parser::ESource source)
Definition: gb_ascii.cpp:103
bool GenBankAsciiOrig(ParserPtr pp)
Definition: gb_ascii.cpp:1391
static void fta_get_user_object(CSeq_entry &seq_entry, const DataBlk &entry)
Definition: gb_ascii.cpp:988
static char * GetGenBankLineage(char *start, char *end)
Definition: gb_ascii.cpp:256
bool GenBankAscii(ParserPtr pp)
Definition: gb_ascii.cpp:1812
static void GenBankGetDivision(char *division, Int4 div, const DataBlk &entry)
Definition: gb_ascii.cpp:1371
static void fta_get_user_field(char *line, const Char *tag, CUser_object &user_obj)
Definition: gb_ascii.cpp:866
static void SrchFeatSeqLoc(TSeqFeatList &feats, CSeq_annot::C_Data::TFtable &feat_table)
Definition: gb_ascii.cpp:2325
static CBioseq_set * GetParts(TEntryList &seq_entries)
Definition: gb_ascii.cpp:2380
static void fta_get_mga_user_object(TSeqdescList &descrs, char *offset, size_t len)
Definition: gb_ascii.cpp:1043
static char * GBDivOffset(const DataBlk &entry, Int4 div_shift)
Definition: gb_ascii.cpp:97
static bool GetGenBankInst(ParserPtr pp, const DataBlk &entry, unsigned char *dnaconv)
Definition: gb_ascii.cpp:217
static void xGenBankGetDivision(char *division, Int4 div, const string &locusText)
Definition: gb_ascii.cpp:1377
static void GetGenBankDescr(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: gb_ascii.cpp:1091
@ ParFlat_SOURCE
Definition: genbank.h:48
@ ParFlat_DBLINK
Definition: genbank.h:63
@ ParFlat_COMMENT
Definition: genbank.h:50
@ ParFlat_LOCUS
Definition: genbank.h:41
@ ParFlat_PROJECT
Definition: genbank.h:62
@ ParFlat_NCBI_GI
Definition: genbank.h:44
@ ParFlat_USER
Definition: genbank.h:58
@ ParFlat_PRIMARY
Definition: genbank.h:60
@ ParFlat_END
Definition: genbank.h:54
@ ParFlat_ORGANISM
Definition: genbank.h:66
@ ParFlat_KEYWORDS
Definition: genbank.h:46
@ ParFlat_DEFINITION
Definition: genbank.h:42
@ ParFlat_CONTIG
Definition: genbank.h:56
@ ParFlat_MGA
Definition: genbank.h:61
@ ParFlat_ORIGIN
Definition: genbank.h:53
#define ParFlat_COL_DATA
Definition: genbank.h:37
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
Definition: genref.cpp:2981
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eOrigin_synthetic
purely synthetic
Definition: BioSource_.hpp:134
bool IsSetData(void) const
The object itself. Check if a value has been assigned to Data data member.
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_metagenome_source
Definition: OrgMod_.hpp:120
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
const TAnnot & GetAnnot(void) const
Get the Annot member data.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
@ eClass_parts
parts for 2 or 3
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetTech(void)
Reset Tech data member.
Definition: MolInfo_.hpp:484
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
ETopology
topology of molecule
Definition: Seq_inst_.hpp:121
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
EStrand
strandedness in living organism
Definition: Seq_inst_.hpp:133
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_virtual
no seq data
Definition: Seq_inst_.hpp:93
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_other
use Source.techexp
Definition: MolInfo_.hpp:148
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_REF_END
Definition: index.h:60
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:611
int CheckTPG(const string &str)
Definition: indx_blk.cpp:502
int CheckSTRAND(const string &str)
Definition: indx_blk.cpp:467
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:995
Int2 CheckDIV(const char *str)
Definition: indx_blk.cpp:532
int i
int len
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: loadfeat.cpp:5131
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: loadfeat.cpp:4825
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
int isspace(Uchar c)
Definition: ncbictype.hpp:69
std::list< SeqLoc > TSeqLocList
Int mod(Int i, Int j)
Definition: njn_integer.hpp:67
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2691
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data)
Definition: ref.cpp:2445
int offset
Definition: replacements.h:160
static const char * str(char *buf, int n)
Definition: stats.c:84
DataBlkPtr chain
Definition: ftablock.h:344
CRef< objects::CSeq_entry > seq_entry
Definition: ftablock.h:346
Char acnum[200]
Definition: ftablock.h:169
CRef< objects::CPatent_seq_id > psip
Definition: ftablock.h:193
Char division[4]
Definition: ftablock.h:174
bool is_mga
Definition: ftablock.h:202
bool tsa_allowed
Definition: ftablock.h:214
Int4 wgs_and_gi
Definition: ftablock.h:234
Int2 htg
Definition: ftablock.h:199
bool is_tls
Definition: ftablock.h:211
Int2 vernum
Definition: ftablock.h:170
bool is_tpa
Definition: ftablock.h:209
TKeywordList keywords
Definition: ftablock.h:243
bool is_prot
Definition: ftablock.h:225
bool is_wgs
Definition: ftablock.h:208
bool origin
Definition: ftablock.h:204
bool is_contig
Definition: ftablock.h:200
bool STS
Definition: ftablock.h:196
bool is_pat
Definition: ftablock.h:205
bool HTC
Definition: ftablock.h:198
bool drop
Definition: ftablock.h:185
bool experimental
Definition: ftablock.h:250
size_t bases
Definition: ftablock.h:175
bool inferential
Definition: ftablock.h:248
Uint2 segtotal
Definition: ftablock.h:178
bool is_tsa
Definition: ftablock.h:210
bool EST
Definition: ftablock.h:195
size_t len
Definition: ftablock.h:187
GapFeatsPtr gaps
Definition: ftablock.h:217
string wgssec
Definition: ftablock.h:239
size_t offset
Definition: ftablock.h:171
bool specialist_db
Definition: ftablock.h:246
Uint2 segnum
Definition: ftablock.h:176
Char locusname[200]
Definition: ftablock.h:173
bool env_sample_qual
Definition: ftablock.h:222
size_t qslength
Definition: ftablock.h:233
LocusCont lc
Definition: ftablock.h:215
bool GSS
Definition: ftablock.h:197
Int4 molecule
Definition: ftablock.h:108
Int4 strand
Definition: ftablock.h:107
Int4 topology
Definition: ftablock.h:109
Int4 date
Definition: ftablock.h:111
Int4 bp
Definition: ftablock.h:106
Int4 div
Definition: ftablock.h:110
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
char *(* ff_get_qscore)(const char *accession, Int2 v)
TEntryList entries
CScope & GetScope()
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:454
bool GetGenomeInfo(CBioSource &bsp, const Char *bptr)
Definition: utilfeat.cpp:244
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1719
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1748
char * GetBlkDataReplaceNewLine(char *bptr, char *eptr, Int2 start_col_data)
Definition: utilfun.cpp:740
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:903
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1281
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1733
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1422
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1408
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1377
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1437
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:1108
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1340
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1641
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:1139
void fta_remove_mag_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1466
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1708
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1247
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1452
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1165
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1592
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1466
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)
Definition: xutils.cpp:91
Modified on Mon Dec 11 02:40:43 2023 by modify_doxy.py rev. 669887