NCBI C++ ToolKit
em_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: em_ascii.cpp 101097 2023-10-27 17:41:46Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: em_ascii.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Preprocessing embl from blocks in memory to asn.
32  * Build EMBL format entry block.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
42 #include <objects/general/Date.hpp>
43 #include <objects/seq/Seq_inst.hpp>
45 #include <objects/seq/Seq_ext.hpp>
50 #include <objmgr/scope.hpp>
51 #include <objects/seq/MolInfo.hpp>
61 #include <objects/seq/Pubdesc.hpp>
63 
64 #include "index.h"
65 #include "embl.h"
66 
68 #include "ftanet.h"
70 
71 #include "ftaerr.hpp"
72 #include "indx_blk.h"
73 #include "asci_blk.h"
74 #include "utilfun.h"
75 #include "utilref.h"
76 #include "em_ascii.h"
77 #include "add.h"
78 #include "utilfeat.h"
79 #include "loadfeat.h"
80 #include "nucprot.h"
81 #include "fta_qscore.h"
82 #include "citation.h"
83 #include "fcleanup.h"
84 #include "entry.h"
85 #include "ref.h"
86 #include "xgbparint.h"
87 #include "xutils.h"
88 #include "fta_xml.h"
89 #include "keyword_parse.hpp"
90 
91 #ifdef THIS_FILE
92 # undef THIS_FILE
93 #endif
94 #define THIS_FILE "em_ascii.cpp"
95 
96 
99 
100 // clang-format off
101 
102 /* For new style of ID line in EMBL data check the "data class"
103  * field first to figure out division code.
 * The list is closed by a nullptr sentinel.
104  */
105 static const char* ParFlat_Embl_dataclass_array[] = {
106  "ANN", "CON", "PAT", "EST", "GSS", "HTC", "HTG", "STS", "TSA",
107  nullptr
108 };
109 
110 /* order by EMBL-block in asn.all
 *
 * The position of a code in this array is used as the division value:
 * the lookup result is compared with literal indices further down in
 * this file (16 = "UNC", 18 = "HUM", 19 = "HTG") and is also used to
 * index ParFlat_GBDIV_array, so entries must not be reordered.
 * The list is closed by a nullptr sentinel.
111  */
112 static const char* ParFlat_Embl_DIV_array[] = {
113  "FUN", "INV", "MAM", "ORG", "PHG", "PLN", "PRI", "PRO", "ROD",
114  "SYN", "UNA", "VRL", "VRT", "PAT", "EST", "STS", "UNC", "GSS",
115  "HUM", "HTG", "HTC", "CON", "ENV", "MUS", "TGN", "TSA",
116  nullptr
117 };
118 
119 /* Corresponding GenBank "DIV" strings. Must have the same number
120  * of elements as ParFlat_Embl_DIV_array: this table is indexed with
 * the division value obtained from that array, so entry N here is the
 * GenBank division used for EMBL division N.
121  */
122 static const char* ParFlat_GBDIV_array[] = {
123  "PLN", "INV", "MAM", "UNA", "PHG", "PLN", "PRI", "BCT", "ROD",
124  "SYN", "UNA", "VRL", "VRT", "PAT", "EST", "STS", "UNA", "GSS",
125  "PRI", "HTG", "HTC", "CON", "ENV", "ROD", "SYN", "TSA",
126  nullptr
127 };
128 
129 // clang-format on
130 
/* Database names, closed by a nullptr sentinel.
 * NOTE(review): the statement that consumes this table is not visible
 * in this listing; presumably the matched position becomes the numeric
 * database code of a DR line cross-reference - confirm before
 * reordering or inserting entries.
 */
131 static const char* ParFlat_DBname_array[] = {
132  "EMBL",
133  "GENBANK",
134  "DDBJ",
135  "GENINFO",
136  "MEDLINE",
137  "SWISS-PROT",
138  "PIR",
139  "PDB",
140  "EPD",
141  "ECD",
142  "TFD",
143  "FLYBASE",
144  "PROSITE",
145  "ENZYME",
146  "MIM",
147  "ECOSEQ",
148  "HIV",
149  nullptr
150 };
151 
/* Database names accepted by name in DR lines, closed by a nullptr
 * sentinel. GetEmblBlockXref() compares the DR database identifier
 * with these entries case-insensitively and posts
 * ERR_DRXREF_UnknownDBname when none of them matches.
 */
152 static const char* ParFlat_DRname_array[] = {
153  "ARAPORT",
154  "ARRAYEXPRESS",
155  "ASTD",
156  "BEEBASE",
157  "BGD",
158  "BIOGRID",
159  "BIOMUTA",
160  "BIOSAMPLE",
161  "CABRI",
162  "CCDS",
163  "CHEMBL",
164  "CHITARS",
165  "COLLECTF",
166  "DEPOD",
167  "DMDM",
168  "DNASU",
169  "ENA",
170  "ENA-CON",
171  "ENSEMBL",
172  "ENSEMBL-GN",
173  "ENSEMBL-SCAFFOLDS",
174  "ENSEMBL-TR",
175  "ENSEMBLGENOMES",
176  "ENSEMBLGENOMES-GN",
177  "ENSEMBLGENOMES-TR",
178  "ESTHER",
179  "EUROPEPMC",
180  "EVOLUTIONARYTRACE",
181  "EXPRESSIONATLAS",
182  "GENE3D",
183  "GENEDB",
184  "GENEREVIEWS",
185  "GENEVISIBLE",
186  "GENEWIKI",
187  "GENOMERNAI",
188  "GDB",
189  "GOA",
190  "GR",
191  "GRAINGENES",
192  "GUIDETOPHARMACOLOGY",
193  "H-INVDB",
194  "HGNC",
195  "HOMD",
196  "HSSP",
197  "IMAGENES",
198  "IMGT/GENE-DB",
199  "IMGT/HLA",
200  "IMGT/LIGM",
201  "IMGT_GENE-DB",
202  "INTERPRO",
203  "IPD-KIR",
204  "IPTMNET",
205  "KEGG",
206  "KO",
207  "MALACARDS",
208  "MAXQB",
209  "MGI",
210  "MIRBASE",
211  "MOONPROT",
212  "MYCOBANK",
213  "MYCOCLAP",
214  "PATRIC",
215  "PAXDB",
216  "POMBASE",
217  "PR2",
218  "PRO",
219  "PROTEOMES",
220  "RFAM",
221  "RZPD",
222  "SABIO-RK",
223  "SFLD",
224  "SGN",
225  "SIGNALINK",
226  "SIGNALLINK",
227  "SIGNOR",
228  "SILVA-LSU",
229  "SILVA-SSU",
230  "STRAININFO",
231  "SWISSLIPIDS",
232  "SWISSPALM",
233  "TMRNA-WEBSITE",
234  "TOPDOWNPROTEOMICS",
235  "TRANSFAC",
236  "TREEFAM",
237  "UNICARBKB",
238  "UNILIB",
239  "UNIPATHWAY",
240  "UNIPROT/SWISS-PROT",
241  "UNIPROT/TREMBL",
242  "UNIPROTKB/SWISS-PROT",
243  "UNIPROTKB/TREMBL",
244  "UNITE",
245  "VBASE2",
246  "VEGA-TR",
247  "VEGA-GN",
248  "VGNC",
249  "WBPARASITE",
250  "WORMBASE",
251  "ZFIN",
252  nullptr
253 };
254 
255 
256 /**********************************************************
257  *
258  * static void GetEmblDate(source, entry, crdate, update):
259  *
260  * Contain two lines, first created date, second
261  * updated date.
262  * In the direct submission, it may only have one
263  * DT line, if it is, then created date = update date.
264  *
265  * 9-24-93
266  *
267  * Skip XX line between DT line.
268  *
269  * 12-22-93
270  *
 * Both output references are reset first; when the entry
 * has no DT block they are left empty.
 *
 * NOTE(review): this listing skips source lines 285 and
 * 293 (see the numbering gaps below), so the statements
 * that actually parse the dates are not visible here.
 *
271  **********************************************************/
272 static void GetEmblDate(Parser::ESource source, const DataBlk& entry, CRef<CDate_std>& crdate, CRef<CDate_std>& update)
273 {
274  char* offset;
275  char* eptr;
276  size_t len;
277 
278  crdate.Reset();
279  update.Reset();
280  offset = xSrchNodeType(entry, ParFlat_DT, &len);
281  if (! offset)
282  return;
283 
284  eptr = offset + len;
 /* NOTE(review): source line 285 is absent from this listing. crdate
  * is dereferenced after the loop, so it is presumably assigned from
  * the first DT line here - confirm against the repository. */
286  while (offset < eptr) {
 /* move to the end of the current line; stop when there is none */
287  offset = SrchTheChar(offset, eptr, '\n');
288  if (! offset)
289  break;
290 
291  offset++; /* newline */
292  if (StringEquN(offset, "DT", 2)) {
 /* NOTE(review): source line 293 is absent; only the tail of the
  * statement ("source);") survives below. Presumably it stores the
  * date of this DT line in update - confirm. */
294  source);
295  break;
296  }
297  }
 /* no separate update date was set: copy day/month/year of crdate */
298  if (update.Empty()) {
299  update.Reset(new CDate_std);
300  update->SetDay(crdate->GetDay());
301  update->SetMonth(crdate->GetMonth());
302  update->SetYear(crdate->GetYear());
303  }
304 }
305 
306 /**********************************************************/
/* Runs the post-processing passes over seq_entries and, unless
 * seq_long is set, splices them onto the end of pp->entries.
 *
 * seq_long     when true the entries are not kept; a SEV_REJECT
 *              ERR_ENTRY_LongSequence message is posted instead
 * pp           parser state (flags convert, cleanup, qamode, xml_comp,
 *              citat, limit and the current index entry are read)
 * seq_entries  entries to process; always empty on return
 *
 * Returns false only when seq_entries is empty after DealWithGenes();
 * returns true otherwise, including the seq_long case.
 *
 * NOTE(review): this listing skips source lines 312 and 363 (see the
 * numbering gaps below); whatever they do is not visible here.
 */
307 static bool OutputEmblAsn(bool seq_long, ParserPtr pp, TEntryList& seq_entries)
308 {
309  DealWithGenes(seq_entries, pp);
310 
311  if (seq_entries.empty()) {
 /* NOTE(review): source line 312 is absent from this listing */
313  return false;
314  }
315 
316  fta_find_pub_explore(pp, seq_entries);
317 
318  /* change qual "citation" on features to SeqFeat.cit find citation
319  * in the list by serial_number. If serial number not found remove
320  * /citation
321  */
322  ProcessCitations(seq_entries);
323 
324  if (pp->convert) {
325  if (pp->cleanup <= 1) {
326  FinalCleanup(seq_entries);
327 
328  if (pp->qamode && ! seq_entries.empty())
329  fta_remove_cleanup_user_object(*(*seq_entries.begin()));
330  }
331 
332  MaybeCutGbblockSource(seq_entries);
333  }
334 
335  EntryCheckDivCode(seq_entries, pp);
336 
337  if (pp->xml_comp)
338  fta_set_strandedness(seq_entries);
339 
340  if (fta_EntryCheckGBBlock(seq_entries)) {
341  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty, "Attention: GBBlock is not empty");
342  }
343 
344  if (pp->qamode) {
345  fta_sort_descr(seq_entries);
346  fta_sort_seqfeat_cit(seq_entries);
347  }
348 
349  if (pp->citat) {
350  StripSerialNumbers(seq_entries);
351  }
352 
353  PackEntries(seq_entries);
354  CheckDupDates(seq_entries);
355 
 /* too-long sequences are reported and not handed over to pp->entries */
356  if (seq_long) {
357  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", pp->entrylist[pp->curindx]->locusname, pp->entrylist[pp->curindx]->acnum, pp->limit);
358  } else {
359  pp->entries.splice(pp->entries.end(), seq_entries);
360  }
361 
362  seq_entries.clear();
 /* NOTE(review): source line 363 is absent from this listing */
364 
365  return true;
366 }
367 
368 static void SetXrefObjId(CEMBL_xref& xref, const string& str)
369 {
370  if (str.empty())
371  return;
372 
373  CEMBL_xref::TId& ids = xref.SetId();
374 
375  bool found = false;
376  for (const auto& id : ids) {
377  if (id->IsStr() && id->GetStr() == str) {
378  found = true;
379  break;
380  }
381  }
382 
383  if (found)
384  return;
385 
386  CRef<CObject_id> obj_id(new CObject_id);
387  obj_id->SetStr(str);
388 
389  ids.push_back(obj_id);
390 }
391 
392 /**********************************************************
393  *
394  * static void GetEmblBlockXref(entry, xip,
395  * chentry, dr_ena,
396  * dr_biosample,
397  * drop, embl):
398  *
399  * Builds one CEMBL_xref per DR line and, if any were
400  * built, stores the list in embl (SetXref()); nothing is
 * returned.
 *
 * DR data comes from the flat-file entry when xip is
 * null, otherwise from the XML tag value of chentry
 * (that buffer is released with MemFree at the end).
 * "BioSample" ids are collected in dr_biosample and
 * "ENA" ids accepted by fta_if_valid_sra() in dr_ena
 * instead of becoming xrefs; *drop is set to true when
 * such a line is rejected.
 *
 * NOTE(review): this listing skips source lines 443,
 * 484 and 491 (see the numbering gaps below), so the
 * assignment of "code" is not visible here.
401  *
402  **********************************************************/
403 static void GetEmblBlockXref(const DataBlk& entry, XmlIndexPtr xip, const char* chentry, TStringList& dr_ena, TStringList& dr_biosample, bool* drop, CEMBL_block& embl)
404 {
405  const char** b;
406 
407  const char* drline;
408 
409  char* bptr;
410  char* eptr;
411  char* ptr;
412  char* xref;
413  char* p;
414  char* q;
415 
416  bool valid_biosample;
417  bool many_biosample;
418  size_t len;
419 
420  Int2 col_data;
421  Int2 code;
422 
423  CEMBL_block::TXref new_xrefs;
424 
425  if (! xip) {
426  bptr = xSrchNodeType(entry, ParFlat_DR, &len);
427  col_data = ParFlat_COL_DATA_EMBL;
428  xref = nullptr;
429  } else {
430  bptr = XMLFindTagValue(chentry, xip, INSDSEQ_DATABASE_REFERENCE);
431  if (bptr)
432  len = StringLen(bptr);
433  col_data = 0;
434  xref = bptr;
435  }
436 
437  if (! bptr)
438  return;
439 
 /* one iteration per DR line; ptr is left at the start of the next one */
440  for (eptr = bptr + len; bptr < eptr; bptr = ptr) {
441  drline = bptr;
442  bptr += col_data; /* bptr points to database_identifier */
 /* NOTE(review): source line 443 is absent from this listing; "code"
  * is read below without a visible assignment, so it is presumably
  * set on that line - confirm against the repository. */
444 
445  string name;
446  if (code < 0) {
 /* NOTE(review): the SrchTheChar() result is used without a null
  * check here, while other call sites in this file test it for
  * null - confirm a ';' is guaranteed at this point. */
447  ptr = SrchTheChar(bptr, eptr, ';');
448  name.assign(bptr, ptr);
449 
 /* "MD5" lines are skipped entirely: advance to the next "DR" line */
450  if (NStr::EqualNocase(name, "MD5")) {
451  while (ptr < eptr) {
452  if (NStr::Equal(ptr, 0, 2, "DR"))
453  break;
454 
455  ptr = SrchTheChar(ptr, eptr, '\n');
456  if (*ptr == '\n')
457  ptr++;
458  }
459  continue;
460  }
461 
462  for (b = ParFlat_DRname_array; *b; b++) {
463  if (NStr::EqualNocase(name, *b))
464  break;
465  }
466 
 /* unknown names only produce a warning; two legacy UniProt
  * spellings are replaced by their UniProtKB form */
467  if (! *b)
468  ErrPostEx(SEV_WARNING, ERR_DRXREF_UnknownDBname, "Encountered a new/unknown database name in DR line: \"%s\".", name.c_str());
469  else if (NStr::EqualNocase(*b, "UNIPROT/SWISS-PROT")) {
470  name = "UniProtKB/Swiss-Prot";
471  } else if (NStr::EqualNocase(*b, "UNIPROT/TREMBL")) {
472  name = "UniProtKB/TrEMBL";
473  }
474  }
475 
476  bptr = PointToNextToken(bptr); /* bptr points to primary_identifier */
477  p = SrchTheChar(bptr, eptr, '\n');
478  ptr = SrchTheChar(bptr, eptr, ';');
479 
480  string id, id1;
481 
 /* id is the text up to the first ';' on this line */
482  if (ptr && ptr < p) {
483  id.assign(bptr, ptr);
 /* NOTE(review): source line 484 is absent from this listing */
485 
486  bptr = PointToNextToken(ptr); /* points to
487  secondary_identifier */
488  }
489  if (p) {
490  id1.assign(bptr, p);
 /* NOTE(review): source line 491 is absent from this listing */
492  }
493 
 /* a single identifier always ends up in id, leaving id1 empty */
494  if (id.empty()) {
495  id = id1;
496  id1.clear();
497  }
498 
499  if (name == "BioSample" && ! id.empty()) {
500  many_biosample = (! id.empty() && ! id1.empty());
501  valid_biosample = fta_if_valid_biosample(id.c_str(), false);
502  if (! id1.empty() && fta_if_valid_biosample(id1.c_str(), false) == false)
503  valid_biosample = false;
504  if (many_biosample || ! valid_biosample) {
 /* cut the DR line at its newline for the message below;
  * the newline is put back afterwards */
505  q = nullptr;
506  if (! drline)
507  drline = "[Empty]";
508  else {
509  q = StringChr(const_cast<char*>(drline), '\n');
510  if (q)
511  *q = '\0';
512  }
513  if (many_biosample)
514  ErrPostEx(SEV_REJECT, ERR_DRXREF_InvalidBioSample, "Multiple BioSample ids provided in the same DR line: \"%s\".", drline);
515  if (! valid_biosample)
516  ErrPostEx(SEV_REJECT, ERR_DRXREF_InvalidBioSample, "Invalid BioSample id(s) provided in DR line: \"%s\".", drline);
517  *drop = true;
518  if (q)
519  *q = '\n';
520  } else {
 /* keep each BioSample id once; repeats only get a warning */
521  bool found = false;
522  for (const string& val : dr_biosample) {
523  if (val == id) {
524  found = true;
525  break;
526  }
527  }
528 
529  if (found) {
530  ErrPostEx(SEV_WARNING, ERR_DRXREF_DuplicatedBioSamples, "Duplicated BioSample ids found within DR lines contents: \"%s\".", id.c_str());
531  } else {
532  dr_biosample.push_back(id);
533  }
534  }
535  } else if (name == "ENA" && ! id.empty() && fta_if_valid_sra(id.c_str(), false)) {
536  if (! id.empty() && ! id1.empty()) {
 /* same temporary cut of the DR line as in the BioSample branch */
537  q = nullptr;
538  if (! drline)
539  drline = "[Empty]";
540  else {
541  q = StringChr(const_cast<char*>(drline), '\n');
542  if (q)
543  *q = '\0';
544  }
545  ErrPostEx(SEV_REJECT, ERR_DRXREF_InvalidSRA, "Multiple possible SRA ids provided in the same DR line: \"%s\".", drline);
546  *drop = true;
547  if (q)
548  *q = '\n';
549  } else {
 /* keep each SRA id once; repeats only get a warning */
550  bool found = false;
551  for (const string& val : dr_ena) {
552  if (val == id) {
553  found = true;
554  break;
555  }
556  }
557 
558  if (found) {
559  ErrPostEx(SEV_WARNING, ERR_DRXREF_DuplicatedSRA, "Duplicated Sequence Read Archive ids found within DR lines contents: \"%s\".", id.c_str());
560  } else {
561  dr_ena.push_back(id);
562  }
563  }
564  } else {
 /* every other DR line becomes an xref, identified by numeric
  * code when one is available and by name otherwise */
565  CRef<CEMBL_xref> new_xref(new CEMBL_xref);
566 
567  if (code != -1)
568  new_xref->SetDbname().SetCode(static_cast<CEMBL_dbname::ECode>(code));
569  else
570  new_xref->SetDbname().SetName(name);
571 
572  if (! id.empty())
573  SetXrefObjId(*new_xref, id);
574 
575  if (! id1.empty())
576  SetXrefObjId(*new_xref, id1);
577 
578  new_xrefs.push_back(new_xref);
579  }
580 
 /* NOTE(review): p is tested for null above ("if (p)") but used
  * unconditionally here - confirm a trailing newline is guaranteed. */
581  ptr = p + 1;
582 
583  if (xip)
584  continue;
585 
586  /* skip "XX" line
587  */
588  while (ptr < eptr) {
589  if (StringEquN(ptr, "DR", 2))
590  break;
591 
592  ptr = SrchTheChar(ptr, eptr, '\n');
593  if (*ptr == '\n')
594  ptr++;
595  }
596  }
597 
 /* xref is non-null only for the XML path (see the top of the function) */
598  if (xref)
599  MemFree(xref);
600 
601  if (! new_xrefs.empty())
602  embl.SetXref().swap(new_xrefs);
603 }
604 
606 {
 /* NOTE(review): the signature (source line 605) is absent from this
  * listing. Judging by the call in GetReleaseInfo() this is
  * SetTextIdRef(), taking a Seq-id "id" and returning CTextseq_id& -
  * confirm against the repository.
  *
  * Returns the text id held by whichever variant of id is selected;
  * for any variant not listed below the function-static noTextId
  * object is returned instead, so all such callers share one object.
  *
  * NOTE(review): source lines 616 and 632 are also absent; each is
  * presumably the case label for the return statement that follows
  * the gap - confirm.
  */
607  static CTextseq_id noTextId;
608 
609  switch (id.Which()) {
610  case CSeq_id::e_Genbank:
611  return id.SetGenbank();
612  case CSeq_id::e_Embl:
613  return id.SetEmbl();
614  case CSeq_id::e_Pir:
615  return id.SetPir();
 /* NOTE(review): source line 616 is absent from this listing */
617  return id.SetSwissprot();
618  case CSeq_id::e_Other:
619  return id.SetOther();
620  case CSeq_id::e_Ddbj:
621  return id.SetDdbj();
622  case CSeq_id::e_Prf:
623  return id.SetPrf();
624  case CSeq_id::e_Tpg:
625  return id.SetTpg();
626  case CSeq_id::e_Tpe:
627  return id.SetTpe();
628  case CSeq_id::e_Tpd:
629  return id.SetTpd();
630  case CSeq_id::e_Gpipe:
631  return id.SetGpipe();
 /* NOTE(review): source line 632 is absent from this listing */
633  return id.SetNamed_annot_track();
634  default:; // do nothing
635  }
636 
637  return noTextId;
638 }
639 
640 /**********************************************************/
641 static void GetReleaseInfo(const DataBlk& entry)
642 {
643  EntryBlkPtr ebp;
644 
645  char* offset;
646  char* bptr;
647  char* eptr;
648 
649  size_t len;
650 
651  ebp = static_cast<EntryBlk*>(entry.mpData);
652  CBioseq& bioseq = ebp->seq_entry->SetSeq();
653  CTextseq_id& id = SetTextIdRef(*(bioseq.SetId().front()));
654 
655  offset = xSrchNodeType(entry, ParFlat_DT, &len);
656  if (! offset)
657  return;
658 
659  eptr = offset + len;
660  offset = SrchTheChar(offset, eptr, '\n');
661  if (! offset)
662  return;
663 
664  bptr = SrchTheStr(offset, eptr, "Version");
665  if (! bptr)
666  return;
667 
668  bptr = PointToNextToken(bptr); /* bptr points to next token */
669 
670  id.SetVersion(NStr::StringToInt(bptr, NStr::fAllowTrailingSymbols));
671 }
672 
673 /**********************************************************
674  *
675  * static OrgRefPtr GetEmblOrgRef(dbp):
676  *
677  * >= 1 OS per entry.
678  *
 * Joins the data part of every line of the block with
 * single spaces (empty lines and lines starting with
 * "XX" are skipped) and uses the result as taxname.
 * If the taxname contains '(', the text before the
 * first '(' with trailing blanks and tabs removed
 * becomes the common name. An empty reference is
 * returned when no taxname text was collected.
 *
 * NOTE(review): the signature (source line 680) and
 * source line 689 are absent from this listing; the
 * body returns CRef<COrg_ref> and reads dbp->mOffset
 * and dbp->len.
 *
679  **********************************************************/
681 {
682  const char* bptr = dbp->mOffset;
683  const char* eptr = bptr + dbp->len;
684 
685  string sTaxname;
686  vector<string> taxLines;
687  NStr::Split(CTempString(bptr, eptr - bptr), "\n", taxLines);
688  for (auto line : taxLines) {
 /* NOTE(review): source line 689 is absent from this listing */
690  if (line.empty() || NStr::StartsWith(line, "XX")) {
691  continue;
692  }
693  if (! sTaxname.empty()) {
694  sTaxname += ' ';
695  }
 /* drop the leading ParFlat_COL_DATA_EMBL characters of the line.
  * NOTE(review): std::string::substr throws std::out_of_range when
  * the line is shorter than that offset - confirm lines cannot be. */
696  sTaxname += line.substr(ParFlat_COL_DATA_EMBL);
697  }
698 
699  CRef<COrg_ref> org_ref;
700  if (sTaxname.empty()) {
701  return org_ref;
702  }
703 
704  org_ref.Reset(new COrg_ref);
705  org_ref->SetTaxname(sTaxname);
706 
 /* common name = text before the first '(' without trailing " \t(" */
707  auto openP = sTaxname.find('(');
708  if (openP != string::npos) {
709  auto sCommonName = sTaxname.substr(0, openP);
710  auto commonTerm = sCommonName.find_last_not_of(" \t(");
711  if (commonTerm != string::npos) {
712  sCommonName = sCommonName.substr(0, commonTerm + 1);
713  org_ref->SetCommon(sCommonName);
714  }
715  }
716  return org_ref;
717 }
718 
719 /**********************************************************/
/* Checks that the division, the CONTIG flag (ibp->is_contig) and the
 * presence of sequence data (ibp->origin) of an index entry agree.
 * Posts a message for every inconsistent combination and returns
 * false for those that must not be processed, true otherwise.
 *
 * NOTE(review): the signature (source line 720) and source line 738
 * are absent from this listing, so the function name, its parameter
 * list and the condition that selects between the two
 * ERR_FORMAT_ContigWithSequenceData messages are not visible here.
 */
721 {
722  bool condiv = (NStr::CompareNocase(ibp->division, "CON") == 0);
723 
724  bool result = true;
 /* CON division is refused for members of a segmented set */
725  if (condiv && ibp->segnum != 0) {
726  ErrPostEx(SEV_ERROR, ERR_DIVISION_ConDivInSegset, "Use of the CON division is not allowed for members of segmented set : %s|%s. Entry skipped.", ibp->locusname, ibp->acnum);
727  // ibp->drop = true;
728  result = false;
729  }
730 
731  if (! condiv && ibp->is_contig == false && ibp->origin == false) {
732  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingSequenceData, "Required sequence data is absent. Entry dropped.");
733  // ibp->drop = true;
734  result = false;
735  } else if (! condiv && ibp->is_contig && ibp->origin == false) {
736  ErrPostEx(SEV_WARNING, ERR_DIVISION_MappedtoCON, "Division [%s] mapped to CON based on the existence of CONTIG line.", ibp->division);
737  } else if (ibp->is_contig && ibp->origin) {
 /* NOTE(review): source line 738 is absent; it presumably opens the
  * "if" that the "} else {" below belongs to - confirm. */
739  ErrPostEx(SEV_INFO, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data are both present. Ignoring sequence data.");
740  } else {
741  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data may not both be present in a sequence record.");
742  // ibp->drop = true;
743  result = false;
744  }
745  } else if (condiv && ! ibp->is_contig && ! ibp->origin) {
746  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingContigFeature, "No CONTIG data in GenBank format file, entry dropped.");
747  // ibp->drop = true;
748  result = false;
749  } else if (condiv && ! ibp->is_contig && ibp->origin) {
750  ErrPostEx(SEV_WARNING, ERR_DIVISION_ConDivLacksContig, "Division is CON, but CONTIG data have not been found.");
751  }
752  return result;
753 }
754 
755 /**********************************************************/
/* Converts the CO (contig) block of an entry into a delta extension
 * of bioseq.
 *
 * Returns true when the entry has no CO block or after the block was
 * processed; returns false when the block is not longer than
 * ParFlat_COL_DATA_EMBL or when its join() text contains an empty
 * component (",,", "(," or ",)"), which is also reported with
 * ERR_LOCATION_ContigHasNull.
 *
 * Side effects: pp->buf is freed and set to nullptr. If the parsed
 * location is a mix, the delta sequence is filled in and the
 * representation set to delta; otherwise the Seq-ext is reset.
 *
 * NOTE(review): this listing skips source lines 777 and 820 (see the
 * numbering gaps below).
 */
756 bool GetEmblInstContig(const DataBlk& entry, CBioseq& bioseq, ParserPtr pp)
757 {
758  DataBlkPtr dbp;
759 
760  char* p;
761  char* q;
762  char* r;
763  bool locmap;
764 
765  bool allow_crossdb_featloc;
766  int numerr;
767 
768  dbp = TrackNodeType(entry, ParFlat_CO);
769  if (! dbp || ! dbp->mOffset)
770  return true;
771 
772  Int4 i = static_cast<Int4>(dbp->len) - ParFlat_COL_DATA_EMBL;
773  if (i <= 0)
774  return false;
775 
776  p = MemNew(i + 1);
 /* NOTE(review): source line 777 is absent from this listing; the
  * buffer is scanned as text right below, so the CO data is
  * presumably copied into p on that line - confirm. */
778  p[i - 1] = '\0';
 /* turn tabs and newlines into blanks and blank out the "CO" tag
  * that starts each continuation line */
779  for (q = p; *q != '\0'; q++) {
780  if (*q == '\t')
781  *q = ' ';
782  else if (*q == '\n') {
783  *q = ' ';
784  if (q[1] == 'C' && q[2] == 'O' && q[3] == ' ') {
785  q[1] = ' ';
786  q[2] = ' ';
787  }
788  }
789  }
 /* remove every blank in place */
790  for (q = p, r = p; *q != '\0'; q++)
791  if (*q != ' ')
792  *r++ = *q;
793  *r = '\0';
794 
 /* look for an empty component in the comma-separated list */
795  for (q = p; *q != '\0'; q++)
796  if ((q[0] == ',' && q[1] == ',') || (q[0] == '(' && q[1] == ',') ||
797  (q[0] == ',' && q[1] == ')'))
798  break;
799  if (*q != '\0') {
800  ErrPostEx(SEV_REJECT, ERR_LOCATION_ContigHasNull, "The join() statement for this record's contig line contains one or more comma-delimited components which are null.");
801  MemFree(p);
802  return false;
803  }
804 
805  if (pp->buf)
806  MemFree(pp->buf);
807  pp->buf = nullptr;
808 
809  CRef<CSeq_loc> loc = xgbparseint_ver(p, locmap, numerr, bioseq.GetId(), pp->accver);
810 
811  if (loc.NotEmpty() && loc->IsMix()) {
 /* cross-database locations are allowed only while the ids are fixed;
  * the previous setting is saved here and written back below */
812  allow_crossdb_featloc = pp->allow_crossdb_featloc;
813  pp->allow_crossdb_featloc = true;
814 
815  TSeqLocList locs;
816  locs.push_back(loc);
817 
818  i = fta_fix_seq_loc_id(locs, pp, p, nullptr, true);
819  if (i > 999)
 /* NOTE(review): source line 820 is absent from this listing; it is
  * presumably the statement controlled by the "if" above, which
  * would make the restore below unconditional - confirm. */
821  pp->allow_crossdb_featloc = allow_crossdb_featloc;
822 
823  XGappedSeqLocsToDeltaSeqs(loc->GetMix(), bioseq.SetInst().SetExt().SetDelta().Set());
824  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
825  } else
826  bioseq.SetInst().ResetExt();
827 
828  MemFree(p);
829  return true;
830 }
831 
832 /**********************************************************
833  *
834  * bool GetEmblInst(pp, entry, dnaconv):
835  *
836  * Fills in Seq-inst for an entry. Assumes Bioseq
837  * already allocated.
838  *
 * Walks the tokens of the ID line to reach the molecule
 * type, sets the strand when the source is NCBI and the
 * molecule type is a single word, then loads the
 * sequence data and, for contig entries, the CO block.
 * Returns false when GetSeqData() or
 * GetEmblInstContig() fails, true otherwise.
 *
 * NOTE(review): this listing skips source lines 857,
 * 863 and 872 (see the numbering gaps below), so the
 * first assignment of "p" is not visible here.
 *
839  **********************************************************/
840 static bool GetEmblInst(ParserPtr pp, const DataBlk& entry, unsigned char* dnaconv)
841 {
842  EntryBlkPtr ebp;
843  IndexblkPtr ibp;
844 
845  char* p;
846  char* q;
847  char* r;
848 
849  Int4 i;
850  Int2 strand;
851 
852  ebp = static_cast<EntryBlk*>(entry.mpData);
853 
854  CBioseq& bioseq = ebp->seq_entry->SetSeq();
855 
856  CSeq_inst& inst = bioseq.SetInst();
 /* NOTE(review): source line 857 is absent from this listing */
858 
859  ibp = pp->entrylist[pp->curindx];
860 
861  /* p points to 2nd token
862  */
 /* NOTE(review): source line 863 is absent from this listing; it
  * presumably assigns p as described by the comment above - confirm. */
864  p = PointToNextToken(p); /* p points to 3rd token */
865 
 /* the new-style ID line has one more token before the molecule type */
866  if (ibp->embl_new_ID)
867  p = PointToNextToken(p);
868 
869  /* some entries have "circular" before molecule type in embl
870  */
871  if (StringEquNI(p, "circular", 8)) {
 /* NOTE(review): source line 872 is absent from this listing; the
  * topology is compared with eTopology_circular further down, so it
  * is presumably set here - confirm. */
873  p = PointToNextToken(p);
874  } else if (ibp->embl_new_ID)
875  p = PointToNextToken(p);
876 
 /* end the molecule type at the next ';' for now; restored below */
877  r = StringChr(p, ';');
878  if (r)
879  *r = '\0';
880 
 /* i = number of further words that follow the first word of p */
881  for (i = 0, q = p; *q != '\0'; q++) {
882  if (*q != ' ')
883  continue;
884 
885  while (*q == ' ')
886  q++;
887  if (*q != '\0')
888  i++;
889  q--;
890  }
891 
892  if (ibp->embl_new_ID == false && inst.GetTopology() != CSeq_inst::eTopology_circular &&
893  ! StringStr(p, "DNA") && ! StringStr(p, "RNA") &&
894  (pp->source != Parser::ESource::EMBL || (! StringStr(p, "xxx") &&
895  ! StringStr(p, "XXX")))) {
896  ErrPostEx(SEV_WARNING, ERR_LOCUS_WrongTopology, "Other than circular topology found in EMBL, \"%s\", assign default topology", p);
897  }
898 
899  /* the "p" must be the mol-type
900  */
901  if (i == 0 && pp->source == Parser::ESource::NCBI) {
902  /* source = NCBI can be full variety of strands/mol-type
903  */
904  strand = CheckSTRAND(p);
905  if (strand > 0)
906  inst.SetStrand(static_cast<CSeq_inst::EStrand>(strand));
907  }
908 
 /* put back the ';' that was replaced above */
909  if (r)
910  *r = ';';
911 
912  if (! GetSeqData(pp, entry, bioseq, ParFlat_SQ, dnaconv, eSeq_code_type_iupacna))
913  return false;
914 
915  if (ibp->is_contig && ! GetEmblInstContig(entry, bioseq, pp))
916  return false;
917 
918  return true;
919 }
920 
921 /**********************************************************
922  *
923  * static CRef<CEMBL_block> GetDescrEmblBlock(pp, entry, mfp,
924  * gbdiv, biosp,
925  * dr_ena, dr_biosample):
926  *
927  * class is 2nd token of ID line.
928  * div :
929  * - 4th or 5th (if circular) token of ID line;
930  * - but actually Genbank DIV string has to get by
931  * mapping GBDIV_array;
932  * - EST DIV string by searching KW line to map
933  * ParFlat_EST_kw_array;
934  * - PAT DIV string by accession number starting
935  * with "A".
936  * DR line for xref.
937  *
938  **********************************************************/
940  ParserPtr pp, const DataBlk& entry, CMolInfo& mol_info, string& gbdiv, const CBioSource* bio_src, TStringList& dr_ena, TStringList& dr_biosample)
941 {
942  CRef<CEMBL_block> ret, embl(new CEMBL_block);
943 
944  IndexblkPtr ibp;
945  char* bptr;
946  char* kw;
947  char* kwp;
948  Char dataclass[4];
949  Char ch;
950 
951  CEMBL_block::TDiv div;
952  TKeywordList keywords;
953 
954  bool if_cds;
955  bool pat_ref = false;
956  bool est_kwd = false;
957  bool sts_kwd = false;
958  bool gss_kwd = false;
959  bool htc_kwd = false;
960  bool fli_kwd = false;
961  bool wgs_kwd = false;
962  bool tpa_kwd = false;
963  bool tsa_kwd = false;
964  bool tls_kwd = false;
965  bool env_kwd = false;
966  bool mga_kwd = false;
967 
968  bool cancelled;
969  bool drop;
970  char* tempdiv;
971  Int4 i;
972 
973  ibp = pp->entrylist[pp->curindx];
974 
975  /* bptr points to 2nd token
976  */
978 
979  if (ibp->embl_new_ID == false) {
980  if (StringEquNI(bptr, "standard", 8)) {
981  // embl->SetClass(CEMBL_block::eClass_standard);
982  } else if (StringEquNI(bptr, "unannotated", 11)) {
983  embl->SetClass(CEMBL_block::eClass_unannotated);
984  } else if (StringEquNI(bptr, "unreviewed", 10) ||
985  StringEquNI(bptr, "preliminary", 11)) {
986  embl->SetClass(CEMBL_block::eClass_other);
987  } else {
988  embl->SetClass(CEMBL_block::eClass_not_set);
989  }
990 
991  bptr = StringChr(bptr, ';');
992  if (bptr)
993  bptr = StringChr(bptr + 1, ';');
994  } else {
995  bptr = StringChr(bptr, ';');
996  if (bptr)
997  bptr = StringChr(bptr + 1, ';');
998  if (bptr)
999  bptr = StringChr(bptr + 1, ';');
1000  if (bptr) {
1001  while (*bptr == ' ' || *bptr == ';')
1002  bptr++;
1004  if (i < 0)
1005  bptr = StringChr(bptr, ';');
1006  else if (i == 0)
1007  bptr = (char*)"CON";
1008  }
1009  }
1010 
1011  if (bptr) {
1012  while (*bptr == ' ' || *bptr == ';')
1013  bptr++;
1014  StringNCpy(dataclass, bptr, 3);
1015  dataclass[3] = '\0';
1016  if (StringEqu(dataclass, "TSA"))
1017  ibp->is_tsa = true;
1018  } else {
1019  bptr = (char*)" ";
1020  dataclass[0] = '\0';
1021  }
1022 
1023  if_cds = check_cds(entry, pp->format);
1024 
1025  if (ibp->psip.NotEmpty())
1026  pat_ref = true;
1027 
1028  pp->KeywordParser().Cleanup();
1029  keywords = pp->KeywordParser().KeywordList();
1030 
1031  embl->SetKeywords() = keywords;
1032  if (ibp->is_tpa && ! fta_tpa_keywords_check(keywords)) {
1033  return ret;
1034  }
1035 
1036  if (ibp->is_tsa && ! fta_tsa_keywords_check(keywords, pp->source)) {
1037  return ret;
1038  }
1039 
1040  if (ibp->is_tls && ! fta_tls_keywords_check(keywords, pp->source)) {
1041  return ret;
1042  }
1043 
1044  for (const string& key : keywords) {
1045  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
1046  }
1047 
1048  if (ibp->env_sample_qual == false && env_kwd) {
1049  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
1050  return ret;
1051  }
1052 
1053  div = static_cast<CEMBL_block::TDiv>(fta_StringMatch(ParFlat_Embl_DIV_array, bptr));
1054  if (div < 0) {
1055  ch = bptr[3];
1056  bptr[3] = '\0';
1057  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in Embl flatfile. Record rejected.", bptr);
1058  bptr[3] = ch;
1059  return ret;
1060  }
1061 
1062  /* Embl has recently (7-19-93, email) decided to change the name of
1063  * its "UNA"==10 division to "UNC"==16 (for "unclassified")
1064  */
1065  if (div == 16)
1066  div = CEMBL_block::eDiv_una;
1067 
1069 
1070  /* 06-10-96 new HUM division replaces the PRI
1071  * it's temporarily mapped to 'other' in asn.1 embl-block.
1072  * Divisions GSS, HUM, HTG, CON, ENV and MUS are mapped to other.
1073  */
1074  int thtg = (div == 18) ? CEMBL_block::eDiv_pri : div;
1075  gbdiv = ParFlat_GBDIV_array[thtg];
1076 
1077  if (div <= CEMBL_block::eDiv_sts)
1078  embl->SetDiv(div);
1079 
1080  const char* p = gbdiv.c_str();
1081  if (ibp->is_tpa &&
1082  (StringEqu(p, "EST") || StringEqu(p, "GSS") ||
1083  StringEqu(p, "PAT") || StringEqu(p, "HTG"))) {
1084  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p);
1085  return ret;
1086  }
1087 
1088  if (ibp->is_tsa && ! StringEqu(p, "TSA")) {
1089  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p);
1090  return ret;
1091  }
1092 
1093  cancelled = IsCancelled(embl->GetKeywords());
1094 
1095  if (div == 19) /* HTG */
1096  {
1097  if (! HasHtg(embl->GetKeywords())) {
1098  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
1099  return ret;
1100  }
1101  tempdiv = StringSave("HTG");
1102  } else
1103  tempdiv = nullptr;
1104 
1105  fta_check_htg_kwds(embl->SetKeywords(), pp->entrylist[pp->curindx], mol_info);
1106 
1107  DefVsHTGKeywords(mol_info.GetTech(), entry, ParFlat_DE, ParFlat_SQ, cancelled);
1108  if ((mol_info.GetTech() == CMolInfo::eTech_htgs_0 || mol_info.GetTech() == CMolInfo::eTech_htgs_1 ||
1109  mol_info.GetTech() == CMolInfo::eTech_htgs_2) &&
1110  ! gbdiv.empty()) {
1111  gbdiv.clear();
1112  }
1113 
1114  CheckHTGDivision(tempdiv, mol_info.GetTech());
1115  if (tempdiv)
1116  MemFree(tempdiv);
1117 
1118  i = 0;
1119  if (est_kwd)
1120  i++;
1121  if (sts_kwd)
1122  i++;
1123  if (gss_kwd)
1124  i++;
1125  if (ibp->htg > 0)
1126  i++;
1127  if (htc_kwd)
1128  i++;
1129  if (fli_kwd)
1130  i++;
1131  if (wgs_kwd)
1132  i++;
1133  if (env_kwd)
1134  i++;
1135  if (mga_kwd) {
1136  if (ibp->is_mga == false) {
1137  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
1138  return ret;
1139  }
1140  i++;
1141  } else if (ibp->is_mga) {
1142  ErrPostEx(SEV_REJECT, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
1143  return ret;
1144  }
1145  if (tpa_kwd) {
1146  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
1147  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
1148  return ret;
1149  }
1150  i++;
1151  } else if (ibp->is_tpa) {
1152  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
1153  return ret;
1154  }
1155 
1156  if (tsa_kwd) {
1157  if (ibp->is_tsa == false) {
1158  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
1159  return ret;
1160  }
1161  i++;
1162  } else if (ibp->is_tsa) {
1163  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
1164  return ret;
1165  }
1166  if (tls_kwd) {
1167  if (ibp->is_tls == false) {
1168  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
1169  return ret;
1170  }
1171  i++;
1172  } else if (ibp->is_tls) {
1173  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
1174  return ret;
1175  }
1176  if (i > 1) {
1177  if (i == 2 && ibp->htg > 0 && env_kwd)
1178  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
1179  else if ((i == 2 && wgs_kwd && tpa_kwd) ||
1180  (i == 2 && tsa_kwd && tpa_kwd)) {
1181  } else if (i != 2 || env_kwd == false ||
1182  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
1183  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
1184  return ret;
1185  }
1186  }
1187 
1188  if (wgs_kwd)
1189  i--;
1190  if (ibp->is_contig && i > 0 &&
1191  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
1192  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
1193  return ret;
1194  }
1195 
1196  CMolInfo::TTech tech = mol_info.GetTech();
1197  if (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
1198  tech == CMolInfo::eTech_htgs_2 || tech == CMolInfo::eTech_htgs_3) {
1199  RemoveHtgPhase(embl->SetKeywords());
1200  }
1201 
1202  size_t len = 0;
1203  bptr = xSrchNodeType(entry, ParFlat_KW, &len);
1204  if (bptr) {
1206 
1207  kwp = StringStr(kw, "EST");
1208  if (kwp && ! est_kwd) {
1209  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw);
1210  }
1211  kwp = StringStr(kw, "STS");
1212  if (kwp && ! sts_kwd) {
1213  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw);
1214  }
1215  kwp = StringStr(kw, "GSS");
1216  if (kwp && ! gss_kwd) {
1217  ErrPostEx(SEV_WARNING, ERR_KEYWORD_GSSSubstring, "Keyword %s has substring GSS, but no official GSS keywords found", kw);
1218  }
1219  MemFree(kw);
1220  }
1221 
1222  if (! ibp->is_contig) {
1223  drop = false;
1224  CMolInfo::TTech tech = mol_info.GetTech();
1225 
1226  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->bases, pp->source, drop);
1227  if (tech != CMolInfo::eTech_unknown)
1228  mol_info.SetTech(tech);
1229  else
1230  mol_info.ResetTech();
1231 
1232  if (drop) {
1233  return ret;
1234  }
1235  } else if (! gbdiv.empty() && StringEqu(gbdiv.c_str(), "CON")) {
1236  gbdiv.clear();
1237  }
1238 
1239  bool is_htc_div = ! gbdiv.empty() && StringEqu(gbdiv.c_str(), "HTC");
1240  bool has_htc = HasHtc(embl->GetKeywords());
1241 
1242  if (is_htc_div && ! has_htc) {
1243  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
1244  return ret;
1245  }
1246  if (! is_htc_div && has_htc) {
1247  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
1248  return ret;
1249  }
1250 
1251  if (is_htc_div) {
1252  char* p;
1253  p = entry.mOffset + ParFlat_COL_DATA_EMBL; /* p points to 1st token */
1254  p = PointToNextToken(p); /* p points to 2nd token */
1255  p = PointToNextToken(p); /* p points to 3rd token */
1256 
1257  if (ibp->embl_new_ID) {
1258  p = PointToNextToken(p);
1259  p = PointToNextToken(p);
1260  } else if (StringEquNI(p, "circular", 8))
1261  p = PointToNextToken(p); /* p points to 4th token */
1262 
1263  if (StringEquN(p + 1, "s-", 2))
1264  p += 3;
1265  if (*p == 'm' || *p == 'r')
1266  p++;
1267  else if (StringEquN(p, "pre-", 4))
1268  p += 4;
1269  else if (StringEquN(p, "transcribed ", 12))
1270  p += 12;
1271 
1272  if (! StringEquN(p, "RNA", 3)) {
1273  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
1274  return ret;
1275  }
1276  }
1277 
1278  if (fli_kwd)
1280 
1281  /* will be used in flat file database
1282  */
1283  if (! gbdiv.empty()) {
1284  if (StringEqu(gbdiv.c_str(), "EST")) {
1285  ibp->EST = true;
1286  mol_info.SetTech(CMolInfo::eTech_est);
1287  } else if (StringEqu(gbdiv.c_str(), "STS")) {
1288  ibp->STS = true;
1289  mol_info.SetTech(CMolInfo::eTech_sts);
1290  } else if (StringEqu(gbdiv.c_str(), "GSS")) {
1291  ibp->GSS = true;
1292  mol_info.SetTech(CMolInfo::eTech_survey);
1293  } else if (StringEqu(gbdiv.c_str(), "HTC")) {
1294  ibp->HTC = true;
1295  mol_info.SetTech(CMolInfo::eTech_htc);
1296  gbdiv.clear();
1297  } else if (StringEqu(gbdiv.c_str(), "SYN") && bio_src &&
1298  bio_src->IsSetOrigin() && bio_src->GetOrigin() == CBioSource::eOrigin_synthetic) {
1299  gbdiv.clear();
1300  }
1301  } else if (mol_info.IsSetTech()) {
1302  if (mol_info.GetTech() == CMolInfo::eTech_est)
1303  ibp->EST = true;
1304  if (mol_info.GetTech() == CMolInfo::eTech_sts)
1305  ibp->STS = true;
1306  if (mol_info.GetTech() == CMolInfo::eTech_survey)
1307  ibp->GSS = true;
1308  if (mol_info.GetTech() == CMolInfo::eTech_htc)
1309  ibp->HTC = true;
1310  }
1311 
1312  if (mol_info.IsSetTech())
1313  fta_remove_keywords(mol_info.GetTech(), embl->SetKeywords());
1314 
1315  if (ibp->is_tpa)
1316  fta_remove_tpa_keywords(embl->SetKeywords());
1317 
1318  if (ibp->is_tsa)
1319  fta_remove_tsa_keywords(embl->SetKeywords(), pp->source);
1320 
1321  if (ibp->is_tls)
1322  fta_remove_tls_keywords(embl->SetKeywords(), pp->source);
1323 
1324  if (bio_src && bio_src->IsSetSubtype()) {
1325  for (const auto& subtype : bio_src->GetSubtype()) {
1326  if (subtype->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
1327  fta_remove_env_keywords(embl->SetKeywords());
1328  break;
1329  }
1330  }
1331  }
1332 
1333 
1334  CRef<CDate_std> std_creation_date,
1335  std_update_date;
1336 
1337  GetEmblDate(pp->source, entry, std_creation_date, std_update_date);
1338 
1339  embl->SetCreation_date().SetStd(*std_creation_date);
1340  embl->SetUpdate_date().SetStd(*std_update_date);
1341 
1342  ibp->wgssec[0] = '\0';
1343  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, embl->SetExtra_acc());
1344 
1345  GetEmblBlockXref(entry, nullptr, nullptr, dr_ena, dr_biosample, &ibp->drop, *embl);
1346 
1347  if (StringEqu(dataclass, "ANN") || StringEqu(dataclass, "CON")) {
1348  if (StringLen(ibp->acnum) == 8 &&
1349  (StringEquN(ibp->acnum, "CT", 2) ||
1350  StringEquN(ibp->acnum, "CU", 2))) {
1351  bool found = false;
1352  for (const string& acc : embl->SetExtra_acc()) {
1353  if (fta_if_wgs_acc(acc.c_str()) == 0 &&
1354  (acc[0] == 'C' || acc[0] == 'U')) {
1355  found = true;
1356  break;
1357  }
1358  }
1359  if (found)
1360  mol_info.SetTech(CMolInfo::eTech_wgs);
1361  }
1362  }
1363 
1364  return embl;
1365 }
1366 
1367 
1368 static bool s_DuplicatesBiosource(const CBioSource& biosource, const string& gbdiv)
1369 {
1370  return (biosource.IsSetOrg() &&
1371  biosource.GetOrg().IsSetOrgname() &&
1372  biosource.GetOrg().GetOrgname().IsSetDiv() &&
1373  NStr::Equal(biosource.GetOrg().GetOrgname().GetDiv(), gbdiv));
1374 }
1375 
1376 /**********************************************************/
1377 static CRef<CGB_block> GetEmblGBBlock(ParserPtr pp, const DataBlk& entry, const string& gbdiv, CBioSource* bio_src)
1378 {
1379  IndexblkPtr ibp;
1380 
1381  CRef<CGB_block> gbb(new CGB_block);
1382 
1383  ibp = pp->entrylist[pp->curindx];
1384 
1385  if (pp->source == Parser::ESource::NCBI) {
1386  ibp->wgssec[0] = '\0';
1387  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, gbb->SetExtra_accessions());
1388  pp->KeywordParser().Cleanup();
1389  gbb->SetKeywords() = pp->KeywordParser().KeywordList();
1390  }
1391 
1392  if (! gbdiv.empty()) {
1393  if (NStr::EqualNocase(gbdiv.c_str(), "ENV") &&
1394  bio_src && bio_src->IsSetSubtype()) {
1395  const auto& subtype = bio_src->GetSubtype();
1396  const auto it =
1397  find_if(begin(subtype), end(subtype), [](auto pSubSource) {
1398  return pSubSource->GetSubtype() == CSubSource::eSubtype_environmental_sample;
1399  });
1400  if ((it == subtype.end()) && ! s_DuplicatesBiosource(*bio_src, gbdiv)) { // Not found
1401  gbb->SetDiv(gbdiv);
1402  }
1403  } else if (! bio_src ||
1404  ! s_DuplicatesBiosource(*bio_src, gbdiv)) {
1405  gbb->SetDiv(gbdiv);
1406  }
1407  }
1408 
1409  if (! gbb->IsSetExtra_accessions() && ! gbb->IsSetKeywords() && ! gbb->IsSetDiv())
1410  gbb.Reset();
1411 
1412  return gbb;
1413 }
1414 
1415 /**********************************************************
1416  *
1417  * static MolInfoPtr GetEmblMolInfo(entry, pp, orp):
1418  *
1419  * 3rd or 4th token in the ID line.
1420  * OG line.
1421  *
1422  **********************************************************/
1423 static CRef<CMolInfo> GetEmblMolInfo(ParserPtr pp, const DataBlk& entry, const COrg_ref* org_ref)
1424 {
1425  IndexblkPtr ibp;
1426 
1427  char* bptr;
1428  char* p;
1429  char* q;
1430  char* r;
1431  Int4 i;
1432 
1433  ibp = pp->entrylist[pp->curindx];
1434  bptr = entry.mOffset + ParFlat_COL_DATA_EMBL; /* bptr points to 1st
1435  token */
1436  bptr = PointToNextToken(bptr); /* bptr points to 2nd token */
1437  bptr = PointToNextToken(bptr); /* bptr points to 3rd token */
1438 
1439  if (StringEquNI(bptr, "circular", 8) || ibp->embl_new_ID)
1440  bptr = PointToNextToken(bptr); /* bptr points to 4th token */
1441  if (ibp->embl_new_ID)
1442  bptr = PointToNextToken(bptr); /* bptr points to 5th token */
1443 
1444  r = StringChr(bptr, ';');
1445  if (r)
1446  *r = '\0';
1447 
1448  for (i = 0, q = bptr; *q != '\0'; q++) {
1449  if (*q != ' ')
1450  continue;
1451 
1452  while (*q == ' ')
1453  q++;
1454  if (*q != '\0')
1455  i++;
1456  q--;
1457  }
1458 
1459  if (r)
1460  for (p = r + 1; *p == ' ' || *p == ';';)
1461  p++;
1462  else
1463  p = bptr;
1464 
1465  CRef<CMolInfo> mol_info(new CMolInfo);
1466 
1467  if (StringEquN(p, "EST", 3))
1468  mol_info->SetTech(CMolInfo::eTech_est);
1469  else if (ibp->is_wgs) {
1470  if (ibp->is_tsa)
1471  mol_info->SetTech(CMolInfo::eTech_tsa);
1472  else if (ibp->is_tls)
1473  mol_info->SetTech(CMolInfo::eTech_targeted);
1474  else
1475  mol_info->SetTech(CMolInfo::eTech_wgs);
1476  } else if (ibp->is_tsa)
1477  mol_info->SetTech(CMolInfo::eTech_tsa);
1478  else if (ibp->is_tls)
1479  mol_info->SetTech(CMolInfo::eTech_targeted);
1480 
1481  if (i == 0 && CheckSTRAND(bptr) >= 0)
1482  bptr = bptr + 3;
1483 
1484  GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), bptr, pp, entry, org_ref);
1485  if (mol_info->GetBiomol() == CMolInfo::eBiomol_unknown) // not set
1486  mol_info->ResetBiomol();
1487 
1488  if (r)
1489  *r = ';';
1490 
1491  return mol_info;
1492 }
1493 
1494 /**********************************************************/
1496 {
 /* Builds a CUser_field labelled `tag` whose data is the list of strings in
    `lst`, with Num set to the number of items. An empty CRef is returned
    when `tag` is null or `lst` is empty.
    NOTE(review): the signature line (listing line 1495) is absent from this
    extract; callers in this file invoke it as
    fta_create_user_field("BioSample", dr_biosample). */
1497  CRef<CUser_field> field;
1498  if (! tag || lst.empty())
1499  return field;
1500 
1501  field.Reset(new CUser_field);
1502  field->SetLabel().SetStr(tag);
1503  field->SetNum(static_cast<CUser_field::TNum>(lst.size()));
1504 
 /* Copy every item, in list order, into the field's string array. */
1505  for (const string& item : lst) {
1506  field->SetData().SetStrs().push_back(item);
1507  }
1508 
1509  return field;
1510 }
1511 
1512 /**********************************************************/
1514 {
 /* Adds "BioSample" and "Sequence Read Archive" fields, built from
    dr_biosample and dr_ena, to the "DBLink" user object in descrs. A new
    "DBLink" descriptor is created and appended when none exists yet. On
    return dbuop holds the new object, or a copy of the pre-existing one
    after the fields were added. Nothing is done when both lists are empty.
    NOTE(review): the signature line (listing line 1513) is absent from this
    extract; the call in this file passes (descrs, dr_ena, dr_biosample,
    dbuop) in that order. */
1515  bool got = false;
1516 
1517  if (dr_ena.empty() && dr_biosample.empty())
1518  return;
1519 
1520  CUser_object* user_obj_ptr = nullptr;
1521 
 /* Look for an existing user-object descriptor whose type string is "DBLink". */
1522  for (auto& descr : descrs) {
1523  if (! descr->IsUser() || ! descr->GetUser().IsSetType())
1524  continue;
1525 
1526  const CObject_id& obj_id = descr->GetUser().GetType();
1527 
1528  if (obj_id.IsStr() && obj_id.GetStr() == "DBLink") {
1529  user_obj_ptr = &descr->SetUser();
1530  got = true;
1531  break;
1532  }
1533  }
1534 
1535  CRef<CUser_field> field_bs;
1536  if (! dr_biosample.empty())
1537  field_bs = fta_create_user_field("BioSample", dr_biosample);
1538 
1539  CRef<CUser_field> field_ena;
1540  if (! dr_ena.empty()) {
1541  field_ena = fta_create_user_field("Sequence Read Archive", dr_ena);
1542  }
1543 
 /* Neither field could be built: leave descrs and dbuop untouched. */
1544  if (field_bs.Empty() && field_ena.Empty())
1545  return;
1546 
1547  CRef<CUser_object> user_obj;
1548 
1549  if (! got) {
1550  user_obj.Reset(new CUser_object);
1551  user_obj->SetType().SetStr("DBLink");
1552 
1553  user_obj_ptr = user_obj.GetPointer();
1554  }
1555 
 /* user_obj_ptr now refers either to the found object or to the new one. */
1556  if (field_bs.NotEmpty())
1557  user_obj_ptr->SetData().push_back(field_bs);
1558  if (field_ena.NotEmpty())
1559  user_obj_ptr->SetData().push_back(field_ena);
1560 
1561  if (! got) {
1562  CRef<CSeqdesc> descr(new CSeqdesc);
1563  descr->SetUser(*user_obj);
1564  descrs.push_back(descr);
1565  }
1566 
 /* Hand the result back: the new object itself, or a copy of the existing one. */
1567  if (! got)
1568  dbuop = user_obj;
1569  else {
1570  dbuop.Reset(new CUser_object);
1571  dbuop->Assign(*user_obj_ptr);
1572  }
1573 }
1574 
1575 /**********************************************************/
1576 static void fta_create_imgt_misc_feat(CBioseq& bioseq, CEMBL_block& embl_block, IndexblkPtr ibp)
1577 {
1578  if (! embl_block.IsSetXref())
1579  return;
1580 
1581  CSeq_feat::TDbxref xrefs;
1582  for (const auto& xref : embl_block.GetXref()) {
1583  if (! xref->IsSetDbname() || ! xref->GetDbname().IsName() ||
1584  ! StringEquN(xref->GetDbname().GetName().c_str(), "IMGT/", 5))
1585  continue;
1586 
1587  bool empty = true;
1588  for (const auto& id : xref->GetId()) {
1589  if (id->IsStr() && ! id->GetStr().empty()) {
1590  empty = false;
1591  break;
1592  }
1593  }
1594 
1595  if (empty)
1596  continue;
1597 
1598  CRef<CDbtag> tag(new CDbtag);
1599  tag->SetDb(xref->GetDbname().GetName());
1600 
1601  string& id_str = tag->SetTag().SetStr();
1602 
1603  bool need_delimiter = false;
1604  for (const auto& id : xref->GetId()) {
1605  if (id->IsStr() && ! id->GetStr().empty()) {
1606  if (need_delimiter)
1607  id_str += "; ";
1608  else
1609  need_delimiter = true;
1610 
1611  id_str += id->GetStr();
1612  }
1613  }
1614 
1615  xrefs.push_back(tag);
1616  }
1617 
1618  if (xrefs.empty())
1619  return;
1620 
1621  CRef<CSeq_feat> feat(new CSeq_feat);
1622  CImp_feat& imp = feat->SetData().SetImp();
1623  imp.SetKey("misc_feature");
1624  feat->SetDbxref().swap(xrefs);
1625  feat->SetLocation(*fta_get_seqloc_int_whole(*(*bioseq.SetId().begin()), ibp->bases));
1626 
1627  CBioseq::TAnnot& annot = bioseq.SetAnnot();
1628  if (annot.empty() || ! (*annot.begin())->IsFtable()) {
1629  CRef<CSeq_annot> new_annot(new CSeq_annot);
1630  new_annot->SetData().SetFtable().push_back(feat);
1631 
1632  annot.push_back(new_annot);
1633  } else {
1634  CSeq_annot& old_annot = *(*annot.begin());
1635  old_annot.SetData().SetFtable().push_front(feat);
1636  }
1637 }
1638 
1639 static bool s_HasTPAPrefix(const CTempString& line)
1640 {
1641  return NStr::StartsWith(line, "TPA:") ||
1642  NStr::StartsWith(line, "TPA_exp:") ||
1643  NStr::StartsWith(line, "TPA_inf:") ||
1644  NStr::StartsWith(line, "TPA_asm:") ||
1645  NStr::StartsWith(line, "TPA_reasm:") ||
1646  NStr::StartsWith(line, "TPA_specdb:");
1647 }
1648 
1649 /**********************************************************/
1650 static void GetEmblDescr(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq)
1651 {
1652  IndexblkPtr ibp;
1653  DataBlkPtr dbp;
1654 
1655  char* offset;
1656  char* str;
1657  string gbdiv;
1658  char* p;
1659  char* q;
1660 
1661  bool is_htg = false;
1662 
1663  size_t len;
1664 
1665  ibp = pp->entrylist[pp->curindx];
1666 
1667  /* pp->source == NCBI then no embl-block, only GB-block
1668  */
1669 
1670  /* DE data ==> descr_title
1671  */
1672  str = nullptr;
1673  offset = xSrchNodeType(entry, ParFlat_DE, &len);
1674 
1675  string title;
1676 
1677  if (offset) {
1679 
1680  for (p = str, q = p; *q != '\0';) {
1681  *p++ = *q;
1682  if (*q++ == ';')
1683  while (*q == ';')
1684  q++;
1685  }
1686 
1687  *p = '\0';
1688  if (p > str) {
1689  for (p--; *p == ' ' || *p == ';'; p--)
1690  if (p == str)
1691  break;
1692  if (*p != ' ' && *p != ';')
1693  p++;
1694  *p = '\0';
1695  }
1696 
1697  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL &&
1698  StringEquN(str, "TPA:", 4)) {
1699  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA:\" prefix is present on its definition line. Entry dropped.");
1700  ibp->drop = true;
1701  return;
1702  }
1703 
1704  if (ibp->is_tsa == false && StringEquN(str, "TSA:", 4)) {
1705  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA:\" prefix is present on its definition line. Entry dropped.");
1706  ibp->drop = true;
1707  return;
1708  }
1709 
1710  if (ibp->is_tls == false && StringEquN(str, "TLS:", 4)) {
1711  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS:\" prefix is present on its definition line. Entry dropped.");
1712  ibp->drop = true;
1713  return;
1714  }
1715 
1716  if (StringEquN(str, "TPA:", 4)) {
1717  string str1;
1718  if (ibp->assembly)
1719  str1 = "TPA_asm:";
1720  else if (ibp->specialist_db)
1721  str1 = "TPA_specdb:";
1722  else if (ibp->inferential)
1723  str1 = "TPA_inf:";
1724  else if (ibp->experimental)
1725  str1 = "TPA_exp:";
1726 
1727  if (! str1.empty()) {
1728  str1.append(str + 4);
1729  MemFree(str);
1730  str = StringSave(str1.c_str());
1731  }
1732  }
1733 
1734  CRef<CSeqdesc> descr(new CSeqdesc);
1735  descr->SetTitle(str);
1736  bioseq.SetDescr().Set().push_back(descr);
1737 
1738  title = str;
1739  MemFree(str);
1740  str = nullptr;
1741  }
1742 
1743  offset = xSrchNodeType(entry, ParFlat_PR, &len);
1744  if (offset)
1746 
1747  if (ibp->is_tpa &&
1748  (title.empty() || ! s_HasTPAPrefix(title))) {
1749  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA:\" prefix on its definition line. Entry dropped.");
1750  ibp->drop = true;
1751  return;
1752  }
1753 
1754  if (ibp->is_tsa && ! ibp->is_tpa &&
1755  (title.empty() || ! StringEquN(title.c_str(), "TSA:", 4))) {
1756  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA:\" prefix on its definition line. Entry dropped.");
1757  ibp->drop = true;
1758  return;
1759  }
1760 
1761  if (ibp->is_tls && (title.empty() || ! StringEquN(title.c_str(), "TLS:", 4))) {
1762  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS:\" prefix on its definition line. Entry dropped.");
1763  ibp->drop = true;
1764  return;
1765  }
1766 
1767  /* RN data ==> pub should be before GBblock because we need patent ref
1768  */
1769  dbp = TrackNodeType(entry, ParFlat_REF_END);
1770  for (; dbp; dbp = dbp->mpNext) {
1771  if (dbp->mType != ParFlat_REF_END)
1772  continue;
1773 
1774  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA_EMBL);
1775  if (pubdesc.NotEmpty()) {
1776  CRef<CSeqdesc> descr(new CSeqdesc);
1777  descr->SetPub(*pubdesc);
1778  bioseq.SetDescr().Set().push_back(descr);
1779  }
1780  }
1781 
1782  dbp = TrackNodeType(entry, ParFlat_REF_NO_TARGET);
1783  for (; dbp; dbp = dbp->mpNext) {
1784  if (dbp->mType != ParFlat_REF_NO_TARGET)
1785  continue;
1786 
1787  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA_EMBL);
1788  if (pubdesc.NotEmpty()) {
1789  CRef<CSeqdesc> descr(new CSeqdesc);
1790  descr->SetPub(*pubdesc);
1791  bioseq.SetDescr().Set().push_back(descr);
1792  }
1793  }
1794 
1795  /* OS data ==> descr_org
1796  */
1797  CBioSource* bio_src = nullptr;
1798  COrg_ref* org_ref = nullptr;
1799 
1800  for (auto& descr : bioseq.SetDescr().Set()) {
1801  if (descr->IsSource()) {
1802  bio_src = &(descr->SetSource());
1803  if (bio_src->IsSetOrg())
1804  org_ref = &bio_src->SetOrg();
1805  break;
1806  }
1807  }
1808 
1809  /* MolInfo, 3rd or 4th token in the ID line
1810  */
1811  CRef<CMolInfo> mol_info = GetEmblMolInfo(pp, entry, org_ref);
1812 
1813  TStringList dr_ena,
1814  dr_biosample;
1815 
1816  CRef<CEMBL_block> embl_block =
1817  GetDescrEmblBlock(pp, entry, *mol_info, gbdiv, bio_src, dr_ena, dr_biosample);
1818 
1819  if (pp->source == Parser::ESource::EMBL && embl_block.NotEmpty())
1820  fta_create_imgt_misc_feat(bioseq, *embl_block, ibp);
1821 
1822  if ((pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL) &&
1823  ibp->is_contig && ! mol_info->IsSetTech()) {
1824  CMolInfo::TTech tech = fta_check_con_for_wgs(bioseq);
1825  if (tech == CMolInfo::eTech_unknown)
1826  mol_info->ResetTech();
1827  else
1828  mol_info->SetTech(tech);
1829  }
1830 
1831  if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
1832  CRef<CSeqdesc> descr(new CSeqdesc);
1833  descr->SetMolinfo(*mol_info);
1834  bioseq.SetDescr().Set().push_back(descr);
1835 
1836  if (mol_info->IsSetTech() && (mol_info->GetTech() == CMolInfo::eTech_htgs_0 || mol_info->GetTech() == CMolInfo::eTech_htgs_1 ||
1837  mol_info->GetTech() == CMolInfo::eTech_htgs_2))
1838  is_htg = true;
1839  } else {
1840  mol_info.Reset();
1841  }
1842 
1843  CRef<CUser_object> dbuop;
1844  if (! dr_ena.empty() || ! dr_biosample.empty())
1845  fta_build_ena_user_object(bioseq.SetDescr().Set(), dr_ena, dr_biosample, dbuop);
1846 
1847  if (embl_block.Empty()) {
1848  ibp->drop = true;
1849  return;
1850  }
1851 
1852  if (StringEquNI(ibp->division, "CON", 3))
1853  fta_add_hist(pp, bioseq, embl_block->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, true, ibp->acnum);
1854  else
1855  fta_add_hist(pp, bioseq, embl_block->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, false, ibp->acnum);
1856 
1857  if (embl_block->GetExtra_acc().empty())
1858  embl_block->ResetExtra_acc();
1859 
1860  CRef<CGB_block> gbb;
1861 
1862  if (pp->source == Parser::ESource::NCBI || (! embl_block->IsSetDiv() && ! gbdiv.empty())) {
1863  gbb = GetEmblGBBlock(pp, entry, gbdiv, bio_src); /* GB-block */
1864  }
1865 
1866  gbdiv.clear();
1867 
1868  bool hasEmblBlock = false;
1869  if (pp->source != Parser::ESource::NCBI) {
1870  CRef<CSeqdesc> descr(new CSeqdesc);
1871  descr->SetEmbl(*embl_block);
1872  bioseq.SetDescr().Set().push_back(descr);
1873  hasEmblBlock = true;
1874  }
1875 
1876  offset = xSrchNodeType(entry, ParFlat_AH, &len);
1877  if (! offset && ibp->is_tpa && ibp->is_wgs == false) {
1878  if (ibp->inferential || ibp->experimental) {
1879  if (! fta_dblink_has_sra(dbuop)) {
1880  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA:%s record lacks both AH/PRIMARY linetype and Sequence Read Archive links. Entry dropped.", (ibp->inferential == false) ? "experimental" : "inferential");
1881  ibp->drop = true;
1882  return;
1883  }
1884  } else if (ibp->specialist_db == false) {
1885  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA record lacks required AH/PRIMARY linetype. Entry dropped.");
1886  ibp->drop = true;
1887  return;
1888  }
1889  }
1890 
1891  if (offset && len > 0 &&
1892  fta_parse_tpa_tsa_block(bioseq, offset, ibp->acnum, ibp->vernum, len, ParFlat_COL_DATA_EMBL, ibp->is_tpa) == false) {
1893  ibp->drop = true;
1894  return;
1895  }
1896 
1897  /* GB-block and div
1898  */
1899  if (pp->taxserver == 1) {
1900  if (hasEmblBlock && embl_block->IsSetDiv() && embl_block->GetDiv() < 15) {
1901  if (org_ref && org_ref->IsSetOrgname() && ! org_ref->GetOrgname().IsSetDiv() &&
1902  (! org_ref->IsSetDb() || ! fta_orgref_has_taxid(org_ref->GetDb()))) {
1903  org_ref->SetOrgname().SetDiv(ParFlat_GBDIV_array[embl_block->GetDiv()]);
1904  }
1905 
1906  if (bioseq.IsSetAnnot()) {
1907  for (auto& pAnnot : bioseq.SetAnnot()) {
1908  if (pAnnot->IsFtable()) {
1909  for (auto& pFeat : pAnnot->SetData().SetFtable()) {
1910  if (pFeat->IsSetData() && pFeat->SetData().IsBiosrc()) {
1911  auto& biosrc = pFeat->SetData().SetBiosrc();
1912  if (biosrc.IsSetOrg() &&
1913  (! biosrc.GetOrg().IsSetDb() ||
1914  ! fta_orgref_has_taxid(biosrc.GetOrg().GetDb()))) {
1915  biosrc.SetOrg().SetOrgname().SetDiv(ParFlat_GBDIV_array[embl_block->GetDiv()]);
1916  }
1917  }
1918  }
1919  }
1920  }
1921  }
1922  } else if (gbb && gbb->IsSetDiv()) {
1923  fta_fix_orgref_div(bioseq.GetAnnot(), org_ref, *gbb);
1924  }
1925  }
1926 
1927  if (gbb) {
1928  CRef<CSeqdesc> descr(new CSeqdesc);
1929  descr->SetGenbank(*gbb);
1930  bioseq.SetDescr().Set().push_back(descr);
1931  }
1932 
1933  /* all CC data ==> comment
1934  */
1935  offset = xSrchNodeType(entry, ParFlat_CC, &len);
1936  if (offset && len > 0) {
1937  str = GetDescrComment(offset, len, ParFlat_COL_DATA_EMBL, (pp->xml_comp ? false : is_htg), ibp->is_pat);
1938  if (str) {
1939  bool bad = false;
1940  TUserObjVector user_objs;
1941 
1942  fta_parse_structured_comment(str, bad, user_objs);
1943  if (bad) {
1944  ibp->drop = true;
1945  MemFree(str);
1946  return;
1947  }
1948 
1949  for (auto& user_obj : user_objs) {
1950  CRef<CSeqdesc> descr(new CSeqdesc);
1951  descr->SetUser(*user_obj);
1952  bioseq.SetDescr().Set().push_back(descr);
1953  }
1954 
1955  if (pp->xml_comp) {
1956  for (q = str, p = q; *p != '\0';) {
1957  if (*p == ';' && (p[1] == ' ' || p[1] == '~'))
1958  *p = ' ';
1959  if (*p == '~' || *p == ' ') {
1960  *q++ = ' ';
1961  for (p++; *p == ' ' || *p == '~';)
1962  p++;
1963  } else
1964  *q++ = *p++;
1965  }
1966  *q = '\0';
1967  }
1968 
1969  if (str[0] != 0) {
1970  CRef<CSeqdesc> descr(new CSeqdesc);
1971  descr->SetComment(str);
1972  bioseq.SetDescr().Set().push_back(descr);
1973  }
1974  MemFree(str);
1975  }
1976  }
1977 
1978  if (pp->no_date)
1979  return;
1980 
1981  /* DT data ==> create-date, update-date
1982  */
1983 
1984  CRef<CDate_std> std_creation_date,
1985  std_update_date;
1986  GetEmblDate(pp->source, entry, std_creation_date, std_update_date);
1987  if (std_creation_date.NotEmpty()) {
1988  CRef<CSeqdesc> descr(new CSeqdesc);
1989  descr->SetCreate_date().SetStd(*std_creation_date);
1990  bioseq.SetDescr().Set().push_back(descr);
1991  }
1992 
1993  if (std_update_date.NotEmpty()) {
1994  CRef<CSeqdesc> descr(new CSeqdesc);
1995  descr->SetUpdate_date().SetStd(*std_update_date);
1996  bioseq.SetDescr().Set().push_back(descr);
1997 
1998  if (std_creation_date.NotEmpty() && std_creation_date->Compare(*std_update_date) == CDate::eCompare_after) {
1999  string crdate_str, update_str;
2000  std_creation_date->GetDate(&crdate_str, "%2M-%2D-%4Y");
2001  std_update_date->GetDate(&crdate_str, "%2M-%2D-%4Y");
2002  ErrPostEx(SEV_ERROR, ERR_DATE_IllegalDate, "Update-date \"%s\" precedes create-date \"%s\".", update_str.c_str(), crdate_str.c_str());
2003  }
2004  }
2005 }
2006 
2007 /**********************************************************/
2008 static void FakeEmblBioSources(const DataBlk& entry, CBioseq& bioseq)
2009 {
2010  DataBlkPtr dbp;
2011  DataBlkPtr subdbp;
2012 
2013  char* p;
2014  char* q;
2015  Char ch;
2016 
2017  dbp = TrackNodeType(entry, ParFlat_OS);
2018  if (! dbp) {
2019  ErrPostStr(SEV_WARNING, ERR_ORGANISM_NoOrganism, "No Organism data in Embl format file");
2020  return;
2021  }
2022 
2023  for (; dbp; dbp = dbp->mpNext) {
2024  if (dbp->mType != ParFlat_OS)
2025  continue;
2026 
2027  CRef<COrg_ref> org_ref = GetEmblOrgRef(dbp);
2028  if (org_ref.Empty())
2029  continue;
2030 
2031  CRef<CBioSource> bio_src(new CBioSource);
2032  bio_src->SetOrg(*org_ref);
2033 
2034  string& taxname_str = org_ref->SetTaxname();
2035  size_t off_pos = 0;
2036  if (GetGenomeInfo(*bio_src, taxname_str.c_str()) && bio_src->GetGenome() != 9) /* ! Plasmid */
2037  {
2038  while (taxname_str[off_pos] != ' ' && off_pos < taxname_str.size())
2039  ++off_pos;
2040  while (taxname_str[off_pos] == ' ' && off_pos < taxname_str.size())
2041  ++off_pos;
2042  }
2043 
2044  taxname_str = taxname_str.substr(off_pos);
2045  if (taxname_str == "Unknown.") {
2046  taxname_str = taxname_str.substr(0, taxname_str.size() - 1);
2047  }
2048 
2049  subdbp = static_cast<DataBlk*>(dbp->mpData);
2050  for (; subdbp; subdbp = subdbp->mpNext) {
2051  if (subdbp->mType == ParFlat_OG) {
2052  GetGenomeInfo(*bio_src, subdbp->mOffset + ParFlat_COL_DATA_EMBL);
2053  continue;
2054  }
2055  if (subdbp->mType != ParFlat_OC || ! subdbp->mOffset ||
2056  subdbp->len < ParFlat_COL_DATA_EMBL)
2057  continue;
2058 
2059  ch = subdbp->mOffset[subdbp->len];
2060  subdbp->mOffset[subdbp->len] = '\0';
2061  q = StringSave(subdbp->mOffset + ParFlat_COL_DATA_EMBL);
2062  subdbp->mOffset[subdbp->len] = ch;
2063  for (p = q; p;) {
2064  p = StringStr(p, "\nOC ");
2065  if (p)
2066  fta_StringCpy(p, p + 5);
2067  }
2068  for (p = q; *p != '\0';)
2069  p++;
2070  if (p == q) {
2071  MemFree(q);
2072  continue;
2073  }
2074  for (p--;; p--) {
2075  if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '.' &&
2076  *p != ';') {
2077  p++;
2078  break;
2079  }
2080  if (p == q)
2081  break;
2082  }
2083  if (p == q) {
2084  MemFree(q);
2085  continue;
2086  }
2087  *p = '\0';
2088 
2089  if (! org_ref->IsSetOrgname()) {
2090  org_ref->SetOrgname().SetLineage(q);
2091  }
2092  MemFree(q);
2093  }
2094 
2095  CRef<CSeqdesc> descr(new CSeqdesc);
2096  descr->SetSource(*bio_src);
2097  bioseq.SetDescr().Set().push_front(descr);
2098  }
2099 }
2100 
2101 /**********************************************************/
2102 static void EmblGetDivision(IndexblkPtr ibp, const DataBlk& entry)
2103 {
2104  const char* p;
2105  const char* q;
2106 
2107  p = StringChr(entry.mOffset, ';');
2108  if (! p)
2109  p = entry.mOffset;
2110  else {
2111  q = StringChr(p + 1, ';');
2112  if (q)
2113  p = q;
2114  }
2115  while (*p == ' ' || *p == ';')
2116  p++;
2117 
2118  StringNCpy(ibp->division, p, 3);
2119  ibp->division[3] = '\0';
2120 }
2121 
2122 /**********************************************************/
/* Fills ibp->division (three characters plus NUL) from the entry text of a
   new-style ID line.
   NOTE(review): listing line 2135 is absent from this extract; without it
   `i` is never negative at the test below, so that line presumably recomputes
   `i` from the text at `p` - confirm against the repository. */
2123 static void EmblGetDivisionNewID(IndexblkPtr ibp, const DataBlk& entry)
2124 {
2125  const char* p;
2126  Int4 i;
2127 
 /* Advance until four "; " separators have been seen or the text ends. */
2128  for (i = 0, p = entry.mOffset; *p != '\0' && i < 4; p++)
2129  if (*p == ';' && p[1] == ' ')
2130  i++;
2131 
2132  while (*p == ' ')
2133  p++;
2134 
 /* i < 0: move to the text after the next ';' (p becomes null when there
    is none); i == 0: use the fixed code "CON". */
2136  if (i < 0) {
2137  p = StringChr(p, ';');
2138  if (p)
2139  for (p++; *p == ' ';)
2140  p++;
2141  } else if (i == 0)
2142  p = "CON";
2143 
 /* No ';' was found above: fall back to a blank division. */
2144  if (! p)
2145  p = " ";
2146 
2147  StringNCpy(ibp->division, p, 3);
2148  ibp->division[3] = '\0';
2149 }
2150 
2151 /**********************************************************
2152  *
2153  * bool EmblAscii(pp):
2154  *
2155  * Processes each indexed entry in pp->entrylist, posting a
      *   parsed/skipped message per entry and a final summary.
      *   Returns false if LoadEntry() yields no entry block; in
      *   the code visible here every other path returns true,
      *   even when individual entries were dropped.
2156  *
2157  **********************************************************/
     // NOTE(review): listing line 2158 (the function signature) is absent from
     // this rendering; the cross-reference index further down this page gives it
     // as `bool EmblAscii(ParserPtr pp)`.
2159 {
2160  Int2 curkw;
2161  Int4 i;
2162  Int4 imax;
2163  Int4 total = 0;
2164  char* ptr;
2165  char* eptr;
2166 
2167  // DataBlkPtr entry;
2168  EntryBlkPtr ebp;
2169  TEntryList seq_entries;
2170  CSeq_loc locs;
2171 
2172  bool reject_set;
2173  bool seq_long = false;
2174  IndexblkPtr ibp;
2175 
2176  auto dnaconv = GetDNAConv(); /* set up sequence alphabets */
2177 
      // One pass per indexed entry; pp->curindx always names the entry in flight.
2178  for (imax = pp->indx, i = 0; i < imax; i++) {
2179  pp->curindx = i;
2180  ibp = pp->entrylist[i];
2181 
2182  err_install(ibp, pp->accver);
      // Entries already flagged as dropped skip all parsing and go straight
      // to the per-entry status report at the bottom of the loop body.
2183  if (! ibp->drop) {
      // pEntry owns the loaded block with xFreeEntry as its deleter, so each
      // `continue` / `return` below releases it when pEntry goes out of scope.
2184  unique_ptr<DataBlk, decltype(&xFreeEntry)> pEntry(
2185  LoadEntry(pp, ibp->offset, ibp->len), &xFreeEntry);
2186  // pEntry.reset(LoadEntry(pp, ibp->offset, ibp->len));
2187  if (! pEntry) {
      // NOTE(review): listing line 2188 is missing from this rendering; one
      // statement preceding the return is not visible here.
2189  return false;
2190  }
2191  ebp = static_cast<EntryBlk*>(pEntry->mpData);
2192  ptr = pEntry->mOffset; /* points to beginning of the
2193  memory line */
2194  eptr = ptr + pEntry->len;
2195  curkw = ParFlat_ID;
2196 
2197  // TODO: below is a potentially infinite cycle!!!!
      // The only exit condition is curkw reaching ParFlatEM_END; this loop does
      // not itself compare ptr against eptr (that check happens afterwards).
2198  while (curkw != ParFlatEM_END) {
2199  /* ptr points to current keyword's memory line */
2200  ptr = GetEmblBlock(&ebp->chain, ptr, &curkw, pp->format, eptr);
2201  }
2202 
2203  if (ibp->embl_new_ID)
2204  EmblGetDivisionNewID(ibp, *pEntry);
2205  else
2206  EmblGetDivision(ibp, *pEntry);
2207 
      // A TSA division code marks the entry as TSA even when the warning fires.
2208  if (StringEqu(ibp->division, "TSA")) {
2209  if (ibp->tsa_allowed == false)
2210  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
2211  ibp->is_tsa = true;
2212  }
2213 
      // Every `continue` from here on bypasses the per-entry status report at the
      // end of the loop body, which is why each failure path posts its own
      // "Entry skipped" message.
      // NOTE(review): unlike the later failure paths, this one posts the message
      // without testing pp->segment; in segment mode the set-level reporting after
      // the loop may then repeat it - confirm whether that is intended.
2214  if (! CheckEmblContigEverywhere(ibp, pp->source)) {
2215  // if (ibp->drop) {
2216  ibp->drop = true;
2217  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2218  continue;
2219  }
2220 
      // After the keyword scan, ptr at or beyond eptr is reported as a missing
      // end-of-entry marker.
2221  if (ptr >= eptr) {
2222  ibp->drop = true;
2223  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry, entry dropped");
2224  if (pp->segment == false) {
2225  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2226  }
2227  continue;
2228  }
2229  GetEmblSubBlock(ibp->bases, pp->source, *pEntry);
2230 
      // Build the Bioseq, wrap it in a fresh Seq-entry held by the entry block,
      // and register it with the scope before filling it in.
2231  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
2232  AddNIDSeqId(*bioseq, *pEntry, ParFlat_NI, ParFlat_COL_DATA_EMBL, pp->source);
2233 
2234  ebp->seq_entry.Reset(new CSeq_entry);
2235  ebp->seq_entry->SetSeq(*bioseq);
2236  GetScope().AddBioseq(*bioseq);
2237 
2238  if (! pp->accver) {
2239  GetReleaseInfo(*pEntry);
2240  }
2241  if (! GetEmblInst(pp, *pEntry, dnaconv.get())) {
2242  ibp->drop = true;
2243  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data, entry dropped");
2244  if (pp->segment == false) {
2245  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2246  }
2247  continue;
2248  }
2249 
2250  FakeEmblBioSources(*pEntry, *bioseq);
2251  LoadFeat(pp, *pEntry, *bioseq);
2252 
      // Skip only when the drop flag is set AND no annotation was attached; a
      // dropped entry that does have annotations continues to GetEmblDescr and is
      // caught by the plain drop check right after it.
2253  if (! bioseq->IsSetAnnot() && ibp->drop) {
2254  if (pp->segment == false) {
2255  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2256  }
2257  continue;
2258  }
2259 
2260  GetEmblDescr(pp, *pEntry, *bioseq);
2261 
2262  if (ibp->drop) {
2263  if (pp->segment == false) {
2264  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2265  }
2266  continue;
2267  }
2268 
2269  fta_set_molinfo_completeness(*bioseq, ibp);
2270 
2271  if (ibp->is_tsa)
2272  fta_tsa_tls_comment_dblink_check(*bioseq, true);
2273 
2274  if (ibp->is_tls)
2275  fta_tsa_tls_comment_dblink_check(*bioseq, false);
2276 
      // Delta conversion applies to nucleic-acid sequences only:
      //  - raw representation with gap features  -> GapsToDelta
      //  - raw, no gaps, htg 1/2/4 or DDBJ patent -> SeqToDelta
      //  - non-raw representation with gaps       -> AssemblyGapsToDelta
2277  if (bioseq->GetInst().IsNa()) {
2278  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
2279  if (ibp->gaps)
2280  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
2281  else if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
2282  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
2283  SeqToDelta(*bioseq, ibp->htg);
2284  } else if (ibp->gaps)
2285  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
2286  }
2287 
      // Quality scores are fetched only when none are loaded yet and accver is on.
      // The qsfd check is a separate `if`, so a score read from file is assigned
      // last and replaces whatever either callback returned.
2288  if (pEntry->mpQscore.empty() && pp->accver) {
2289  if (pp->ff_get_qscore)
2290  pEntry->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
2291  else if (pp->ff_get_qscore_pp)
2292  pEntry->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
2293  if (pp->qsfd && ibp->qslength > 0)
2294  pEntry->mpQscore = GetQSFromFile(pp->qsfd, ibp);
2295  }
2296 
      // A QScore parse failure drops the entry unless pp->ign_bad_qs is set, in
      // which case it is only reported.
2297  if (! QscoreToSeqAnnot(pEntry->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, false)) {
2298  if (pp->ign_bad_qs == false) {
2299  ibp->drop = true;
2300  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore. Entry dropped.");
2301  if (pp->segment == false) {
2302  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2303  }
2304  continue;
2305  }
2306  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore.");
2307  }
2308 
2309  pEntry->mpQscore.clear();
2310 
2311  /* add PatentSeqId if patent is found in reference
2312  */
2313  if (ibp->psip.NotEmpty()) {
2314  CRef<CSeq_id> id(new CSeq_id);
2315  id->SetPatent(*ibp->psip);
2316  bioseq->SetId().push_back(id);
2317  ibp->psip.Reset();
2318  }
2319 
      // Entries without references are dropped unless the parser runs in debug mode.
2320  if (no_reference(*bioseq) && pp->debug == false) {
2321  ibp->drop = true;
2322  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No reference for the entry, entry dropped");
2323  if (pp->segment == false) {
2324  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2325  }
2326  continue;
2327  }
2328 
      // Hand the finished Seq-entry to the output list and release the entry
      // block's own reference to it.
2329  seq_entries.push_back(ebp->seq_entry);
2330  ebp->seq_entry.Reset();
2331 
      // Non-segmented input is written out entry by entry; segmented input only
      // accumulates here and is written as one set after the loop.
2332  if (pp->segment == false) {
      // Over-limit sequences with htg 1/2/4 only draw a warning; any other
      // over-limit sequence sets seq_long for the OutputEmblAsn call.
      // NOTE(review): the message uses %ld for pp->limit - confirm its declared
      // type matches.
2333  if (pp->limit != 0 && ibp->bases > (size_t)pp->limit) {
2334  if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2) {
2335  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
2336  } else
2337  seq_long = true;
2338  }
2339 
2340  if (! OutputEmblAsn(seq_long, pp, seq_entries))
2341  ibp->drop = true;
2342  else if (! ibp->drop)
2343  total++;
2344  seq_long = false;
2345  } else {
2346  GetSeqExt(pp, locs);
2347  }
2348  GetScope().ResetHistory();
2349  } /* if, not drop */
2350 
      // Per-entry status for non-segmented input; reached only by entries that
      // did not leave the iteration early through `continue`.
2351  if (pp->segment == false) {
2352  if (! ibp->drop) {
2353  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
2354  } else {
2355  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2356  }
2357  }
2358  } /* while, ascii block entries */
2359 
2360  if (pp->segment) {
2361  /* reject the whole set if any one entry was rejected
2362  */
2363  for (reject_set = false, i = 0; i < imax; i++) {
2364  if (pp->entrylist[i]->drop) {
2365  reject_set = true;
2366  break;
2367  }
2368  }
      // With a length limit configured, first look for any over-limit member
      // that is not htg 1/2/4 (sets seq_long); if there is none, warn about each
      // over-limit htg 1/2/4 member instead.
2369  if (pp->limit != 0 && ! reject_set) {
2370  for (seq_long = false, i = 0; i < imax; i++) {
2371  ibp = pp->entrylist[i];
2372  if (ibp->bases > (size_t)pp->limit && ibp->htg != 1 &&
2373  ibp->htg != 2 && ibp->htg != 4) {
2374  seq_long = true;
2375  break;
2376  }
2377  }
2378  if (! seq_long) {
2379  for (i = 0; i < imax; i++) {
2380  ibp = pp->entrylist[i];
2381  if (ibp->bases > (size_t)pp->limit &&
2382  (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)) {
2383  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
2384  }
2385  }
2386  }
2387  }
      // Build the segmented-set header and write the set; an output failure
      // rejects the set just like a dropped member does.
2388  if (! reject_set) {
2389  // LCOV_EXCL_START
2390  // Excluded per Mark's request on 12/14/2016
2391  BuildBioSegHeader(pp, seq_entries, locs);
2392  // LCOV_EXCL_STOP
2393 
2394  if (! OutputEmblAsn(seq_long, pp, seq_entries))
2395  reject_set = true;
2396  }
      // Segmented sets succeed or fail as a whole: either every member is
      // reported as parsed (total = imax) or every member is reported as skipped.
2397  if (! reject_set) {
2398  for (i = 0; i < imax; i++) {
2399  ibp = pp->entrylist[i];
2400  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
2401  }
2402  total = imax;
2403  } else {
2404  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set.");
2405  for (i = 0; i < imax; i++) {
2406  ibp = pp->entrylist[i];
2407  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2408  }
2409  }
2410  }
2411 
      // NOTE(review): listing line 2412 is missing from this rendering; one
      // statement before the summary message is not visible here.
2413 
      // Final summary: skipped count is derived as imax - total.
2414  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d; SKIPPED = %d.", total, imax - total);
2415  return true;
2416 }
2417 
2418 /**********************************************************/
2419 const char* GetEmblDiv(Uint1 num)
2420 {
2421  if (num > 15)
2422  return nullptr;
2423  return ParFlat_Embl_DIV_array[num];
2424 }
2425 
2426 /**********************************************************/
     // Builds the EMBL-block for the XML entry currently selected by pp->curindx.
     //
     //  pp            parser state; the entry's index block is
     //                pp->entrylist[pp->curindx].
     //  entry         raw entry text, read through the XML index ibp->xip.
     //  mol_info      in/out: its tech value is read and may be set or reset.
     //  gbdiv         out: division name taken from ParFlat_GBDIV_array; cleared
     //                again on several paths below.
     //  bio_src       may be null; only its origin is inspected (SYN division).
     //  dr_ena,
     //  dr_biosample  forwarded to GetEmblBlockXref.
     //
     // Returns the populated block. Every rejection path returns `ret`, which is
     // never assigned and therefore stays an empty CRef.
2427 CRef<CEMBL_block> XMLGetEMBLBlock(ParserPtr pp, const char* entry, CMolInfo& mol_info, string& gbdiv, CBioSource* bio_src, TStringList& dr_ena, TStringList& dr_biosample)
2428 {
2429  CRef<CEMBL_block> embl(new CEMBL_block),
2430  ret;
2431 
2432  IndexblkPtr ibp;
2433  char* bptr;
2434  char* kw;
2435  char* kwp;
2436 
2437  CEMBL_block::EDiv div;
2438 
2439  bool pat_ref = false;
2440  bool est_kwd = false;
2441  bool sts_kwd = false;
2442  bool gss_kwd = false;
2443  bool htc_kwd = false;
2444  bool fli_kwd = false;
2445  bool wgs_kwd = false;
2446  bool tpa_kwd = false;
2447  bool env_kwd = false;
2448  bool mga_kwd = false;
2449  bool tsa_kwd = false;
2450  bool tls_kwd = false;
2451  bool cancelled;
2452 
2453  char* tempdiv;
2454  char* r;
2455  Int4 i;
2456  Char dataclass[4];
2457 
2458  ibp = pp->entrylist[pp->curindx];
2459 
2460  bool if_cds = XMLCheckCDS(entry, ibp->xip);
2461 
2462  if (ibp->psip.NotEmpty())
2463  pat_ref = true;
2464 
      // Prefer keywords already collected on the index block (moved out by swap);
      // otherwise read them from the XML entry.
2465  if (! ibp->keywords.empty()) {
2466  embl->SetKeywords().swap(ibp->keywords);
2467  ibp->keywords.clear();
2468  } else
2469  XMLGetKeywords(entry, ibp->xip, embl->SetKeywords());
2470 
      // Each keyword may raise one or more of the special-keyword flags.
2471  for (const string& key : embl->GetKeywords()) {
2472  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
2473  }
2474 
2475  if (ibp->env_sample_qual == false && env_kwd) {
2476  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
2477  return ret;
2478  }
2479 
      // NOTE(review): bptr is handed to fta_StringMatch before the null check
      // below, and bptr[3] is written without checking that the value holds at
      // least three characters - confirm both are safe for the values
      // XMLFindTagValue can return. The truncation at index 3 is what makes the
      // StringCpy into the 4-byte dataclass buffer fit.
2480  bptr = XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION);
2481  div = static_cast<CEMBL_block::TDiv>(fta_StringMatch(ParFlat_Embl_DIV_array, bptr));
2482  dataclass[0] = '\0';
2483  if (bptr) {
2484  bptr[3] = '\0';
2485  StringCpy(dataclass, bptr);
2486  }
      // NOTE(review): bptr may still be null when it is passed for "%s" here.
2487  if (div < 0) {
2488  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in Embl flatfile. Record rejected.", bptr);
2489  if (bptr)
2490  MemFree(bptr);
2491  return ret;
2492  }
2493 
2494  if (bptr)
2495  MemFree(bptr);
2496 
2497  /* Embl has recently (7-19-93, email) decided to change the name of
2498  * its "UNA"==10 division to "UNC"==16 (for "unclassified")
2499  */
2500  if (div == 16)
2501  div = CEMBL_block::eDiv_una;
2502 
      // NOTE(review): listing line 2503 is missing from this rendering; one
      // statement at this position is not visible here.
2504 
2505  /* 06-10-96 new HUM division replaces the PRI
2506  * it's temporarily mapped to 'other' in asn.1 embl-block.
2507  * Divisions GSS, HUM, HTG, CON, ENV and MUS are mapped to other.
2508  */
      // Index 18 is looked up as PRI in the GenBank division table; every other
      // index is used as is.
2509  int thtg = (div == 18) ? CEMBL_block::eDiv_pri : div;
2510  gbdiv = ParFlat_GBDIV_array[thtg];
2511 
      // Only division codes up to eDiv_sts are stored in the block itself.
2512  if (div <= CEMBL_block::eDiv_sts)
2513  embl->SetDiv(div);
2514 
2515  const char* p = gbdiv.c_str();
2516  if (ibp->is_tpa &&
2517  (StringEqu(p, "EST") || StringEqu(p, "GSS") ||
2518  StringEqu(p, "PAT") || StringEqu(p, "HTG"))) {
2519  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p);
2520  return ret;
2521  }
2522 
2523  if (ibp->is_tsa && ! StringEqu(p, "TSA")) {
2524  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p);
2525  return ret;
2526  }
2527 
2528  cancelled = IsCancelled(embl->GetKeywords());
2529 
      // tempdiv is a heap copy of "HTG" (or null) whose only consumer is the
      // CheckHTGDivision call below; it is freed right after that call. The early
      // return inside this branch happens before the copy is made.
2530  if (div == 19) /* HTG */
2531  {
2532  if (! HasHtg(embl->GetKeywords())) {
2533  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
2534  return ret;
2535  }
2536  tempdiv = StringSave("HTG");
2537  } else
2538  tempdiv = nullptr;
2539 
2540  fta_check_htg_kwds(embl->SetKeywords(), ibp, mol_info);
2541 
2542  XMLDefVsHTGKeywords(mol_info.GetTech(), entry, ibp->xip, cancelled);
      // HTGS phase 0/1/2 records carry no GenBank division.
2543  if ((mol_info.GetTech() == CMolInfo::eTech_htgs_0 || mol_info.GetTech() == CMolInfo::eTech_htgs_1 ||
2544  mol_info.GetTech() == CMolInfo::eTech_htgs_2) &&
2545  ! gbdiv.empty()) {
2546  gbdiv.clear();
2547  }
2548 
2549  CheckHTGDivision(tempdiv, mol_info.GetTech());
2550  if (tempdiv)
2551  MemFree(tempdiv);
2552 
      // From here `i` counts how many special-keyword categories apply to the
      // record; the MGA/TPA/TSA/TLS categories are also cross-checked against the
      // matching index-block flags and reject the record on disagreement.
2553  i = 0;
2554  if (est_kwd)
2555  i++;
2556  if (sts_kwd)
2557  i++;
2558  if (gss_kwd)
2559  i++;
2560  if (ibp->htg > 0)
2561  i++;
2562  if (htc_kwd)
2563  i++;
2564  if (fli_kwd)
2565  i++;
2566  if (wgs_kwd)
2567  i++;
2568  if (env_kwd)
2569  i++;
2570  if (mga_kwd) {
2571  if (ibp->is_mga == false) {
2572  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
2573  return ret;
2574  }
2575  i++;
2576  } else if (ibp->is_mga) {
2577  ErrPostEx(SEV_REJECT, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
2578  return ret;
2579  }
2580 
      // TPA keywords on a non-TPA record are tolerated when the source is EMBL.
2581  if (tpa_kwd) {
2582  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
2583  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
2584  return ret;
2585  }
2586  i++;
2587  } else if (ibp->is_tpa) {
2588  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
2589  return ret;
2590  }
2591 
2592  if (tsa_kwd) {
2593  if (ibp->is_tsa == false) {
2594  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
2595  return ret;
2596  }
2597  i++;
2598  } else if (ibp->is_tsa) {
      // NOTE(review): the message below names "TPA" although this branch is about
      // a missing TSA keyword - looks like a copy/paste slip in the runtime text;
      // left untouched here because it is program output.
2599  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TPA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
2600  return ret;
2601  }
2602 
2603  if (tls_kwd) {
2604  if (ibp->is_tls == false) {
2605  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
2606  return ret;
2607  }
2608  i++;
2609  } else if (ibp->is_tls) {
2610  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
2611  return ret;
2612  }
2613 
      // More than one category is tolerated in exactly two shapes:
      //  - HTG + ENV (two categories): accepted with a warning;
      //  - ENV + one of EST / GSS / WGS (two categories): accepted silently.
      // Any other combination rejects the record.
2614  if (i > 1) {
2615  if (i == 2 && ibp->htg > 0 && env_kwd)
2616  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
2617  else if (i != 2 || env_kwd == false ||
2618  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
2619  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
2620  return ret;
2621  }
2622  }
2623 
      // NOTE(review): this decrement appears to have no effect - the CON check
      // right below already requires wgs_kwd == false, and `i` is not read again
      // afterwards in this function.
2624  if (wgs_kwd)
2625  i--;
2626  if (ibp->is_contig && i > 0 &&
2627  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
2628  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
2629  return ret;
2630  }
2631 
      // Once the tech already states an HTGS phase, the phase keywords are
      // removed from the keyword list.
2632  CMolInfo::TTech tech = mol_info.GetTech();
2633  if (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2634  tech == CMolInfo::eTech_htgs_2 || tech == CMolInfo::eTech_htgs_3) {
2635  RemoveHtgPhase(embl->SetKeywords());
2636  }
2637 
      // Warn when the raw keyword text merely contains EST / STS / GSS as a
      // substring while no official keyword of that kind was recognised.
2638  kw = XMLConcatSubTags(entry, ibp->xip, INSDSEQ_KEYWORDS, ';');
2639  if (kw) {
2640  kwp = StringStr(kw, "EST");
2641  if (kwp && est_kwd == false) {
2642  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw);
2643  }
2644  kwp = StringStr(kw, "STS");
2645  if (kwp && sts_kwd == false) {
2646  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw);
2647  }
2648  kwp = StringStr(kw, "GSS");
2649  if (kwp && gss_kwd == false) {
2650  ErrPostEx(SEV_WARNING, ERR_KEYWORD_GSSSubstring, "Keyword %s has substring GSS, but no official GSS keywords found", kw);
2651  }
2652  MemFree(kw);
2653  }
      // Non-contig records go through check_div, which may change gbdiv and the
      // tech and may request a drop; for contig records a "CON" division is
      // simply cleared.
2654  if (! ibp->is_contig) {
2655  bool drop = false;
      // This inner `tech` shadows the one declared a few lines above.
2656  CMolInfo::TTech tech = mol_info.GetTech();
2657 
2658  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->bases, pp->source, drop);
2659  if (tech != CMolInfo::eTech_unknown)
2660  mol_info.SetTech(tech);
2661  else
2662  mol_info.ResetTech();
2663 
2664  if (drop) {
2665  return ret;
2666  }
2667  } else if (! gbdiv.empty() && StringEqu(gbdiv.c_str(), "CON")) {
2668  gbdiv.clear();
2669  }
2670 
      // The HTC division and the HTC keyword must appear together or not at all.
2671  bool is_htc_div = ! gbdiv.empty() && StringEqu(gbdiv.c_str(), "HTC");
2672  bool has_htc = HasHtc(embl->GetKeywords());
2673 
2674  if (is_htc_div && ! has_htc) {
2675  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
2676  return ret;
2677  }
2678  if (! is_htc_div && has_htc) {
2679  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
2680  return ret;
2681  }
2682 
      // For HTC records the molecule type, when present, must read "RNA" after
      // skipping one leading 'm' or 'r', a "pre-" prefix, or a "transcribed "
      // prefix. `r` is freed on both the reject and the accept path.
2683  if (is_htc_div) {
2684  r = XMLFindTagValue(entry, ibp->xip, INSDSEQ_MOLTYPE);
2685  if (r) {
2686  p = r;
2687  if (*r == 'm' || *r == 'r')
2688  p = r + 1;
2689  else if (StringEquN(r, "pre-", 4))
2690  p = r + 4;
2691  else if (StringEquN(r, "transcribed ", 12))
2692  p = r + 12;
2693 
2694  if (! StringEquN(p, "RNA", 3)) {
2695  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
2696  MemFree(r);
2697  return ret;
2698  }
2699  MemFree(r);
2700  }
2701  }
2702 
2703  if (fli_kwd)
      // NOTE(review): listing line 2704 (the statement controlled by the `if`
      // above) is missing from this rendering. As rendered, the `if` would
      // instead govern the division block that follows.
2705 
2706  /* will be used in flat file database
2707  */
      // Mirror the division into the index-block flags and the tech; HTC and
      // synthetic-origin SYN additionally clear gbdiv. With no division left, the
      // flags are derived from an already-set tech instead.
2708  if (! gbdiv.empty()) {
2709  if (StringEqu(gbdiv.c_str(), "EST")) {
2710  ibp->EST = true;
2711  mol_info.SetTech(CMolInfo::eTech_est);
2712  } else if (StringEqu(gbdiv.c_str(), "STS")) {
2713  ibp->STS = true;
2714  mol_info.SetTech(CMolInfo::eTech_sts);
2715  } else if (StringEqu(gbdiv.c_str(), "GSS")) {
2716  ibp->GSS = true;
2717  mol_info.SetTech(CMolInfo::eTech_survey);
2718  } else if (StringEqu(gbdiv.c_str(), "HTC")) {
2719  ibp->HTC = true;
2720  mol_info.SetTech(CMolInfo::eTech_htc);
2721  gbdiv.clear();
2722  } else if (StringEqu(gbdiv.c_str(), "SYN") && bio_src &&
2723  bio_src->IsSetOrigin() && bio_src->GetOrigin() == CBioSource::eOrigin_synthetic) {
2724  gbdiv.clear();
2725  }
2726  } else if (mol_info.IsSetTech()) {
2727  if (mol_info.GetTech() == CMolInfo::eTech_est)
2728  ibp->EST = true;
2729  if (mol_info.GetTech() == CMolInfo::eTech_sts)
2730  ibp->STS = true;
2731  if (mol_info.GetTech() == CMolInfo::eTech_survey)
2732  ibp->GSS = true;
2733  if (mol_info.GetTech() == CMolInfo::eTech_htc)
2734  ibp->HTC = true;
2735  }
2736 
      // Strip the keywords made redundant by the tech and by the TPA/TSA/TLS flags.
2737  if (mol_info.IsSetTech())
2738  fta_remove_keywords(mol_info.GetTech(), embl->SetKeywords());
2739 
2740  if (ibp->is_tpa)
2741  fta_remove_tpa_keywords(embl->SetKeywords());
2742 
2743  if (ibp->is_tsa)
2744  fta_remove_tsa_keywords(embl->SetKeywords(), pp->source);
2745 
2746  if (ibp->is_tls)
2747  fta_remove_tls_keywords(embl->SetKeywords(), pp->source);
2748 
2749  ibp->wgssec[0] = '\0';
2750  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, embl->SetExtra_acc());
2751 
2752 
      // Creation and update dates; each `p` below is a new local that shadows
      // the outer `const char* p` and is freed inside its own `if`.
2753  CRef<CDate_std> std_creation_date, std_update_date;
2754  if (char* p = XMLFindTagValue(entry, ibp->xip, INSDSEQ_CREATE_DATE)) {
2755  std_creation_date = GetUpdateDate(p, pp->source);
2756  embl->SetCreation_date().SetStd(*std_creation_date);
2757  MemFree(p);
2758  }
2759  if (char* p = XMLFindTagValue(entry, ibp->xip, INSDSEQ_UPDATE_DATE)) {
2760  std_update_date = GetUpdateDate(p, pp->source);
2761  embl->SetUpdate_date().SetStd(*std_update_date);
2762  MemFree(p);
2763  }
2764 
      // Without an update date, the creation date doubles as the update date.
2765  if (std_update_date.Empty() && std_creation_date.NotEmpty())
2766  embl->SetUpdate_date().SetStd(*std_creation_date);
2767 
      // The XML path passes a default-constructed DataBlk together with the XML
      // index and the raw entry text.
2768  GetEmblBlockXref(DataBlk(), ibp->xip, entry, dr_ena, dr_biosample, &ibp->drop, *embl);
2769 
      // Data class ANN or CON with an 8-character "CT" primary accession: the tech
      // becomes WGS when some extra accession starts with 'C' or 'U' and
      // fta_if_wgs_acc returns 0 for it.
2770  if (StringEqu(dataclass, "ANN") || StringEqu(dataclass, "CON")) {
2771  if (StringLen(ibp->acnum) == 8 && StringEquN(ibp->acnum, "CT", 2)) {
2772  bool found = false;
2773  for (const string& acc : embl->SetExtra_acc()) {
2774  if (fta_if_wgs_acc(acc.c_str()) == 0 &&
2775  (acc[0] == 'C' || acc[0] == 'U')) {
2776  found = true;
2777  break;
2778  }
2779  }
2780  if (found)
2781  mol_info.SetTech(CMolInfo::eTech_wgs);
2782  }
2783  }
2784 
2785  return embl;
2786 }
2787 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:208
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:493
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
Definition: add.cpp:2047
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:885
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2770
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:781
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:327
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1090
bool fta_if_valid_biosample(const Char *id, bool dblink)
Definition: add.cpp:1729
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2673
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
Definition: add.cpp:1582
bool check_cds(const DataBlk &entry, Parser::EFormat format)
Definition: add.cpp:246
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2795
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2725
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2692
bool fta_if_valid_sra(const Char *id, bool dblink)
Definition: add.cpp:1708
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2864
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
Definition: add.cpp:1425
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:375
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:290
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, char *location, const char *name, bool iscon)
Definition: add.cpp:2293
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2555
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3377
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
Definition: asci_blk.cpp:2692
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3241
char * GetDescrComment(char *offset, size_t len, Int2 col_data, bool is_htg, bool is_pat)
Definition: asci_blk.cpp:1105
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
Definition: asci_blk.cpp:2787
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3213
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3477
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3312
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2917
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1744
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3281
bool fta_orgref_has_taxid(const COrg_ref::TDb &dbtags)
Definition: asci_blk.cpp:3229
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2776
void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk &entry)
Definition: asci_blk.cpp:686
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
Definition: asci_blk.cpp:491
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2439
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1632
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3084
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3156
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2860
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2464
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1274
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2535
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1020
void xFreeEntry(DataBlkPtr entry)
Definition: block.cpp:109
list< string > TStringList
Definition: cgictx.cpp:719
void ProcessCitations(TEntryList &seq_entries)
Definition: citation.cpp:307
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
Definition: Date_std.cpp:91
void GetDate(string *label, const string &format) const
Append a custom string representation of the date to the label.
Definition: Date_std.cpp:159
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Definition: Dbtag.hpp:53
CEMBL_block –.
Definition: EMBL_block.hpp:66
CEMBL_xref –.
Definition: EMBL_xref.hpp:66
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
const list< string > KeywordList() const
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
int mType
Definition: ftablock.h:330
USING_SCOPE(objects)
static bool GetEmblInst(ParserPtr pp, const DataBlk &entry, unsigned char *dnaconv)
Definition: em_ascii.cpp:840
static bool CheckEmblContigEverywhere(const IndexblkPtr ibp, Parser::ESource source)
Definition: em_ascii.cpp:720
static const char * ParFlat_DRname_array[]
Definition: em_ascii.cpp:152
static bool OutputEmblAsn(bool seq_long, ParserPtr pp, TEntryList &seq_entries)
Definition: em_ascii.cpp:307
static const char * ParFlat_Embl_DIV_array[]
Definition: em_ascii.cpp:112
static void GetEmblDescr(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: em_ascii.cpp:1650
const char * GetEmblDiv(Uint1 num)
Definition: em_ascii.cpp:2419
static CRef< CUser_field > fta_create_user_field(const char *tag, TStringList &lst)
Definition: em_ascii.cpp:1495
static void fta_create_imgt_misc_feat(CBioseq &bioseq, CEMBL_block &embl_block, IndexblkPtr ibp)
Definition: em_ascii.cpp:1576
static bool s_DuplicatesBiosource(const CBioSource &biosource, const string &gbdiv)
Definition: em_ascii.cpp:1368
static CTextseq_id & SetTextIdRef(CSeq_id &id)
Definition: em_ascii.cpp:605
static void EmblGetDivisionNewID(IndexblkPtr ibp, const DataBlk &entry)
Definition: em_ascii.cpp:2123
static const char * ParFlat_DBname_array[]
Definition: em_ascii.cpp:131
bool GetEmblInstContig(const DataBlk &entry, CBioseq &bioseq, ParserPtr pp)
Definition: em_ascii.cpp:756
static void EmblGetDivision(IndexblkPtr ibp, const DataBlk &entry)
Definition: em_ascii.cpp:2102
void fta_build_ena_user_object(CSeq_descr::Tdata &descrs, TStringList &dr_ena, TStringList &dr_biosample, CRef< CUser_object > &dbuop)
Definition: em_ascii.cpp:1513
static CRef< CMolInfo > GetEmblMolInfo(ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: em_ascii.cpp:1423
static CRef< CGB_block > GetEmblGBBlock(ParserPtr pp, const DataBlk &entry, const string &gbdiv, CBioSource *bio_src)
Definition: em_ascii.cpp:1377
static const char * ParFlat_Embl_dataclass_array[]
Definition: em_ascii.cpp:105
static CRef< CEMBL_block > GetDescrEmblBlock(ParserPtr pp, const DataBlk &entry, CMolInfo &mol_info, string &gbdiv, const CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
Definition: em_ascii.cpp:939
static void FakeEmblBioSources(const DataBlk &entry, CBioseq &bioseq)
Definition: em_ascii.cpp:2008
bool EmblAscii(ParserPtr pp)
Definition: em_ascii.cpp:2158
static void SetXrefObjId(CEMBL_xref &xref, const string &str)
Definition: em_ascii.cpp:368
static const char * ParFlat_GBDIV_array[]
Definition: em_ascii.cpp:122
static void GetReleaseInfo(const DataBlk &entry)
Definition: em_ascii.cpp:641
static CRef< COrg_ref > GetEmblOrgRef(const DataBlkPtr dbp)
Definition: em_ascii.cpp:680
static bool s_HasTPAPrefix(const CTempString &line)
Definition: em_ascii.cpp:1639
static void GetEmblBlockXref(const DataBlk &entry, XmlIndexPtr xip, const char *chentry, TStringList &dr_ena, TStringList &dr_biosample, bool *drop, CEMBL_block &embl)
Definition: em_ascii.cpp:403
CRef< CEMBL_block > XMLGetEMBLBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, string &gbdiv, CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
Definition: em_ascii.cpp:2427
static void GetEmblDate(Parser::ESource source, const DataBlk &entry, CRef< CDate_std > &crdate, CRef< CDate_std > &update)
Definition: em_ascii.cpp:272
@ ParFlat_NI
Definition: embl.h:44
@ ParFlat_OC
Definition: embl.h:61
@ ParFlat_PR
Definition: embl.h:57
@ ParFlat_KW
Definition: embl.h:47
@ ParFlat_AH
Definition: embl.h:56
@ ParFlat_DT
Definition: embl.h:45
@ ParFlat_SQ
Definition: embl.h:53
@ ParFlat_DR
Definition: embl.h:50
@ ParFlat_OS
Definition: embl.h:48
@ ParFlat_CC
Definition: embl.h:51
@ ParFlat_CO
Definition: embl.h:55
@ ParFlat_OG
Definition: embl.h:62
@ ParFlat_DE
Definition: embl.h:46
@ ParFlat_ID
Definition: embl.h:42
@ ParFlatEM_END
Definition: embl.h:58
#define ParFlat_COL_DATA_EMBL
Definition: embl.h:38
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
void FinalCleanup(TEntryList &seq_entries)
Definition: fcleanup.cpp:377
#define ERR_DRXREF_DuplicatedSRA
Definition: flat2err.h:600
#define ERR_SEQUENCE_BadData
Definition: flat2err.h:150
#define ERR_TPA_TpaSpansMissing
Definition: flat2err.h:593
#define ERR_ENTRY_LongSequence
Definition: flat2err.h:82
#define ERR_FORMAT_MissingContigFeature
Definition: flat2err.h:43
#define ERR_KEYWORD_ShouldNotBeTPA
Definition: flat2err.h:208
#define ERR_DIVISION_BadTSADivcode
Definition: flat2err.h:261
#define ERR_FORMAT_MissingSequenceData
Definition: flat2err.h:41
#define ERR_DIVISION_InvalidHTCKeyword
Definition: flat2err.h:254
#define ERR_DRXREF_InvalidSRA
Definition: flat2err.h:599
#define ERR_KEYWORD_IllegalForCON
Definition: flat2err.h:210
#define ERR_DIVISION_MissingHTGKeywords
Definition: flat2err.h:249
#define ERR_QSCORE_FailedToParse
Definition: flat2err.h:577
#define ERR_ENTRY_LongHTGSSequence
Definition: flat2err.h:86
#define ERR_KEYWORD_MissingTSA
Definition: flat2err.h:216
#define ERR_DIVISION_BadTPADivcode
Definition: flat2err.h:257
#define ERR_DRXREF_InvalidBioSample
Definition: flat2err.h:597
#define ERR_LOCUS_WrongTopology
Definition: flat2err.h:180
#define ERR_REFERENCE_No_references
Definition: flat2err.h:289
#define ERR_KEYWORD_ShouldNotBeTLS
Definition: flat2err.h:218
#define ERR_ENTRY_GBBlock_not_Empty
Definition: flat2err.h:85
#define ERR_KEYWORD_HTGPlusENV
Definition: flat2err.h:217
#define ERR_DEFINITION_MissingTPA
Definition: flat2err.h:269
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_DEFINITION_MissingTLS
Definition: flat2err.h:273
#define ERR_KEYWORD_ESTSubstring
Definition: flat2err.h:204
#define ERR_KEYWORD_ConflictingKeywords
Definition: flat2err.h:207
#define ERR_DIVISION_ConDivLacksContig
Definition: flat2err.h:252
#define ERR_LOCATION_ContigHasNull
Definition: flat2err.h:397
#define ERR_KEYWORD_ENV_NoMatchingQualifier
Definition: flat2err.h:214
#define ERR_KEYWORD_ShouldNotBeTSA
Definition: flat2err.h:215
#define ERR_KEYWORD_STSSubstring
Definition: flat2err.h:205
#define ERR_DIVISION_UnknownDivCode
Definition: flat2err.h:222
#define ERR_KEYWORD_MissingTLS
Definition: flat2err.h:219
#define ERR_DEFINITION_ShouldNotBeTSA
Definition: flat2err.h:270
#define ERR_SEGMENT_Rejected
Definition: flat2err.h:166
#define ERR_DIVISION_MissingHTCKeyword
Definition: flat2err.h:253
#define ERR_DIVISION_MappedtoCON
Definition: flat2err.h:248
#define ERR_FORMAT_ContigWithSequenceData
Definition: flat2err.h:42
#define ERR_DRXREF_UnknownDBname
Definition: flat2err.h:596
#define ERR_DRXREF_DuplicatedBioSamples
Definition: flat2err.h:598
#define ERR_KEYWORD_NoGeneExpressionKeywords
Definition: flat2err.h:213
#define ERR_DEFINITION_MissingTSA
Definition: flat2err.h:271
#define ERR_KEYWORD_GSSSubstring
Definition: flat2err.h:206
#define ERR_DEFINITION_ShouldNotBeTPA
Definition: flat2err.h:268
#define ERR_FORMAT_MissingEnd
Definition: flat2err.h:39
#define ERR_KEYWORD_MissingTPA
Definition: flat2err.h:209
#define ERR_DIVISION_ConDivInSegset
Definition: flat2err.h:251
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_ORGANISM_NoOrganism
Definition: flat2err.h:184
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_DIVISION_HTCWrongMolType
Definition: flat2err.h:255
#define ERR_KEYWORD_ShouldNotBeCAGE
Definition: flat2err.h:211
#define ERR_DEFINITION_ShouldNotBeTLS
Definition: flat2err.h:272
#define ERR_TSA_UnexpectedPrimaryAccession
Definition: flat2err.h:609
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
#define INSDSEQ_MOLTYPE
Definition: fta_xml.h:45
char * XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:213
void XMLGetKeywords(const char *entry, const XmlIndex *xip, TKeywordList &keywords)
Definition: xm_index.cpp:1522
#define INSDSEQ_KEYWORDS
Definition: fta_xml.h:58
#define INSDSEQ_DATABASE_REFERENCE
Definition: fta_xml.h:67
#define INSDSEQ_CREATE_DATE
Definition: fta_xml.h:49
#define INSDSEQ_DIVISION
Definition: fta_xml.h:47
#define INSDSEQ_UPDATE_DATE
Definition: fta_xml.h:48
char * XMLConcatSubTags(const char *entry, const XmlIndex *xip, Int4 tag, Char sep)
Definition: xm_index.cpp:1548
std::list< std::string > TKeywordList
Definition: ftablock.h:166
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:61
char * StringSave(const char *s)
Definition: ftacpp.hpp:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:116
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:106
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:96
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:74
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:75
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * MemNew(size_t sz)
Definition: ftacpp.hpp:43
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:344
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:762
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
Definition: genref.cpp:2981
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
void ResetDataAndHistory(void)
Clear all information in the scope except added data loaders.
Definition: scope.cpp:331
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
@ fAllowTrailingSymbols
Ignore trailing non-numerics characters.
Definition: ncbistr.hpp:298
list< CRef< CObject_id > > TId
Definition: EMBL_xref_.hpp:91
const TXref & GetXref(void) const
Get the Xref member data.
TXref & SetXref(void)
Assign a value to Xref data member.
TId & SetId(void)
Assign a value to Id data member.
Definition: EMBL_xref_.hpp:245
bool IsSetXref(void) const
Check if a value has been assigned to Xref data member.
list< CRef< CEMBL_xref > > TXref
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eOrigin_synthetic
purely synthetic
Definition: BioSource_.hpp:134
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
void SetDay(TDay value)
Assign a value to Day data member.
Definition: Date_std_.hpp:529
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void SetLabel(TLabel &value)
Assign a value to Label data member.
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
TYear GetYear(void) const
Get the Year member data.
Definition: Date_std_.hpp:426
TMonth GetMonth(void) const
Get the Month member data.
Definition: Date_std_.hpp:473
TDay GetDay(void) const
Get the Day member data.
Definition: Date_std_.hpp:520
bool IsSetDb(void) const
IDs in taxonomic or culture databases. Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
void SetCommon(const TCommon &value)
Assign a value to Common data member.
Definition: Org_ref_.hpp:428
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsSetDiv(void) const
GenBank division code. Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
vector< CRef< CDbtag > > TDbxref
Definition: Seq_feat_.hpp:123
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetKey(const TKey &value)
Assign a value to Key data member.
Definition: Imp_feat_.hpp:268
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetTech(void)
Reset Tech data member.
Definition: MolInfo_.hpp:484
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seqdesc_.cpp:456
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
EStrand
strandedness in living organism
Definition: Seq_inst_.hpp:133
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TCreate_date & SetCreate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:478
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_REF_END
Definition: index.h:60
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:611
int CheckSTRAND(const string &str)
Definition: indx_blk.cpp:467
int fta_if_wgs_acc(const CTempString &accession)
Definition: indx_blk.cpp:1193
int i
int len
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: loadfeat.cpp:5131
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: loadfeat.cpp:4825
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
std::list< SeqLoc > TSeqLocList
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2691
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data)
Definition: ref.cpp:2445
int offset
Definition: replacements.h:160
static const char * str(char *buf, int n)
Definition: stats.c:84
DataBlkPtr chain
Definition: ftablock.h:344
CRef< objects::CSeq_entry > seq_entry
Definition: ftablock.h:346
Char acnum[200]
Definition: ftablock.h:169
CRef< objects::CPatent_seq_id > psip
Definition: ftablock.h:193
Char division[4]
Definition: ftablock.h:174
bool assembly
Definition: ftablock.h:244
bool is_mga
Definition: ftablock.h:202
bool tsa_allowed
Definition: ftablock.h:214
Int2 htg
Definition: ftablock.h:199
bool is_tls
Definition: ftablock.h:211
Int2 vernum
Definition: ftablock.h:170
bool is_tpa
Definition: ftablock.h:209
TKeywordList keywords
Definition: ftablock.h:243
bool embl_new_ID
Definition: ftablock.h:221
bool is_wgs
Definition: ftablock.h:208
bool origin
Definition: ftablock.h:204
bool is_contig
Definition: ftablock.h:200
bool STS
Definition: ftablock.h:196
bool is_pat
Definition: ftablock.h:205
bool HTC
Definition: ftablock.h:198
bool drop
Definition: ftablock.h:185
bool experimental
Definition: ftablock.h:250
size_t bases
Definition: ftablock.h:175
bool inferential
Definition: ftablock.h:248
bool is_tsa
Definition: ftablock.h:210
bool EST
Definition: ftablock.h:195
size_t len
Definition: ftablock.h:187
GapFeatsPtr gaps
Definition: ftablock.h:217
string wgssec
Definition: ftablock.h:239
size_t offset
Definition: ftablock.h:171
bool specialist_db
Definition: ftablock.h:246
Uint2 segnum
Definition: ftablock.h:176
Char locusname[200]
Definition: ftablock.h:173
bool env_sample_qual
Definition: ftablock.h:222
XmlIndexPtr xip
Definition: ftablock.h:220
size_t qslength
Definition: ftablock.h:233
bool GSS
Definition: ftablock.h:197
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
CKeywordParser & KeywordParser()
char *(* ff_get_qscore)(const char *accession, Int2 v)
TEntryList entries
Definition: inftrees.h:24
else result
Definition: token2.c:20
CScope & GetScope()
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:454
bool GetGenomeInfo(CBioSource &bsp, const Char *bptr)
Definition: utilfeat.cpp:244
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1719
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1748
char * GetBlkDataReplaceNewLine(char *bptr, char *eptr, Int2 start_col_data)
Definition: utilfun.cpp:740
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:903
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1281
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1733
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1422
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1408
Int2 fta_StringMatch(const Char **array, const Char *text)
Definition: utilfun.cpp:557
void CleanTailNoneAlphaCharInString(string &str)
Definition: utilfun.cpp:823
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1377
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1437
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:1108
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1340
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1641
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:1139
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1708
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1247
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1452
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:923
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1165
char * PointToNextToken(char *ptr)
Definition: utilfun.cpp:847
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1466
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)
Definition: xutils.cpp:91
Modified on Sat Dec 02 09:22:21 2023 by modify_doxy.py rev. 669887