NCBI C++ ToolKit
em_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: em_ascii.cpp 102411 2024-05-02 10:00:24Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: em_ascii.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Preprocessing embl from blocks in memory to asn.
32  * Build EMBL format entry block.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
42 #include <objects/general/Date.hpp>
43 #include <objects/seq/Seq_inst.hpp>
45 #include <objects/seq/Seq_ext.hpp>
50 #include <objmgr/scope.hpp>
51 #include <objects/seq/MolInfo.hpp>
61 #include <objects/seq/Pubdesc.hpp>
63 
64 #include "index.h"
65 #include "embl.h"
66 
68 #include "ftanet.h"
70 
71 #include "ftaerr.hpp"
72 #include "indx_blk.h"
73 #include "asci_blk.h"
74 #include "utilfun.h"
75 #include "utilref.h"
76 #include "em_ascii.h"
77 #include "add.h"
78 #include "utilfeat.h"
79 #include "loadfeat.h"
80 #include "nucprot.h"
81 #include "fta_qscore.h"
82 #include "citation.h"
83 #include "fcleanup.h"
84 #include "entry.h"
85 #include "ref.h"
86 #include "xgbparint.h"
87 #include "xutils.h"
88 #include "fta_xml.h"
89 #include "keyword_parse.hpp"
90 
91 #ifdef THIS_FILE
92 # undef THIS_FILE
93 #endif
94 #define THIS_FILE "em_ascii.cpp"
95 
96 
99 
100 // clang-format off
101 
102 /* For the new style of ID line in EMBL data, check the "data class"
103  * field first to figure out the division code.
104  * The list is terminated by a nullptr sentinel.
104  */
105 static const char* ParFlat_Embl_dataclass_array[] = {
106  "ANN", "CON", "PAT", "EST", "GSS", "HTC", "HTG", "STS", "TSA",
107  nullptr
108 };
109 
110 /* EMBL division codes, ordered as in EMBL-block in asn.all.
111  * The position of a code in this list is significant: the index of the
111  * matched code is cast to CEMBL_block::TDiv and is also used to index
111  * ParFlat_GBDIV_array, so entries must not be reordered.
111  * The list is terminated by a nullptr sentinel.
111  */
112 static const char* ParFlat_Embl_DIV_array[] = {
113  "FUN", "INV", "MAM", "ORG", "PHG", "PLN", "PRI", "PRO", "ROD",
114  "SYN", "UNA", "VRL", "VRT", "PAT", "EST", "STS", "UNC", "GSS",
115  "HUM", "HTG", "HTC", "CON", "ENV", "MUS", "TGN", "TSA",
116  nullptr
117 };
118 
119 /* GenBank "DIV" string corresponding to each EMBL division code:
119  * element N here is the GenBank division for element N of
119  * ParFlat_Embl_DIV_array. Must have the same number
120  * of elements !
121  */
122 static const char* ParFlat_GBDIV_array[] = {
123  "PLN", "INV", "MAM", "UNA", "PHG", "PLN", "PRI", "BCT", "ROD",
124  "SYN", "UNA", "VRL", "VRT", "PAT", "EST", "STS", "UNA", "GSS",
125  "PRI", "HTG", "HTC", "CON", "ENV", "ROD", "SYN", "TSA",
126  nullptr
127 };
128 
129 // clang-format on
130 
/* Database names, terminated by a nullptr sentinel.
 * NOTE(review): the statement that consults this table is not visible in
 * this listing; presumably the index of a matched name is the "code" that
 * GetEmblBlockXref() casts to CEMBL_dbname::ECode, in which case the order
 * of the entries is significant - confirm before reordering.
 */
131 static const char* ParFlat_DBname_array[] = {
132  "EMBL",
133  "GENBANK",
134  "DDBJ",
135  "GENINFO",
136  "MEDLINE",
137  "SWISS-PROT",
138  "PIR",
139  "PDB",
140  "EPD",
141  "ECD",
142  "TFD",
143  "FLYBASE",
144  "PROSITE",
145  "ENZYME",
146  "MIM",
147  "ECOSEQ",
148  "HIV",
149  nullptr
150 };
151 
/* Database names accepted in DR lines when the name has no numeric code.
 * GetEmblBlockXref() compares the DR line's database name against every
 * entry case-insensitively and posts ERR_DRXREF_UnknownDBname (warning)
 * when none matches. The list is terminated by a nullptr sentinel.
 */
152 static const char* ParFlat_DRname_array[] = {
153  "ARAPORT",
154  "ARRAYEXPRESS",
155  "ASTD",
156  "BEEBASE",
157  "BGD",
158  "BIOGRID",
159  "BIOMUTA",
160  "BIOSAMPLE",
161  "CABRI",
162  "CCDS",
163  "CHEMBL",
164  "CHITARS",
165  "COLLECTF",
166  "DEPOD",
167  "DMDM",
168  "DNASU",
169  "ENA",
170  "ENA-CON",
171  "ENSEMBL",
172  "ENSEMBL-GN",
173  "ENSEMBL-SCAFFOLDS",
174  "ENSEMBL-TR",
175  "ENSEMBLGENOMES",
176  "ENSEMBLGENOMES-GN",
177  "ENSEMBLGENOMES-TR",
178  "ESTHER",
179  "EUROPEPMC",
180  "EVOLUTIONARYTRACE",
181  "EXPRESSIONATLAS",
182  "GENE3D",
183  "GENEDB",
184  "GENEREVIEWS",
185  "GENEVISIBLE",
186  "GENEWIKI",
187  "GENOMERNAI",
188  "GDB",
189  "GOA",
190  "GR",
191  "GRAINGENES",
192  "GUIDETOPHARMACOLOGY",
193  "H-INVDB",
194  "HGNC",
195  "HOMD",
196  "HSSP",
197  "IMAGENES",
198  "IMGT/GENE-DB",
199  "IMGT/HLA",
200  "IMGT/LIGM",
201  "IMGT_GENE-DB",
202  "INTERPRO",
203  "IPD-KIR",
204  "IPTMNET",
205  "KEGG",
206  "KO",
207  "MALACARDS",
208  "MAXQB",
209  "MGI",
210  "MIRBASE",
211  "MOONPROT",
212  "MYCOBANK",
213  "MYCOCLAP",
214  "PATRIC",
215  "PAXDB",
216  "POMBASE",
217  "PR2",
218  "PRO",
219  "PROTEOMES",
220  "RFAM",
221  "RZPD",
222  "SABIO-RK",
223  "SFLD",
224  "SGN",
225  "SIGNALINK",
226  "SIGNALLINK",
227  "SIGNOR",
228  "SILVA-LSU",
229  "SILVA-SSU",
230  "STRAININFO",
231  "SWISSLIPIDS",
232  "SWISSPALM",
233  "TMRNA-WEBSITE",
234  "TOPDOWNPROTEOMICS",
235  "TRANSFAC",
236  "TREEFAM",
237  "UNICARBKB",
238  "UNILIB",
239  "UNIPATHWAY",
240  "UNIPROT/SWISS-PROT",
241  "UNIPROT/TREMBL",
242  "UNIPROTKB/SWISS-PROT",
243  "UNIPROTKB/TREMBL",
244  "UNITE",
245  "VBASE2",
246  "VEGA-TR",
247  "VEGA-GN",
248  "VGNC",
249  "WBPARASITE",
250  "WORMBASE",
251  "ZFIN",
252  nullptr
253 };
254 
255 
256 /**********************************************************
257  *
258  * static void GetEmblDate(source, entry, crdate, update):
259  *
260  * The DT block contains two lines: the first gives the
261  * created date, the second the updated date.
262  * In a direct submission there may be only one
263  * DT line; in that case created date = update date.
264  *
265  * 9-24-93
266  *
267  * Skip XX line between DT line.
268  *
269  * 12-22-93
270  *
271  **********************************************************/
// Both output references are reset on entry. If the entry has no DT block
// the function returns with both of them still empty.
// NOTE(review): this listing has no lines numbered 285 and 293, so the
// statements that actually fill "crdate" and "update" are not visible here;
// only the trailing argument ("source);") of the second one remains.
272 static void GetEmblDate(Parser::ESource source, const DataBlk& entry, CRef<CDate_std>& crdate, CRef<CDate_std>& update)
273 {
274  char* offset;
275  char* eptr;
276  size_t len;
277 
278  crdate.Reset();
279  update.Reset();
280  offset = xSrchNodeType(entry, ParFlat_DT, &len);
281  if (! offset)
282  return;
283 
284  eptr = offset + len;
  // Walk the block line by line until a line that starts with "DT" is found
  // after a newline, i.e. a second DT line; lines in between are skipped.
286  while (offset < eptr) {
287  offset = SrchTheChar(offset, eptr, '\n');
288  if (! offset)
289  break;
290 
291  offset++; /* newline */
292  if (StringEquN(offset, "DT", 2)) {
294  source);
295  break;
296  }
297  }
  // No update date was obtained: copy day, month and year of the created date.
  // NOTE(review): crdate is dereferenced here without an Empty() check; whether
  // it is guaranteed to be set depends on the line missing from this listing.
298  if (update.Empty()) {
299  update.Reset(new CDate_std);
300  update->SetDay(crdate->GetDay());
301  update->SetMonth(crdate->GetMonth());
302  update->SetYear(crdate->GetYear());
303  }
304 }
305 
306 /**********************************************************/
// Runs the post-processing steps on the parsed entries in "seq_entries" and
// then hands them over to the parser.
//   seq_long    - when true, a SEV_REJECT "LongSequence" error is posted and
//                 the entries are NOT appended to pp->entries.
//   pp          - parser context; its flags (convert, cleanup, qamode,
//                 xml_comp, citat) select which steps are performed.
//   seq_entries - entries to process; always empty when the function
//                 returns true.
// Returns false when "seq_entries" is empty after DealWithGenes(), true
// otherwise (including the "seq_long" case).
// NOTE(review): this listing has no lines numbered 312 and 363, so one
// statement before "return false" and one after "seq_entries.clear()" are
// not visible here.
307 static bool OutputEmblAsn(bool seq_long, ParserPtr pp, TEntryList& seq_entries)
308 {
309  DealWithGenes(seq_entries, pp);
310 
311  if (seq_entries.empty()) {
313  return false;
314  }
315 
316  fta_find_pub_explore(pp, seq_entries);
317 
318  /* change qual "citation" on features to SeqFeat.cit find citation
319  * in the list by serial_number. If serial number not found remove
320  * /citation
321  */
322  ProcessCitations(seq_entries);
323 
324  if (pp->convert) {
  // FinalCleanup() is only run for cleanup levels 0 and 1.
325  if (pp->cleanup <= 1) {
326  FinalCleanup(seq_entries);
327 
  // Only the first entry of the list is passed to the helper.
328  if (pp->qamode && ! seq_entries.empty())
329  fta_remove_cleanup_user_object(*(*seq_entries.begin()));
330  }
331 
332  MaybeCutGbblockSource(seq_entries);
333  }
334 
335  EntryCheckDivCode(seq_entries, pp);
336 
337  if (pp->xml_comp)
338  fta_set_strandedness(seq_entries);
339 
340  if (fta_EntryCheckGBBlock(seq_entries)) {
341  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty, "Attention: GBBlock is not empty");
342  }
343 
344  if (pp->qamode) {
345  fta_sort_descr(seq_entries);
346  fta_sort_seqfeat_cit(seq_entries);
347  }
348 
349  if (pp->citat) {
350  StripSerialNumbers(seq_entries);
351  }
352 
353  PackEntries(seq_entries);
354  CheckDupDates(seq_entries);
355 
356  if (seq_long) {
357  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", pp->entrylist[pp->curindx]->locusname, pp->entrylist[pp->curindx]->acnum, pp->limit);
358  } else {
  // splice() moves the elements, leaving seq_entries empty.
359  pp->entries.splice(pp->entries.end(), seq_entries);
360  }
361 
  // Discards the entries that were rejected as too long.
362  seq_entries.clear();
364 
365  return true;
366 }
367 
368 static void SetXrefObjId(CEMBL_xref& xref, const string& str)
369 {
370  if (str.empty())
371  return;
372 
373  CEMBL_xref::TId& ids = xref.SetId();
374 
375  bool found = false;
376  for (const auto& id : ids) {
377  if (id->IsStr() && id->GetStr() == str) {
378  found = true;
379  break;
380  }
381  }
382 
383  if (found)
384  return;
385 
386  CRef<CObject_id> obj_id(new CObject_id);
387  obj_id->SetStr(str);
388 
389  ids.push_back(obj_id);
390 }
391 
392 /**********************************************************
393  *
394  * static void GetEmblBlockXref(entry, xip,
395  * chentry, dr_ena,
396  * dr_biosample,
397  * drop, embl):
398  *
399  * Builds a list of CEMBL_xref, one per DR line, and stores
400  * it in "embl" (nothing is returned).
401  *
402  **********************************************************/
// DR lines are handled in three ways:
//   - name "BioSample": the id is validated and, if new, appended to
//     "dr_biosample"; no CEMBL_xref is created;
//   - name "ENA" with an id accepted by fta_if_valid_sra(): the id is
//     appended to "dr_ena" if new; no CEMBL_xref is created;
//   - anything else: a CEMBL_xref with up to two ids is created.
// "*drop" is set to true when a BioSample or SRA line is rejected; it is
// never set to false here. "embl" is modified only if at least one
// CEMBL_xref was created.
// NOTE(review): this listing has no lines numbered 430, 443, 484 and 491.
// The statements that obtain "bptr" in the XML case, compute "code" and
// (possibly) post-process "id"/"id1" are therefore not visible.
403 static void GetEmblBlockXref(const DataBlk& entry, XmlIndexPtr xip, const char* chentry, TStringList& dr_ena, TStringList& dr_biosample, bool* drop, CEMBL_block& embl)
404 {
405  const char** b;
406 
407  const char* drline;
408 
409  char* bptr;
410  char* eptr;
411  char* ptr;
412  char* xref;
413  char* p;
414  char* q;
415 
416  bool valid_biosample;
417  bool many_biosample;
418  size_t len;
419 
420  Int2 col_data;
421  Int2 code;
422 
423  CEMBL_block::TXref new_xrefs;
424 
  // Flat-file input: the data starts at column ParFlat_COL_DATA_EMBL of each
  // line. Otherwise (xip given) the data starts at column 0 and the buffer is
  // remembered in "xref" so that it can be released with MemFree() at the end.
425  if (! xip) {
426  bptr = xSrchNodeType(entry, ParFlat_DR, &len);
427  col_data = ParFlat_COL_DATA_EMBL;
428  xref = nullptr;
429  } else {
431  if (bptr)
432  len = StringLen(bptr);
433  col_data = 0;
434  xref = bptr;
435  }
436 
437  if (! bptr)
438  return;
439 
  // One iteration per DR line; "ptr" is left at the start of the next one.
440  for (eptr = bptr + len; bptr < eptr; bptr = ptr) {
441  drline = bptr;
442  bptr += col_data; /* bptr points to database_identifier */
444 
445  string name;
  // Negative code: the database is identified by name only.
446  if (code < 0) {
  // NOTE(review): "ptr" is used without a null check; other call sites of
  // SrchTheChar() in this file do test its result for null - confirm that
  // a DR line without ';' cannot reach this point.
447  ptr = SrchTheChar(bptr, eptr, ';');
448  name.assign(bptr, ptr);
449 
  // "MD5" lines are skipped entirely: advance to the next line that starts
  // with "DR" (or to the end of the block).
450  if (NStr::EqualNocase(name, "MD5")) {
451  while (ptr < eptr) {
452  if (NStr::Equal(ptr, 0, 2, "DR"))
453  break;
454 
  // NOTE(review): "*ptr" is read right after SrchTheChar() with no null check.
455  ptr = SrchTheChar(ptr, eptr, '\n');
456  if (*ptr == '\n')
457  ptr++;
458  }
459  continue;
460  }
461 
462  for (b = ParFlat_DRname_array; *b; b++) {
463  if (NStr::EqualNocase(name, *b))
464  break;
465  }
466 
  // Unknown names only produce a warning; the name is still used as is.
  // The two legacy UniProt spellings are replaced by their current form.
467  if (! *b)
468  ErrPostEx(SEV_WARNING, ERR_DRXREF_UnknownDBname, "Encountered a new/unknown database name in DR line: \"%s\".", name.c_str());
469  else if (NStr::EqualNocase(*b, "UNIPROT/SWISS-PROT")) {
470  name = "UniProtKB/Swiss-Prot";
471  } else if (NStr::EqualNocase(*b, "UNIPROT/TREMBL")) {
472  name = "UniProtKB/TrEMBL";
473  }
474  }
475 
476  bptr = PointToNextToken(bptr); /* bptr points to primary_identifier */
  // p - end of the current line, ptr - first ';' at or after the primary id.
477  p = SrchTheChar(bptr, eptr, '\n');
478  ptr = SrchTheChar(bptr, eptr, ';');
479 
480  string id, id1;
481 
  // A ';' on this line separates the primary id from the secondary one.
482  if (ptr && ptr < p) {
483  id.assign(bptr, ptr);
485 
486  bptr = PointToNextToken(ptr); /* points to
487  secondary_identifier */
488  }
489  if (p) {
490  id1.assign(bptr, p);
492  }
493 
  // Only one id present: treat it as the primary one.
494  if (id.empty()) {
495  id = id1;
496  id1.clear();
497  }
498 
  // NOTE(review): these name comparisons are case-sensitive, while the lookup
  // in ParFlat_DRname_array above is not and does not normalize the spelling
  // of "BIOSAMPLE" - confirm this is intended.
499  if (name == "BioSample" && ! id.empty()) {
500  many_biosample = (! id.empty() && ! id1.empty());
501  valid_biosample = fta_if_valid_biosample(id.c_str(), false);
502  if (! id1.empty() && fta_if_valid_biosample(id1.c_str(), false) == false)
503  valid_biosample = false;
504  if (many_biosample || ! valid_biosample) {
  // The DR line is NUL-terminated in place for the duration of the error
  // message and the newline is restored afterwards.
505  q = nullptr;
506  if (! drline)
507  drline = "[Empty]";
508  else {
509  q = StringChr(const_cast<char*>(drline), '\n');
510  if (q)
511  *q = '\0';
512  }
513  if (many_biosample)
514  ErrPostEx(SEV_REJECT, ERR_DRXREF_InvalidBioSample, "Multiple BioSample ids provided in the same DR line: \"%s\".", drline);
515  if (! valid_biosample)
516  ErrPostEx(SEV_REJECT, ERR_DRXREF_InvalidBioSample, "Invalid BioSample id(s) provided in DR line: \"%s\".", drline);
517  *drop = true;
518  if (q)
519  *q = '\n';
520  } else {
  // Valid single id: record it unless it has been seen already.
521  bool found = false;
522  for (const string& val : dr_biosample) {
523  if (val == id) {
524  found = true;
525  break;
526  }
527  }
528 
529  if (found) {
530  ErrPostEx(SEV_WARNING, ERR_DRXREF_DuplicatedBioSamples, "Duplicated BioSample ids found within DR lines contents: \"%s\".", id.c_str());
531  } else {
532  dr_biosample.push_back(id);
533  }
534  }
  // "ENA" lines whose id is not accepted by fta_if_valid_sra() fall through
  // to the generic branch below and become ordinary xrefs.
535  } else if (name == "ENA" && ! id.empty() && fta_if_valid_sra(id.c_str(), false)) {
536  if (! id.empty() && ! id1.empty()) {
537  q = nullptr;
538  if (! drline)
539  drline = "[Empty]";
540  else {
541  q = StringChr(const_cast<char*>(drline), '\n');
542  if (q)
543  *q = '\0';
544  }
545  ErrPostEx(SEV_REJECT, ERR_DRXREF_InvalidSRA, "Multiple possible SRA ids provided in the same DR line: \"%s\".", drline);
546  *drop = true;
547  if (q)
548  *q = '\n';
549  } else {
550  bool found = false;
551  for (const string& val : dr_ena) {
552  if (val == id) {
553  found = true;
554  break;
555  }
556  }
557 
558  if (found) {
559  ErrPostEx(SEV_WARNING, ERR_DRXREF_DuplicatedSRA, "Duplicated Sequence Read Archive ids found within DR lines contents: \"%s\".", id.c_str());
560  } else {
561  dr_ena.push_back(id);
562  }
563  }
564  } else {
565  CRef<CEMBL_xref> new_xref(new CEMBL_xref);
566 
  // NOTE(review): the name branch above is taken for any negative code, but
  // only -1 selects SetName() here - confirm that -1 is the only negative
  // value "code" can take.
567  if (code != -1)
568  new_xref->SetDbname().SetCode(static_cast<CEMBL_dbname::ECode>(code));
569  else
570  new_xref->SetDbname().SetName(name);
571 
572  if (! id.empty())
573  SetXrefObjId(*new_xref, id);
574 
575  if (! id1.empty())
576  SetXrefObjId(*new_xref, id1);
577 
578  new_xrefs.push_back(new_xref);
579  }
580 
  // NOTE(review): "p" may be null here (it is null-checked above before
  // id1.assign), in which case p + 1 is not a valid position.
581  ptr = p + 1;
582 
583  if (xip)
584  continue;
585 
586  /* skip "XX" line
587  */
588  while (ptr < eptr) {
589  if (StringEquN(ptr, "DR", 2))
590  break;
591 
  // NOTE(review): "*ptr" is read right after SrchTheChar() with no null check.
592  ptr = SrchTheChar(ptr, eptr, '\n');
593  if (*ptr == '\n')
594  ptr++;
595  }
596  }
597 
598  if (xref)
599  MemFree(xref);
600 
601  if (! new_xrefs.empty())
602  embl.SetXref().swap(new_xrefs);
603 }
604 
606 {
  // Returns a reference to the CTextseq_id held by "id" for the Seq-id
  // choices listed below, using the matching Set...() accessor.
  // For every other choice a reference to the function-local static
  // "noTextId" is returned; that object is shared between all such calls,
  // so anything a caller writes through the reference stays in it.
  // NOTE(review): the signature (listing line 605) and the case labels at
  // listing lines 616 and 632 are absent from this listing; the name
  // SetTextIdRef is taken from the call in GetReleaseInfo().
607  static CTextseq_id noTextId;
608 
609  switch (id.Which()) {
610  case CSeq_id::e_Genbank:
611  return id.SetGenbank();
612  case CSeq_id::e_Embl:
613  return id.SetEmbl();
614  case CSeq_id::e_Pir:
615  return id.SetPir();
617  return id.SetSwissprot();
618  case CSeq_id::e_Other:
619  return id.SetOther();
620  case CSeq_id::e_Ddbj:
621  return id.SetDdbj();
622  case CSeq_id::e_Prf:
623  return id.SetPrf();
624  case CSeq_id::e_Tpg:
625  return id.SetTpg();
626  case CSeq_id::e_Tpe:
627  return id.SetTpe();
628  case CSeq_id::e_Tpd:
629  return id.SetTpd();
630  case CSeq_id::e_Gpipe:
631  return id.SetGpipe();
633  return id.SetNamed_annot_track();
634  default:; // do nothing
635  }
636 
637  return noTextId;
638 }
639 
640 /**********************************************************/
641 static void GetReleaseInfo(const DataBlk& entry)
642 {
643  EntryBlkPtr ebp;
644 
645  char* offset;
646  char* bptr;
647  char* eptr;
648 
649  size_t len;
650 
651  ebp = static_cast<EntryBlk*>(entry.mpData);
652  CBioseq& bioseq = ebp->seq_entry->SetSeq();
653  CTextseq_id& id = SetTextIdRef(*(bioseq.SetId().front()));
654 
655  offset = xSrchNodeType(entry, ParFlat_DT, &len);
656  if (! offset)
657  return;
658 
659  eptr = offset + len;
660  offset = SrchTheChar(offset, eptr, '\n');
661  if (! offset)
662  return;
663 
664  bptr = SrchTheStr(offset, eptr, "Version");
665  if (! bptr)
666  return;
667 
668  bptr = PointToNextToken(bptr); /* bptr points to next token */
669 
670  id.SetVersion(NStr::StringToInt(bptr, NStr::fAllowTrailingSymbols));
671 }
672 
673 /**********************************************************
674  *
675  * static OrgRefPtr GetEmblOrgRef(dbp):
676  *
677  * >= 1 OS per entry.
678  *
679  * Builds an Org-ref from the text of the block "dbp":
679  * the taxname is the data part of all its lines joined
679  * with single spaces. An empty reference is returned
679  * if no taxname text is found.
679  *
679  **********************************************************/
681 {
  // NOTE(review): the signature (listing line 680) and listing line 689,
  // the first statement of the loop body, are absent from this listing.
682  const char* bptr = dbp->mOffset;
683  const char* eptr = bptr + dbp->len;
684 
685  string sTaxname;
686  vector<string> taxLines;
687  NStr::Split(CTempString(bptr, eptr - bptr), "\n", taxLines);
  // Each line is taken by value, so the loop works on a copy.
688  for (auto line : taxLines) {
  // Empty lines and "XX" spacer lines contribute nothing.
690  if (line.empty() || NStr::StartsWith(line, "XX")) {
691  continue;
692  }
693  if (! sTaxname.empty()) {
694  sTaxname += ' ';
695  }
  // NOTE(review): std::string::substr throws std::out_of_range when the line
  // is shorter than ParFlat_COL_DATA_EMBL; nothing visible here rules out
  // such a line - confirm.
696  sTaxname += line.substr(ParFlat_COL_DATA_EMBL);
697  }
698 
699  CRef<COrg_ref> org_ref;
700  if (sTaxname.empty()) {
701  return org_ref;
702  }
703 
704  org_ref.Reset(new COrg_ref);
705  org_ref->SetTaxname(sTaxname);
706 
  // If the taxname contains '(', the text before the first one, with
  // trailing spaces and tabs removed, is stored as the common name.
  // The taxname itself keeps the full text.
707  auto openP = sTaxname.find('(');
708  if (openP != string::npos) {
709  auto sCommonName = sTaxname.substr(0, openP);
710  auto commonTerm = sCommonName.find_last_not_of(" \t(");
711  if (commonTerm != string::npos) {
712  sCommonName = sCommonName.substr(0, commonTerm + 1);
713  org_ref->SetCommon(sCommonName);
714  }
715  }
716  return org_ref;
717 }
718 
719 /**********************************************************/
721 {
  // Checks that the division, the presence of a CONTIG (CO) line and the
  // presence of sequence data of the entry described by "ibp" are
  // consistent with each other. Posts a message for every inconsistency
  // and returns false for those that make the entry unusable; the entry
  // itself is not modified (the "ibp->drop" assignments are commented out).
  // NOTE(review): the signature (listing line 720) is absent from this
  // listing, so the function name and parameter list are not visible.
722  bool condiv = (NStr::CompareNocase(ibp->division, "CON") == 0);
723 
724  bool result = true;
725  if (condiv && ibp->segnum != 0) {
726  ErrPostEx(SEV_ERROR, ERR_DIVISION_ConDivInSegset, "Use of the CON division is not allowed for members of segmented set : %s|%s. Entry skipped.", ibp->locusname, ibp->acnum);
727  // ibp->drop = true;
728  result = false;
729  }
730 
  // This chain is independent of the segmented-set check above.
731  if (! condiv && ibp->is_contig == false && ibp->origin == false) {
732  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingSequenceData, "Required sequence data is absent. Entry dropped.");
733  // ibp->drop = true;
734  result = false;
735  } else if (! condiv && ibp->is_contig && ibp->origin == false) {
736  ErrPostEx(SEV_WARNING, ERR_DIVISION_MappedtoCON, "Division [%s] mapped to CON based on the existence of CONTIG line.", ibp->division);
737  } else if (ibp->is_contig && ibp->origin) {
  // NOTE(review): listing line 738 is absent; it must hold the condition
  // that selects between the INFO message and the REJECT below.
739  ErrPostEx(SEV_INFO, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data are both present. Ignoring sequence data.");
740  } else {
741  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigWithSequenceData, "The CONTIG/CO linetype and sequence data may not both be present in a sequence record.");
742  // ibp->drop = true;
743  result = false;
744  }
745  } else if (condiv && ! ibp->is_contig && ! ibp->origin) {
746  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingContigFeature, "No CONTIG data in GenBank format file, entry dropped.");
747  // ibp->drop = true;
748  result = false;
749  } else if (condiv && ! ibp->is_contig && ibp->origin) {
750  ErrPostEx(SEV_WARNING, ERR_DIVISION_ConDivLacksContig, "Division is CON, but CONTIG data have not been found.");
751  }
752  return result;
753 }
754 
755 /**********************************************************/
// Converts the CO (contig) block of "entry" into a delta extension of
// "bioseq".
// Returns true when the entry has no CO block (nothing is changed) or when
// the block was processed; returns false when the block holds no data past
// column ParFlat_COL_DATA_EMBL or when the location text contains an empty
// component (",,", "(," or ",)").
// NOTE(review): this listing has no lines numbered 777 and 818, so the
// statement that fills the buffer "p" and the statement controlled by
// "if (i > 999)" are not visible here.
756 bool GetEmblInstContig(const DataBlk& entry, CBioseq& bioseq, ParserPtr pp)
757 {
758  DataBlkPtr dbp;
759 
760  char* p;
761  char* q;
762  char* r;
763  bool locmap;
764 
765  bool allow_crossdb_featloc;
766  int numerr;
767 
768  dbp = TrackNodeType(entry, ParFlat_CO);
769  if (! dbp || ! dbp->mOffset)
770  return true;
771 
772  Int4 i = static_cast<Int4>(dbp->len) - ParFlat_COL_DATA_EMBL;
773  if (i <= 0)
774  return false;
775 
776  p = StringNew(i);
778  p[i - 1] = '\0';
  // Turn tabs and newlines into spaces; a "CO " prefix that follows a
  // newline is blanked as well.
779  for (q = p; *q != '\0'; q++) {
780  if (*q == '\t')
781  *q = ' ';
782  else if (*q == '\n') {
783  *q = ' ';
784  if (q[1] == 'C' && q[2] == 'O' && q[3] == ' ') {
785  q[1] = ' ';
786  q[2] = ' ';
787  }
788  }
789  }
  // Remove every space in place.
790  for (q = p, r = p; *q != '\0'; q++)
791  if (*q != ' ')
792  *r++ = *q;
793  *r = '\0';
794 
  // Look for an empty component; q stops on the first one found.
795  for (q = p; *q != '\0'; q++)
796  if ((q[0] == ',' && q[1] == ',') || (q[0] == '(' && q[1] == ',') ||
797  (q[0] == ',' && q[1] == ')'))
798  break;
799  if (*q != '\0') {
800  ErrPostEx(SEV_REJECT, ERR_LOCATION_ContigHasNull, "The join() statement for this record's contig line contains one or more comma-delimited components which are null.");
801  MemFree(p);
802  return false;
803  }
804 
805  pp->buf.reset();
806 
  // "locmap" and "numerr" receive output of the parser but are not
  // examined afterwards.
807  CRef<CSeq_loc> loc = xgbparseint_ver(p, locmap, numerr, bioseq.GetId(), pp->accver);
808 
809  if (loc.NotEmpty() && loc->IsMix()) {
  // Cross-database locations are allowed while the ids are fixed; the
  // caller's setting is restored afterwards.
810  allow_crossdb_featloc = pp->allow_crossdb_featloc;
811  pp->allow_crossdb_featloc = true;
812 
813  TSeqLocList locs;
814  locs.push_back(loc);
815 
816  i = fta_fix_seq_loc_id(locs, pp, p, nullptr, true);
817  if (i > 999)
819  pp->allow_crossdb_featloc = allow_crossdb_featloc;
820 
821  XGappedSeqLocsToDeltaSeqs(loc->GetMix(), bioseq.SetInst().SetExt().SetDelta().Set());
822  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
823  } else
  // Anything that is not a mix location leaves the Bioseq without an
  // extension; true is still returned.
824  bioseq.SetInst().ResetExt();
825 
826  MemFree(p);
827  return true;
828 }
829 
830 /**********************************************************
831  *
832  * bool GetEmblInst(pp, entry, dnaconv):
833  *
834  * Fills in Seq-inst for an entry. Assumes Bioseq
835  * already allocated.
836  *
836  * Returns false if GetSeqData() fails or, for a contig
836  * entry, if GetEmblInstContig() fails; true otherwise.
836  *
837  **********************************************************/
// NOTE(review): this listing has no lines numbered 855, 861 and 870, so one
// statement after "inst" is obtained, the initial assignment of "p" and the
// first statement of the "circular" branch are not visible here.
838 static bool GetEmblInst(ParserPtr pp, const DataBlk& entry, unsigned char* dnaconv)
839 {
840  EntryBlkPtr ebp;
841  IndexblkPtr ibp;
842 
843  char* p;
844  char* q;
845  char* r;
846 
847  Int4 i;
848  Int2 strand;
849 
850  ebp = static_cast<EntryBlk*>(entry.mpData);
851 
852  CBioseq& bioseq = ebp->seq_entry->SetSeq();
853 
854  CSeq_inst& inst = bioseq.SetInst();
856 
857  ibp = pp->entrylist[pp->curindx];
858 
859  /* p points to 2nd token
860  */
862  p = PointToNextToken(p); /* p points to 3rd token */
863 
  // The new style of ID line has one more token before this field.
864  if (ibp->embl_new_ID)
865  p = PointToNextToken(p);
866 
867  /* some entries have "circular" before molecule type in embl
868  */
869  if (StringEquNI(p, "circular", 8)) {
871  p = PointToNextToken(p);
872  } else if (ibp->embl_new_ID)
873  p = PointToNextToken(p);
874 
  // Terminate the field at the next ';' while it is examined; the ';' is
  // put back before the sequence data is read.
875  r = StringChr(p, ';');
876  if (r)
877  *r = '\0';
878 
  // i = number of times a run of spaces is followed by more text, i.e. the
  // number of tokens after the first one.
879  for (i = 0, q = p; *q != '\0'; q++) {
880  if (*q != ' ')
881  continue;
882 
883  while (*q == ' ')
884  q++;
885  if (*q != '\0')
886  i++;
887  q--;
888  }
889 
  // Old style ID line only: warn when the field is neither a DNA/RNA
  // molecule type nor, for EMBL data, the "xxx"/"XXX" placeholder.
890  if (ibp->embl_new_ID == false && inst.GetTopology() != CSeq_inst::eTopology_circular &&
891  ! StringStr(p, "DNA") && ! StringStr(p, "RNA") &&
892  (pp->source != Parser::ESource::EMBL || (! StringStr(p, "xxx") &&
893  ! StringStr(p, "XXX")))) {
894  ErrPostEx(SEV_WARNING, ERR_LOCUS_WrongTopology, "Other than circular topology found in EMBL, \"%s\", assign default topology", p);
895  }
896 
897  /* the "p" must be the mol-type
898  */
899  if (i == 0 && pp->source == Parser::ESource::NCBI) {
900  /* source = NCBI can be full variety of strands/mol-type
901  */
902  strand = CheckSTRAND(p);
903  if (strand > 0)
904  inst.SetStrand(static_cast<CSeq_inst::EStrand>(strand));
905  }
906 
907  if (r)
908  *r = ';';
909 
910  if (! GetSeqData(pp, entry, bioseq, ParFlat_SQ, dnaconv, eSeq_code_type_iupacna))
911  return false;
912 
913  if (ibp->is_contig && ! GetEmblInstContig(entry, bioseq, pp))
914  return false;
915 
916  return true;
917 }
918 
919 /**********************************************************
920  *
921  * static CRef<CEMBL_block> GetDescrEmblBlock(pp, entry, mfp,
922  * gbdiv, biosp,
923  * dr_ena, dr_biosample):
924  *
925  * class is 2nd token of ID line.
926  * div :
927  * - 4th or 5th (if circular) token of ID line;
928  * - but actually Genbank DIV string has to get by
929  * mapping GBDIV_array;
930  * - EST DIV string by searching KW line to map
931  * ParFlat_EST_kw_array;
932  * - PAT DIV string by accession number starting
933  * with "A".
934  * DR line for xref.
935  *
936  **********************************************************/
938  ParserPtr pp, const DataBlk& entry, CMolInfo& mol_info, string& gbdiv, const CBioSource* bio_src, TStringList& dr_ena, TStringList& dr_biosample)
939 {
940  CRef<CEMBL_block> ret, embl(new CEMBL_block);
941 
942  IndexblkPtr ibp;
943  char* bptr;
944  Char dataclass[4];
945  Char ch;
946 
947  CEMBL_block::TDiv div;
948  TKeywordList keywords;
949 
950  bool if_cds;
951  bool pat_ref = false;
952  bool est_kwd = false;
953  bool sts_kwd = false;
954  bool gss_kwd = false;
955  bool htc_kwd = false;
956  bool fli_kwd = false;
957  bool wgs_kwd = false;
958  bool tpa_kwd = false;
959  bool tsa_kwd = false;
960  bool tls_kwd = false;
961  bool env_kwd = false;
962  bool mga_kwd = false;
963 
964  bool cancelled;
965  bool drop;
966  char* tempdiv;
967  Int4 i;
968 
969  ibp = pp->entrylist[pp->curindx];
970 
971  /* bptr points to 2nd token
972  */
974 
975  if (ibp->embl_new_ID == false) {
976  if (StringEquNI(bptr, "standard", 8)) {
977  // embl->SetClass(CEMBL_block::eClass_standard);
978  } else if (StringEquNI(bptr, "unannotated", 11)) {
979  embl->SetClass(CEMBL_block::eClass_unannotated);
980  } else if (StringEquNI(bptr, "unreviewed", 10) ||
981  StringEquNI(bptr, "preliminary", 11)) {
982  embl->SetClass(CEMBL_block::eClass_other);
983  } else {
984  embl->SetClass(CEMBL_block::eClass_not_set);
985  }
986 
987  bptr = StringChr(bptr, ';');
988  if (bptr)
989  bptr = StringChr(bptr + 1, ';');
990  } else {
991  bptr = StringChr(bptr, ';');
992  if (bptr)
993  bptr = StringChr(bptr + 1, ';');
994  if (bptr)
995  bptr = StringChr(bptr + 1, ';');
996  if (bptr) {
997  while (*bptr == ' ' || *bptr == ';')
998  bptr++;
1000  if (i < 0)
1001  bptr = StringChr(bptr, ';');
1002  else if (i == 0)
1003  bptr = (char*)"CON";
1004  }
1005  }
1006 
1007  if (bptr) {
1008  while (*bptr == ' ' || *bptr == ';')
1009  bptr++;
1010  StringNCpy(dataclass, bptr, 3);
1011  dataclass[3] = '\0';
1012  if (StringEqu(dataclass, "TSA"))
1013  ibp->is_tsa = true;
1014  } else {
1015  bptr = (char*)" ";
1016  dataclass[0] = '\0';
1017  }
1018 
1019  if_cds = check_cds(entry, pp->format);
1020 
1021  if (ibp->psip.NotEmpty())
1022  pat_ref = true;
1023 
1024  pp->KeywordParser().Cleanup();
1025  keywords = pp->KeywordParser().KeywordList();
1026 
1027  embl->SetKeywords() = keywords;
1028  if (ibp->is_tpa && ! fta_tpa_keywords_check(keywords)) {
1029  return ret;
1030  }
1031 
1032  if (ibp->is_tsa && ! fta_tsa_keywords_check(keywords, pp->source)) {
1033  return ret;
1034  }
1035 
1036  if (ibp->is_tls && ! fta_tls_keywords_check(keywords, pp->source)) {
1037  return ret;
1038  }
1039 
1040  for (const string& key : keywords) {
1041  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
1042  }
1043 
1044  if (ibp->env_sample_qual == false && env_kwd) {
1045  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
1046  return ret;
1047  }
1048 
1049  div = static_cast<CEMBL_block::TDiv>(fta_StringMatch(ParFlat_Embl_DIV_array, bptr));
1050  if (div < 0) {
1051  ch = bptr[3];
1052  bptr[3] = '\0';
1053  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in Embl flatfile. Record rejected.", bptr);
1054  bptr[3] = ch;
1055  return ret;
1056  }
1057 
1058  /* Embl has recently (7-19-93, email) decided to change the name of
1059  * its "UNA"==10 division to "UNC"==16 (for "unclassified")
1060  */
1061  if (div == 16)
1062  div = CEMBL_block::eDiv_una;
1063 
1065 
1066  /* 06-10-96 new HUM division replaces the PRI
1067  * it's temporarily mapped to 'other' in asn.1 embl-block.
1068  * Divisions GSS, HUM, HTG, CON, ENV and MUS are mapped to other.
1069  */
1070  int thtg = (div == 18) ? CEMBL_block::eDiv_pri : div;
1071  gbdiv = ParFlat_GBDIV_array[thtg];
1072 
1073  if (div <= CEMBL_block::eDiv_sts)
1074  embl->SetDiv(div);
1075 
1076  const char* p = gbdiv.c_str();
1077  if (ibp->is_tpa &&
1078  (StringEqu(p, "EST") || StringEqu(p, "GSS") ||
1079  StringEqu(p, "PAT") || StringEqu(p, "HTG"))) {
1080  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p);
1081  return ret;
1082  }
1083 
1084  if (ibp->is_tsa && ! StringEqu(p, "TSA")) {
1085  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p);
1086  return ret;
1087  }
1088 
1089  cancelled = IsCancelled(embl->GetKeywords());
1090 
1091  if (div == 19) /* HTG */
1092  {
1093  if (! HasHtg(embl->GetKeywords())) {
1094  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
1095  return ret;
1096  }
1097  tempdiv = StringSave("HTG");
1098  } else
1099  tempdiv = nullptr;
1100 
1101  fta_check_htg_kwds(embl->SetKeywords(), pp->entrylist[pp->curindx], mol_info);
1102 
1103  DefVsHTGKeywords(mol_info.GetTech(), entry, ParFlat_DE, ParFlat_SQ, cancelled);
1104  if ((mol_info.GetTech() == CMolInfo::eTech_htgs_0 || mol_info.GetTech() == CMolInfo::eTech_htgs_1 ||
1105  mol_info.GetTech() == CMolInfo::eTech_htgs_2) &&
1106  ! gbdiv.empty()) {
1107  gbdiv.clear();
1108  }
1109 
1110  CheckHTGDivision(tempdiv, mol_info.GetTech());
1111  if (tempdiv)
1112  MemFree(tempdiv);
1113 
1114  i = 0;
1115  if (est_kwd)
1116  i++;
1117  if (sts_kwd)
1118  i++;
1119  if (gss_kwd)
1120  i++;
1121  if (ibp->htg > 0)
1122  i++;
1123  if (htc_kwd)
1124  i++;
1125  if (fli_kwd)
1126  i++;
1127  if (wgs_kwd)
1128  i++;
1129  if (env_kwd)
1130  i++;
1131  if (mga_kwd) {
1132  if (ibp->is_mga == false) {
1133  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
1134  return ret;
1135  }
1136  i++;
1137  } else if (ibp->is_mga) {
1138  ErrPostEx(SEV_REJECT, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
1139  return ret;
1140  }
1141  if (tpa_kwd) {
1142  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
1143  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
1144  return ret;
1145  }
1146  i++;
1147  } else if (ibp->is_tpa) {
1148  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
1149  return ret;
1150  }
1151 
1152  if (tsa_kwd) {
1153  if (ibp->is_tsa == false) {
1154  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
1155  return ret;
1156  }
1157  i++;
1158  } else if (ibp->is_tsa) {
1159  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
1160  return ret;
1161  }
1162  if (tls_kwd) {
1163  if (ibp->is_tls == false) {
1164  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
1165  return ret;
1166  }
1167  i++;
1168  } else if (ibp->is_tls) {
1169  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
1170  return ret;
1171  }
1172  if (i > 1) {
1173  if (i == 2 && ibp->htg > 0 && env_kwd)
1174  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
1175  else if ((i == 2 && wgs_kwd && tpa_kwd) ||
1176  (i == 2 && tsa_kwd && tpa_kwd)) {
1177  } else if (i != 2 || env_kwd == false ||
1178  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
1179  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
1180  return ret;
1181  }
1182  }
1183 
1184  if (wgs_kwd)
1185  i--;
1186  if (ibp->is_contig && i > 0 &&
1187  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
1188  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
1189  return ret;
1190  }
1191 
1192  CMolInfo::TTech tech = mol_info.GetTech();
1193  if (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
1194  tech == CMolInfo::eTech_htgs_2 || tech == CMolInfo::eTech_htgs_3) {
1195  RemoveHtgPhase(embl->SetKeywords());
1196  }
1197 
1198  size_t len = 0;
1199  bptr = xSrchNodeType(entry, ParFlat_KW, &len);
1200  if (bptr) {
1201  string kw = GetBlkDataReplaceNewLine(string_view(bptr, len), ParFlat_COL_DATA_EMBL);
1202 
1203  if (! est_kwd && kw.find("EST") != string::npos) {
1204  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw.c_str());
1205  }
1206  if (! sts_kwd && kw.find("STS") != string::npos) {
1207  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw.c_str());
1208  }
1209  if (! gss_kwd && kw.find("GSS") != string::npos) {
1210  ErrPostEx(SEV_WARNING, ERR_KEYWORD_GSSSubstring, "Keyword %s has substring GSS, but no official GSS keywords found", kw.c_str());
1211  }
1212  }
1213 
1214  if (! ibp->is_contig) {
1215  drop = false;
1216  CMolInfo::TTech tech = mol_info.GetTech();
1217 
1218  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->bases, pp->source, drop);
1219  if (tech != CMolInfo::eTech_unknown)
1220  mol_info.SetTech(tech);
1221  else
1222  mol_info.ResetTech();
1223 
1224  if (drop) {
1225  return ret;
1226  }
1227  } else if (! gbdiv.empty() && StringEqu(gbdiv.c_str(), "CON")) {
1228  gbdiv.clear();
1229  }
1230 
1231  bool is_htc_div = ! gbdiv.empty() && StringEqu(gbdiv.c_str(), "HTC");
1232  bool has_htc = HasHtc(embl->GetKeywords());
1233 
1234  if (is_htc_div && ! has_htc) {
1235  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
1236  return ret;
1237  }
1238  if (! is_htc_div && has_htc) {
1239  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
1240  return ret;
1241  }
1242 
1243  if (is_htc_div) {
1244  char* p;
1245  p = entry.mOffset + ParFlat_COL_DATA_EMBL; /* p points to 1st token */
1246  p = PointToNextToken(p); /* p points to 2nd token */
1247  p = PointToNextToken(p); /* p points to 3rd token */
1248 
1249  if (ibp->embl_new_ID) {
1250  p = PointToNextToken(p);
1251  p = PointToNextToken(p);
1252  } else if (StringEquNI(p, "circular", 8))
1253  p = PointToNextToken(p); /* p points to 4th token */
1254 
1255  if (StringEquN(p + 1, "s-", 2))
1256  p += 3;
1257  if (*p == 'm' || *p == 'r')
1258  p++;
1259  else if (StringEquN(p, "pre-", 4))
1260  p += 4;
1261  else if (StringEquN(p, "transcribed ", 12))
1262  p += 12;
1263 
1264  if (! StringEquN(p, "RNA", 3)) {
1265  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
1266  return ret;
1267  }
1268  }
1269 
1270  if (fli_kwd)
1272 
1273  /* will be used in flat file database
1274  */
1275  if (! gbdiv.empty()) {
1276  if (StringEqu(gbdiv.c_str(), "EST")) {
1277  ibp->EST = true;
1278  mol_info.SetTech(CMolInfo::eTech_est);
1279  } else if (StringEqu(gbdiv.c_str(), "STS")) {
1280  ibp->STS = true;
1281  mol_info.SetTech(CMolInfo::eTech_sts);
1282  } else if (StringEqu(gbdiv.c_str(), "GSS")) {
1283  ibp->GSS = true;
1284  mol_info.SetTech(CMolInfo::eTech_survey);
1285  } else if (StringEqu(gbdiv.c_str(), "HTC")) {
1286  ibp->HTC = true;
1287  mol_info.SetTech(CMolInfo::eTech_htc);
1288  gbdiv.clear();
1289  } else if (StringEqu(gbdiv.c_str(), "SYN") && bio_src &&
1290  bio_src->IsSetOrigin() && bio_src->GetOrigin() == CBioSource::eOrigin_synthetic) {
1291  gbdiv.clear();
1292  }
1293  } else if (mol_info.IsSetTech()) {
1294  if (mol_info.GetTech() == CMolInfo::eTech_est)
1295  ibp->EST = true;
1296  if (mol_info.GetTech() == CMolInfo::eTech_sts)
1297  ibp->STS = true;
1298  if (mol_info.GetTech() == CMolInfo::eTech_survey)
1299  ibp->GSS = true;
1300  if (mol_info.GetTech() == CMolInfo::eTech_htc)
1301  ibp->HTC = true;
1302  }
1303 
1304  if (mol_info.IsSetTech())
1305  fta_remove_keywords(mol_info.GetTech(), embl->SetKeywords());
1306 
1307  if (ibp->is_tpa)
1308  fta_remove_tpa_keywords(embl->SetKeywords());
1309 
1310  if (ibp->is_tsa)
1311  fta_remove_tsa_keywords(embl->SetKeywords(), pp->source);
1312 
1313  if (ibp->is_tls)
1314  fta_remove_tls_keywords(embl->SetKeywords(), pp->source);
1315 
1316  if (bio_src && bio_src->IsSetSubtype()) {
1317  for (const auto& subtype : bio_src->GetSubtype()) {
1318  if (subtype->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
1319  fta_remove_env_keywords(embl->SetKeywords());
1320  break;
1321  }
1322  }
1323  }
1324 
1325 
1326  CRef<CDate_std> std_creation_date,
1327  std_update_date;
1328 
1329  GetEmblDate(pp->source, entry, std_creation_date, std_update_date);
1330 
1331  embl->SetCreation_date().SetStd(*std_creation_date);
1332  embl->SetUpdate_date().SetStd(*std_update_date);
1333 
1334  ibp->wgssec[0] = '\0';
1335  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, embl->SetExtra_acc());
1336 
1337  GetEmblBlockXref(entry, nullptr, nullptr, dr_ena, dr_biosample, &ibp->drop, *embl);
1338 
1339  if (StringEqu(dataclass, "ANN") || StringEqu(dataclass, "CON")) {
1340  if (StringLen(ibp->acnum) == 8 &&
1341  (StringEquN(ibp->acnum, "CT", 2) ||
1342  StringEquN(ibp->acnum, "CU", 2))) {
1343  bool found = false;
1344  for (const string& acc : embl->SetExtra_acc()) {
1345  if (fta_if_wgs_acc(acc) == 0 &&
1346  (acc[0] == 'C' || acc[0] == 'U')) {
1347  found = true;
1348  break;
1349  }
1350  }
1351  if (found)
1352  mol_info.SetTech(CMolInfo::eTech_wgs);
1353  }
1354  }
1355 
1356  return embl;
1357 }
1358 
1359 
1360 static bool s_DuplicatesBiosource(const CBioSource& biosource, const string& gbdiv)
1361 {
1362  return (biosource.IsSetOrg() &&
1363  biosource.GetOrg().IsSetOrgname() &&
1364  biosource.GetOrg().GetOrgname().IsSetDiv() &&
1365  NStr::Equal(biosource.GetOrg().GetOrgname().GetDiv(), gbdiv));
1366 }
1367 
1368 /**********************************************************/
1369 static CRef<CGB_block> GetEmblGBBlock(ParserPtr pp, const DataBlk& entry, const string& gbdiv, CBioSource* bio_src)
1370 {
1371  IndexblkPtr ibp;
1372 
1373  CRef<CGB_block> gbb(new CGB_block);
1374 
1375  ibp = pp->entrylist[pp->curindx];
1376 
1377  if (pp->source == Parser::ESource::NCBI) {
1378  ibp->wgssec[0] = '\0';
1379  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, gbb->SetExtra_accessions());
1380  pp->KeywordParser().Cleanup();
1381  gbb->SetKeywords() = pp->KeywordParser().KeywordList();
1382  }
1383 
1384  if (! gbdiv.empty()) {
1385  if (NStr::EqualNocase(gbdiv.c_str(), "ENV") &&
1386  bio_src && bio_src->IsSetSubtype()) {
1387  const auto& subtype = bio_src->GetSubtype();
1388  const auto it =
1389  find_if(begin(subtype), end(subtype), [](auto pSubSource) {
1390  return pSubSource->GetSubtype() == CSubSource::eSubtype_environmental_sample;
1391  });
1392  if ((it == subtype.end()) && ! s_DuplicatesBiosource(*bio_src, gbdiv)) { // Not found
1393  gbb->SetDiv(gbdiv);
1394  }
1395  } else if (! bio_src ||
1396  ! s_DuplicatesBiosource(*bio_src, gbdiv)) {
1397  gbb->SetDiv(gbdiv);
1398  }
1399  }
1400 
1401  if (! gbb->IsSetExtra_accessions() && ! gbb->IsSetKeywords() && ! gbb->IsSetDiv())
1402  gbb.Reset();
1403 
1404  return gbb;
1405 }
1406 
1407 /**********************************************************
1408  *
1409  * static MolInfoPtr GetEmblMolInfo(entry, pp, orp):
1410  *
1411  * 3rd or 4th token in the ID line.
1412  * OG line.
1413  *
1414  **********************************************************/
1415 static CRef<CMolInfo> GetEmblMolInfo(ParserPtr pp, const DataBlk& entry, const COrg_ref* org_ref)
1416 {
1417  IndexblkPtr ibp;
1418 
1419  char* bptr;
1420  char* p;
1421  char* q;
1422  char* r;
1423  Int4 i;
1424 
1425  ibp = pp->entrylist[pp->curindx];
1426  bptr = entry.mOffset + ParFlat_COL_DATA_EMBL; /* bptr points to 1st
1427  token */
1428  bptr = PointToNextToken(bptr); /* bptr points to 2nd token */
1429  bptr = PointToNextToken(bptr); /* bptr points to 3rd token */
1430 
1431  if (StringEquNI(bptr, "circular", 8) || ibp->embl_new_ID)
1432  bptr = PointToNextToken(bptr); /* bptr points to 4th token */
1433  if (ibp->embl_new_ID)
1434  bptr = PointToNextToken(bptr); /* bptr points to 5th token */
1435 
1436  r = StringChr(bptr, ';');
1437  if (r)
1438  *r = '\0';
1439 
1440  for (i = 0, q = bptr; *q != '\0'; q++) {
1441  if (*q != ' ')
1442  continue;
1443 
1444  while (*q == ' ')
1445  q++;
1446  if (*q != '\0')
1447  i++;
1448  q--;
1449  }
1450 
1451  if (r)
1452  for (p = r + 1; *p == ' ' || *p == ';';)
1453  p++;
1454  else
1455  p = bptr;
1456 
1457  CRef<CMolInfo> mol_info(new CMolInfo);
1458 
1459  if (StringEquN(p, "EST", 3))
1460  mol_info->SetTech(CMolInfo::eTech_est);
1461  else if (ibp->is_wgs) {
1462  if (ibp->is_tsa)
1463  mol_info->SetTech(CMolInfo::eTech_tsa);
1464  else if (ibp->is_tls)
1465  mol_info->SetTech(CMolInfo::eTech_targeted);
1466  else
1467  mol_info->SetTech(CMolInfo::eTech_wgs);
1468  } else if (ibp->is_tsa)
1469  mol_info->SetTech(CMolInfo::eTech_tsa);
1470  else if (ibp->is_tls)
1471  mol_info->SetTech(CMolInfo::eTech_targeted);
1472 
1473  if (i == 0 && CheckSTRAND(bptr) >= 0)
1474  bptr = bptr + 3;
1475 
1476  GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), bptr, pp, entry, org_ref);
1477  if (mol_info->GetBiomol() == CMolInfo::eBiomol_unknown) // not set
1478  mol_info->ResetBiomol();
1479 
1480  if (r)
1481  *r = ';';
1482 
1483  return mol_info;
1484 }
1485 
1486 /**********************************************************/
1488 {
1489  CRef<CUser_field> field;
1490  if (! tag || lst.empty())
1491  return field;
1492 
1493  field.Reset(new CUser_field);
1494  field->SetLabel().SetStr(tag);
1495  field->SetNum(static_cast<CUser_field::TNum>(lst.size()));
1496 
1497  for (const string& item : lst) {
1498  field->SetData().SetStrs().push_back(item);
1499  }
1500 
1501  return field;
1502 }
1503 
1504 /**********************************************************/
1506 {
1507  bool got = false;
1508 
1509  if (dr_ena.empty() && dr_biosample.empty())
1510  return;
1511 
1512  CUser_object* user_obj_ptr = nullptr;
1513 
1514  for (auto& descr : descrs) {
1515  if (! descr->IsUser() || ! descr->GetUser().IsSetType())
1516  continue;
1517 
1518  const CObject_id& obj_id = descr->GetUser().GetType();
1519 
1520  if (obj_id.IsStr() && obj_id.GetStr() == "DBLink") {
1521  user_obj_ptr = &descr->SetUser();
1522  got = true;
1523  break;
1524  }
1525  }
1526 
1527  CRef<CUser_field> field_bs;
1528  if (! dr_biosample.empty())
1529  field_bs = fta_create_user_field("BioSample", dr_biosample);
1530 
1531  CRef<CUser_field> field_ena;
1532  if (! dr_ena.empty()) {
1533  field_ena = fta_create_user_field("Sequence Read Archive", dr_ena);
1534  }
1535 
1536  if (field_bs.Empty() && field_ena.Empty())
1537  return;
1538 
1539  CRef<CUser_object> user_obj;
1540 
1541  if (! got) {
1542  user_obj.Reset(new CUser_object);
1543  user_obj->SetType().SetStr("DBLink");
1544 
1545  user_obj_ptr = user_obj.GetPointer();
1546  }
1547 
1548  if (field_bs.NotEmpty())
1549  user_obj_ptr->SetData().push_back(field_bs);
1550  if (field_ena.NotEmpty())
1551  user_obj_ptr->SetData().push_back(field_ena);
1552 
1553  if (! got) {
1554  CRef<CSeqdesc> descr(new CSeqdesc);
1555  descr->SetUser(*user_obj);
1556  descrs.push_back(descr);
1557  }
1558 
1559  if (! got)
1560  dbuop = user_obj;
1561  else {
1562  dbuop.Reset(new CUser_object);
1563  dbuop->Assign(*user_obj_ptr);
1564  }
1565 }
1566 
1567 /**********************************************************/
1568 static void fta_create_imgt_misc_feat(CBioseq& bioseq, CEMBL_block& embl_block, IndexblkPtr ibp)
1569 {
1570  if (! embl_block.IsSetXref())
1571  return;
1572 
1573  CSeq_feat::TDbxref xrefs;
1574  for (const auto& xref : embl_block.GetXref()) {
1575  if (! xref->IsSetDbname() || ! xref->GetDbname().IsName() ||
1576  ! StringEquN(xref->GetDbname().GetName().c_str(), "IMGT/", 5))
1577  continue;
1578 
1579  bool empty = true;
1580  for (const auto& id : xref->GetId()) {
1581  if (id->IsStr() && ! id->GetStr().empty()) {
1582  empty = false;
1583  break;
1584  }
1585  }
1586 
1587  if (empty)
1588  continue;
1589 
1590  CRef<CDbtag> tag(new CDbtag);
1591  tag->SetDb(xref->GetDbname().GetName());
1592 
1593  string& id_str = tag->SetTag().SetStr();
1594 
1595  bool need_delimiter = false;
1596  for (const auto& id : xref->GetId()) {
1597  if (id->IsStr() && ! id->GetStr().empty()) {
1598  if (need_delimiter)
1599  id_str += "; ";
1600  else
1601  need_delimiter = true;
1602 
1603  id_str += id->GetStr();
1604  }
1605  }
1606 
1607  xrefs.push_back(tag);
1608  }
1609 
1610  if (xrefs.empty())
1611  return;
1612 
1613  CRef<CSeq_feat> feat(new CSeq_feat);
1614  CImp_feat& imp = feat->SetData().SetImp();
1615  imp.SetKey("misc_feature");
1616  feat->SetDbxref().swap(xrefs);
1617  feat->SetLocation(*fta_get_seqloc_int_whole(*(*bioseq.SetId().begin()), ibp->bases));
1618 
1619  CBioseq::TAnnot& annot = bioseq.SetAnnot();
1620  if (annot.empty() || ! (*annot.begin())->IsFtable()) {
1621  CRef<CSeq_annot> new_annot(new CSeq_annot);
1622  new_annot->SetData().SetFtable().push_back(feat);
1623 
1624  annot.push_back(new_annot);
1625  } else {
1626  CSeq_annot& old_annot = *(*annot.begin());
1627  old_annot.SetData().SetFtable().push_front(feat);
1628  }
1629 }
1630 
1631 static bool s_HasTPAPrefix(const CTempString& line)
1632 {
1633  return NStr::StartsWith(line, "TPA:") ||
1634  NStr::StartsWith(line, "TPA_exp:") ||
1635  NStr::StartsWith(line, "TPA_inf:") ||
1636  NStr::StartsWith(line, "TPA_asm:") ||
1637  NStr::StartsWith(line, "TPA_reasm:") ||
1638  NStr::StartsWith(line, "TPA_specdb:");
1639 }
1640 
1641 /**********************************************************/
1642 static void GetEmblDescr(ParserPtr pp, const DataBlk& entry, CBioseq& bioseq)
1643 {
1644  IndexblkPtr ibp;
1645  DataBlkPtr dbp;
1646 
1647  char* offset;
1648  string gbdiv;
1649 
1650  bool is_htg = false;
1651 
1652  size_t len;
1653 
1654  ibp = pp->entrylist[pp->curindx];
1655 
1656  /* pp->source == NCBI then no embl-block, only GB-block
1657  */
1658 
1659  /* DE data ==> descr_title
1660  */
1661  offset = xSrchNodeType(entry, ParFlat_DE, &len);
1662 
1663  string title;
1664 
1665  if (offset) {
1666  string str = GetBlkDataReplaceNewLine(string_view(offset, len), ParFlat_COL_DATA_EMBL);
1667 
1668  for (size_t pos = 0; pos < str.size();) {
1669  pos = str.find(";;", pos);
1670  if (pos == string::npos)
1671  break;
1672  ++pos;
1673  size_t j = 0;
1674  for (size_t i = pos; i < str.size() && str[i] == ';'; ++i)
1675  ++j;
1676  str.erase(pos, j);
1677  }
1678 
1679  while (! str.empty()) {
1680  char c = str.back();
1681  if (c == ' ' || c == ';')
1682  str.pop_back();
1683  else
1684  break;
1685  }
1686 
1687  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL &&
1688  StringEquN(str.c_str(), "TPA:", 4)) {
1689  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA:\" prefix is present on its definition line. Entry dropped.");
1690  ibp->drop = true;
1691  return;
1692  }
1693 
1694  if (ibp->is_tsa == false && StringEquN(str.c_str(), "TSA:", 4)) {
1695  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA:\" prefix is present on its definition line. Entry dropped.");
1696  ibp->drop = true;
1697  return;
1698  }
1699 
1700  if (ibp->is_tls == false && StringEquN(str.c_str(), "TLS:", 4)) {
1701  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS:\" prefix is present on its definition line. Entry dropped.");
1702  ibp->drop = true;
1703  return;
1704  }
1705 
1706  if (StringEquN(str.c_str(), "TPA:", 4)) {
1707  string str1;
1708  if (ibp->assembly)
1709  str1 = "TPA_asm:";
1710  else if (ibp->specialist_db)
1711  str1 = "TPA_specdb:";
1712  else if (ibp->inferential)
1713  str1 = "TPA_inf:";
1714  else if (ibp->experimental)
1715  str1 = "TPA_exp:";
1716 
1717  if (! str1.empty())
1718  str.replace(0, 4, str1);
1719  }
1720 
1721  CRef<CSeqdesc> descr(new CSeqdesc);
1722  descr->SetTitle(str);
1723  bioseq.SetDescr().Set().push_back(descr);
1724 
1725  title = str;
1726  }
1727 
1728  offset = xSrchNodeType(entry, ParFlat_PR, &len);
1729  if (offset)
1731 
1732  if (ibp->is_tpa &&
1733  (title.empty() || ! s_HasTPAPrefix(title))) {
1734  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA:\" prefix on its definition line. Entry dropped.");
1735  ibp->drop = true;
1736  return;
1737  }
1738 
1739  if (ibp->is_tsa && ! ibp->is_tpa &&
1740  (title.empty() || ! StringEquN(title.c_str(), "TSA:", 4))) {
1741  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA:\" prefix on its definition line. Entry dropped.");
1742  ibp->drop = true;
1743  return;
1744  }
1745 
1746  if (ibp->is_tls && (title.empty() || ! StringEquN(title.c_str(), "TLS:", 4))) {
1747  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS:\" prefix on its definition line. Entry dropped.");
1748  ibp->drop = true;
1749  return;
1750  }
1751 
1752  /* RN data ==> pub should be before GBblock because we need patent ref
1753  */
1754  dbp = TrackNodeType(entry, ParFlat_REF_END);
1755  for (; dbp; dbp = dbp->mpNext) {
1756  if (dbp->mType != ParFlat_REF_END)
1757  continue;
1758 
1759  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA_EMBL);
1760  if (pubdesc.NotEmpty()) {
1761  CRef<CSeqdesc> descr(new CSeqdesc);
1762  descr->SetPub(*pubdesc);
1763  bioseq.SetDescr().Set().push_back(descr);
1764  }
1765  }
1766 
1767  dbp = TrackNodeType(entry, ParFlat_REF_NO_TARGET);
1768  for (; dbp; dbp = dbp->mpNext) {
1769  if (dbp->mType != ParFlat_REF_NO_TARGET)
1770  continue;
1771 
1772  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, ParFlat_COL_DATA_EMBL);
1773  if (pubdesc.NotEmpty()) {
1774  CRef<CSeqdesc> descr(new CSeqdesc);
1775  descr->SetPub(*pubdesc);
1776  bioseq.SetDescr().Set().push_back(descr);
1777  }
1778  }
1779 
1780  /* OS data ==> descr_org
1781  */
1782  CBioSource* bio_src = nullptr;
1783  COrg_ref* org_ref = nullptr;
1784 
1785  for (auto& descr : bioseq.SetDescr().Set()) {
1786  if (descr->IsSource()) {
1787  bio_src = &(descr->SetSource());
1788  if (bio_src->IsSetOrg())
1789  org_ref = &bio_src->SetOrg();
1790  break;
1791  }
1792  }
1793 
1794  /* MolInfo, 3rd or 4th token in the ID line
1795  */
1796  CRef<CMolInfo> mol_info = GetEmblMolInfo(pp, entry, org_ref);
1797 
1798  TStringList dr_ena,
1799  dr_biosample;
1800 
1801  CRef<CEMBL_block> embl_block =
1802  GetDescrEmblBlock(pp, entry, *mol_info, gbdiv, bio_src, dr_ena, dr_biosample);
1803 
1804  if (pp->source == Parser::ESource::EMBL && embl_block.NotEmpty())
1805  fta_create_imgt_misc_feat(bioseq, *embl_block, ibp);
1806 
1807  if ((pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL) &&
1808  ibp->is_contig && ! mol_info->IsSetTech()) {
1809  CMolInfo::TTech tech = fta_check_con_for_wgs(bioseq);
1810  if (tech == CMolInfo::eTech_unknown)
1811  mol_info->ResetTech();
1812  else
1813  mol_info->SetTech(tech);
1814  }
1815 
1816  if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
1817  CRef<CSeqdesc> descr(new CSeqdesc);
1818  descr->SetMolinfo(*mol_info);
1819  bioseq.SetDescr().Set().push_back(descr);
1820 
1821  if (mol_info->IsSetTech() && (mol_info->GetTech() == CMolInfo::eTech_htgs_0 || mol_info->GetTech() == CMolInfo::eTech_htgs_1 ||
1822  mol_info->GetTech() == CMolInfo::eTech_htgs_2))
1823  is_htg = true;
1824  } else {
1825  mol_info.Reset();
1826  }
1827 
1828  CRef<CUser_object> dbuop;
1829  if (! dr_ena.empty() || ! dr_biosample.empty())
1830  fta_build_ena_user_object(bioseq.SetDescr().Set(), dr_ena, dr_biosample, dbuop);
1831 
1832  if (embl_block.Empty()) {
1833  ibp->drop = true;
1834  return;
1835  }
1836 
1837  if (StringEquNI(ibp->division, "CON", 3))
1838  fta_add_hist(pp, bioseq, embl_block->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, true, ibp->acnum);
1839  else
1840  fta_add_hist(pp, bioseq, embl_block->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, false, ibp->acnum);
1841 
1842  if (embl_block->GetExtra_acc().empty())
1843  embl_block->ResetExtra_acc();
1844 
1845  CRef<CGB_block> gbb;
1846 
1847  if (pp->source == Parser::ESource::NCBI || (! embl_block->IsSetDiv() && ! gbdiv.empty())) {
1848  gbb = GetEmblGBBlock(pp, entry, gbdiv, bio_src); /* GB-block */
1849  }
1850 
1851  gbdiv.clear();
1852 
1853  bool hasEmblBlock = false;
1854  if (pp->source != Parser::ESource::NCBI) {
1855  CRef<CSeqdesc> descr(new CSeqdesc);
1856  descr->SetEmbl(*embl_block);
1857  bioseq.SetDescr().Set().push_back(descr);
1858  hasEmblBlock = true;
1859  }
1860 
1861  offset = xSrchNodeType(entry, ParFlat_AH, &len);
1862  if (! offset && ibp->is_tpa && ibp->is_wgs == false) {
1863  if (ibp->inferential || ibp->experimental) {
1864  if (! fta_dblink_has_sra(dbuop)) {
1865  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA:%s record lacks both AH/PRIMARY linetype and Sequence Read Archive links. Entry dropped.", (ibp->inferential == false) ? "experimental" : "inferential");
1866  ibp->drop = true;
1867  return;
1868  }
1869  } else if (ibp->specialist_db == false) {
1870  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA record lacks required AH/PRIMARY linetype. Entry dropped.");
1871  ibp->drop = true;
1872  return;
1873  }
1874  }
1875 
1876  if (offset && len > 0 &&
1877  fta_parse_tpa_tsa_block(bioseq, offset, ibp->acnum, ibp->vernum, len, ParFlat_COL_DATA_EMBL, ibp->is_tpa) == false) {
1878  ibp->drop = true;
1879  return;
1880  }
1881 
1882  /* GB-block and div
1883  */
1884  if (pp->taxserver == 1) {
1885  if (hasEmblBlock && embl_block->IsSetDiv() && embl_block->GetDiv() < 15) {
1886  if (org_ref && org_ref->IsSetOrgname() && ! org_ref->GetOrgname().IsSetDiv() &&
1887  (! org_ref->IsSetDb() || ! fta_orgref_has_taxid(org_ref->GetDb()))) {
1888  org_ref->SetOrgname().SetDiv(ParFlat_GBDIV_array[embl_block->GetDiv()]);
1889  }
1890 
1891  if (bioseq.IsSetAnnot()) {
1892  for (auto& pAnnot : bioseq.SetAnnot()) {
1893  if (pAnnot->IsFtable()) {
1894  for (auto& pFeat : pAnnot->SetData().SetFtable()) {
1895  if (pFeat->IsSetData() && pFeat->SetData().IsBiosrc()) {
1896  auto& biosrc = pFeat->SetData().SetBiosrc();
1897  if (biosrc.IsSetOrg() &&
1898  (! biosrc.GetOrg().IsSetDb() ||
1899  ! fta_orgref_has_taxid(biosrc.GetOrg().GetDb()))) {
1900  biosrc.SetOrg().SetOrgname().SetDiv(ParFlat_GBDIV_array[embl_block->GetDiv()]);
1901  }
1902  }
1903  }
1904  }
1905  }
1906  }
1907  } else if (gbb && gbb->IsSetDiv()) {
1908  fta_fix_orgref_div(bioseq.GetAnnot(), org_ref, *gbb);
1909  }
1910  }
1911 
1912  if (gbb) {
1913  CRef<CSeqdesc> descr(new CSeqdesc);
1914  descr->SetGenbank(*gbb);
1915  bioseq.SetDescr().Set().push_back(descr);
1916  }
1917 
1918  /* all CC data ==> comment
1919  */
1920  offset = xSrchNodeType(entry, ParFlat_CC, &len);
1921  if (offset && len > 0) {
1922  char* str = GetDescrComment(offset, len, ParFlat_COL_DATA_EMBL, (pp->xml_comp ? false : is_htg), ibp->is_pat);
1923  if (str) {
1924  bool bad = false;
1925  TUserObjVector user_objs;
1926 
1927  fta_parse_structured_comment(str, bad, user_objs);
1928  if (bad) {
1929  ibp->drop = true;
1930  MemFree(str);
1931  return;
1932  }
1933 
1934  for (auto& user_obj : user_objs) {
1935  CRef<CSeqdesc> descr(new CSeqdesc);
1936  descr->SetUser(*user_obj);
1937  bioseq.SetDescr().Set().push_back(descr);
1938  }
1939 
1940  if (pp->xml_comp) {
1941  char* p;
1942  char* q;
1943  for (q = str, p = q; *p != '\0';) {
1944  if (*p == ';' && (p[1] == ' ' || p[1] == '~'))
1945  *p = ' ';
1946  if (*p == '~' || *p == ' ') {
1947  *q++ = ' ';
1948  for (p++; *p == ' ' || *p == '~';)
1949  p++;
1950  } else
1951  *q++ = *p++;
1952  }
1953  *q = '\0';
1954  }
1955 
1956  if (str[0] != 0) {
1957  CRef<CSeqdesc> descr(new CSeqdesc);
1958  descr->SetComment(str);
1959  bioseq.SetDescr().Set().push_back(descr);
1960  }
1961  MemFree(str);
1962  }
1963  }
1964 
1965  if (pp->no_date)
1966  return;
1967 
1968  /* DT data ==> create-date, update-date
1969  */
1970 
1971  CRef<CDate_std> std_creation_date,
1972  std_update_date;
1973  GetEmblDate(pp->source, entry, std_creation_date, std_update_date);
1974  if (std_creation_date.NotEmpty()) {
1975  CRef<CSeqdesc> descr(new CSeqdesc);
1976  descr->SetCreate_date().SetStd(*std_creation_date);
1977  bioseq.SetDescr().Set().push_back(descr);
1978  }
1979 
1980  if (std_update_date.NotEmpty()) {
1981  CRef<CSeqdesc> descr(new CSeqdesc);
1982  descr->SetUpdate_date().SetStd(*std_update_date);
1983  bioseq.SetDescr().Set().push_back(descr);
1984 
1985  if (std_creation_date.NotEmpty() && std_creation_date->Compare(*std_update_date) == CDate::eCompare_after) {
1986  string crdate_str, update_str;
1987  std_creation_date->GetDate(&crdate_str, "%2M-%2D-%4Y");
1988  std_update_date->GetDate(&crdate_str, "%2M-%2D-%4Y");
1989  ErrPostEx(SEV_ERROR, ERR_DATE_IllegalDate, "Update-date \"%s\" precedes create-date \"%s\".", update_str.c_str(), crdate_str.c_str());
1990  }
1991  }
1992 }
1993 
1994 /**********************************************************/
1995 static void FakeEmblBioSources(const DataBlk& entry, CBioseq& bioseq)
1996 {
1997  DataBlkPtr dbp;
1998  DataBlkPtr subdbp;
1999 
2000  char* p;
2001  char* q;
2002 
2003  dbp = TrackNodeType(entry, ParFlat_OS);
2004  if (! dbp) {
2005  ErrPostStr(SEV_WARNING, ERR_ORGANISM_NoOrganism, "No Organism data in Embl format file");
2006  return;
2007  }
2008 
2009  for (; dbp; dbp = dbp->mpNext) {
2010  if (dbp->mType != ParFlat_OS)
2011  continue;
2012 
2013  CRef<COrg_ref> org_ref = GetEmblOrgRef(dbp);
2014  if (org_ref.Empty())
2015  continue;
2016 
2017  CRef<CBioSource> bio_src(new CBioSource);
2018  bio_src->SetOrg(*org_ref);
2019 
2020  string& taxname_str = org_ref->SetTaxname();
2021  size_t off_pos = 0;
2022  if (GetGenomeInfo(*bio_src, taxname_str) && bio_src->GetGenome() != CBioSource::eGenome_plasmid) {
2023  while (taxname_str[off_pos] != ' ' && off_pos < taxname_str.size())
2024  ++off_pos;
2025  while (taxname_str[off_pos] == ' ' && off_pos < taxname_str.size())
2026  ++off_pos;
2027  }
2028 
2029  taxname_str = taxname_str.substr(off_pos);
2030  if (taxname_str == "Unknown.") {
2031  taxname_str = taxname_str.substr(0, taxname_str.size() - 1);
2032  }
2033 
2034  subdbp = static_cast<DataBlk*>(dbp->mpData);
2035  for (; subdbp; subdbp = subdbp->mpNext) {
2036  if (subdbp->mType == ParFlat_OG) {
2037  GetGenomeInfo(*bio_src, subdbp->mOffset + ParFlat_COL_DATA_EMBL);
2038  continue;
2039  }
2040  if (subdbp->mType != ParFlat_OC || ! subdbp->mOffset ||
2041  subdbp->len < ParFlat_COL_DATA_EMBL)
2042  continue;
2043 
2044  q = StringSave(string_view(subdbp->mOffset + ParFlat_COL_DATA_EMBL, subdbp->len - ParFlat_COL_DATA_EMBL));
2045  for (p = q; p;) {
2046  p = StringStr(p, "\nOC ");
2047  if (p)
2048  fta_StringCpy(p, p + 5);
2049  }
2050  for (p = q; *p != '\0';)
2051  p++;
2052  if (p == q) {
2053  MemFree(q);
2054  continue;
2055  }
2056  for (p--;; p--) {
2057  if (*p != ' ' && *p != '\t' && *p != '\n' && *p != '.' &&
2058  *p != ';') {
2059  p++;
2060  break;
2061  }
2062  if (p == q)
2063  break;
2064  }
2065  if (p == q) {
2066  MemFree(q);
2067  continue;
2068  }
2069  *p = '\0';
2070 
2071  if (! org_ref->IsSetOrgname()) {
2072  org_ref->SetOrgname().SetLineage(q);
2073  }
2074  MemFree(q);
2075  }
2076 
2077  CRef<CSeqdesc> descr(new CSeqdesc);
2078  descr->SetSource(*bio_src);
2079  bioseq.SetDescr().Set().push_front(descr);
2080  }
2081 }
2082 
2083 /**********************************************************/
2084 static void EmblGetDivision(IndexblkPtr ibp, const DataBlk& entry)
2085 {
2086  const char* p;
2087  const char* q;
2088 
2089  p = StringChr(entry.mOffset, ';');
2090  if (! p)
2091  p = entry.mOffset;
2092  else {
2093  q = StringChr(p + 1, ';');
2094  if (q)
2095  p = q;
2096  }
2097  while (*p == ' ' || *p == ';')
2098  p++;
2099 
2100  StringNCpy(ibp->division, p, 3);
2101  ibp->division[3] = '\0';
2102 }
2103 
2104 /**********************************************************/
2105 static void EmblGetDivisionNewID(IndexblkPtr ibp, const DataBlk& entry)
2106 {
2107  const char* p;
2108  Int4 i;
2109 
2110  for (i = 0, p = entry.mOffset; *p != '\0' && i < 4; p++)
2111  if (*p == ';' && p[1] == ' ')
2112  i++;
2113 
2114  while (*p == ' ')
2115  p++;
2116 
2118  if (i < 0) {
2119  p = StringChr(p, ';');
2120  if (p)
2121  for (p++; *p == ' ';)
2122  p++;
2123  } else if (i == 0)
2124  p = "CON";
2125 
2126  if (! p)
2127  p = " ";
2128 
2129  StringNCpy(ibp->division, p, 3);
2130  ibp->division[3] = '\0';
2131 }
2132 
2133 /**********************************************************
2134  *
2135  * bool EmblAscii(pp):
2136  *
2137  * Return FALSE if allocate entry block failed.
2138  *
2139  **********************************************************/
2141 {
 // Body of EmblAscii: converts every indexed entry in pp->entrylist.
 //
 // For each entry that is not already marked `drop`, the loop below loads
 // the entry text (LoadEntry), runs GetEmblBlock over it, builds a CBioseq
 // (CreateEntryBioseq), then calls GetEmblInst, LoadFeat and GetEmblDescr,
 // attaches quality scores, and finally either outputs the entry at once
 // (pp->segment == false) or keeps it in seq_entries so that the whole
 // segmented set is output after the loop.
 //
 // The only `return false` is taken when LoadEntry() yields no entry.
 // Every other failure marks the entry as dropped and moves on to the next
 // one; the function then returns true after posting the summary message.
 //
 // NOTE(review): the embedded listing numbers skip 2170 and 2394, so two
 // lines appear to be missing from this extract (marked below).
2142  Int2 curkw;
2143  Int4 i;
2144  Int4 imax;
2145  Int4 total = 0;
2146  char* ptr;
2147  char* eptr;
2148 
2149  // DataBlkPtr entry;
2150  EntryBlkPtr ebp;
2151  TEntryList seq_entries;
2152  CSeq_loc locs;
2153 
2154  bool reject_set;
2155  bool seq_long = false;
2156  IndexblkPtr ibp;
2157 
2158  auto dnaconv = GetDNAConv(); /* set up sequence alphabets */
2159 
2160  for (imax = pp->indx, i = 0; i < imax; i++) {
2161  pp->curindx = i;
2162  ibp = pp->entrylist[i];
2163 
2164  err_install(ibp, pp->accver);
2165  if (! ibp->drop) {
 // pEntry owns the loaded entry and releases it through xFreeEntry on
 // every exit from this iteration, including the `continue` paths.
2166  unique_ptr<DataBlk, decltype(&xFreeEntry)> pEntry(
2167  LoadEntry(pp, ibp->offset, ibp->len), &xFreeEntry);
2168  // pEntry.reset(LoadEntry(pp, ibp->offset, ibp->len));
2169  if (! pEntry) {
 // NOTE(review): listing line 2170 is missing from this extract.
2171  return false;
2172  }
2173  ebp = static_cast<EntryBlk*>(pEntry->mpData);
2174  ptr = pEntry->mOffset; /* points to beginning of the
2175  memory line */
2176  eptr = ptr + pEntry->len;
2177  curkw = ParFlat_ID;
2178 
2179  // TODO: below is a potentially infinite cycle!!!!
 // The loop ends only when GetEmblBlock stores ParFlatEM_END in curkw;
 // nothing here bounds ptr against eptr.
2180  while (curkw != ParFlatEM_END) {
2181  /* ptr points to current keyword's memory line */
2182  ptr = GetEmblBlock(&ebp->chain, ptr, &curkw, pp->format, eptr);
2183  }
2184 
2185  if (ibp->embl_new_ID)
2186  EmblGetDivisionNewID(ibp, *pEntry);
2187  else
2188  EmblGetDivision(ibp, *pEntry);
2189 
 // A "TSA" division always sets is_tsa; the warning is posted only when
 // the index entry did not expect one (tsa_allowed == false).
2190  if (StringEqu(ibp->division, "TSA")) {
2191  if (ibp->tsa_allowed == false)
2192  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
2193  ibp->is_tsa = true;
2194  }
2195 
 // NOTE(review): unlike the later failure paths, this one posts
 // ERR_ENTRY_Skipped without checking pp->segment; in segmented mode the
 // rejection branch after the loop posts the same message again.
2196  if (! CheckEmblContigEverywhere(ibp, pp->source)) {
2197  // if (ibp->drop) {
2198  ibp->drop = true;
2199  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2200  continue;
2201  }
2202 
 // The block scan stopped at or beyond eptr: reported as a missing
 // end-of-entry marker.
2203  if (ptr >= eptr) {
2204  ibp->drop = true;
2205  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd, "Missing end of the entry, entry dropped");
2206  if (pp->segment == false) {
2207  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2208  }
2209  continue;
2210  }
2211  GetEmblSubBlock(ibp->bases, pp->source, *pEntry);
2212 
2213  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
2214  AddNIDSeqId(*bioseq, *pEntry, ParFlat_NI, ParFlat_COL_DATA_EMBL, pp->source);
2215 
 // NOTE(review): the bioseq is added to the scope here, but every
 // `continue` below skips the GetScope().ResetHistory() call at the end
 // of this branch - confirm that leaving it in the scope is intended.
2216  ebp->seq_entry.Reset(new CSeq_entry);
2217  ebp->seq_entry->SetSeq(*bioseq);
2218  GetScope().AddBioseq(*bioseq);
2219 
2220  if (! pp->accver) {
2221  GetReleaseInfo(*pEntry);
2222  }
2223  if (! GetEmblInst(pp, *pEntry, dnaconv.get())) {
2224  ibp->drop = true;
2225  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data, entry dropped");
2226  if (pp->segment == false) {
2227  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2228  }
2229  continue;
2230  }
2231 
2232  FakeEmblBioSources(*pEntry, *bioseq);
2233  LoadFeat(pp, *pEntry, *bioseq);
2234 
 // ibp->drop can only have become true inside the calls above, since
 // this branch is entered with drop == false.
2235  if (! bioseq->IsSetAnnot() && ibp->drop) {
2236  if (pp->segment == false) {
2237  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2238  }
2239  continue;
2240  }
2241 
2242  GetEmblDescr(pp, *pEntry, *bioseq);
2243 
2244  if (ibp->drop) {
2245  if (pp->segment == false) {
2246  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2247  }
2248  continue;
2249  }
2250 
2251  fta_set_molinfo_completeness(*bioseq, ibp);
2252 
2253  if (ibp->is_tsa)
2254  fta_tsa_tls_comment_dblink_check(*bioseq, true);
2255 
2256  if (ibp->is_tls)
2257  fta_tsa_tls_comment_dblink_check(*bioseq, false);
2258 
 // Nucleotide sequences only. For a raw sequence, gap features take
 // precedence over the htg / DDBJ-patent SeqToDelta conversion; for any
 // other representation only the gap features are applied.
2259  if (bioseq->GetInst().IsNa()) {
2260  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
2261  if (ibp->gaps)
2262  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
2263  else if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
2264  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
2265  SeqToDelta(*bioseq, ibp->htg);
2266  } else if (ibp->gaps)
2267  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
2268  }
2269 
 // Quality scores are fetched only when none are loaded yet and
 // pp->accver is set. The pp->qsfd check is a separate `if`, so a score
 // read from that file replaces whatever a callback returned.
2270  if (pEntry->mpQscore.empty() && pp->accver) {
2271  if (pp->ff_get_qscore)
2272  pEntry->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
2273  else if (pp->ff_get_qscore_pp)
2274  pEntry->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
2275  if (pp->qsfd && ibp->qslength > 0)
2276  pEntry->mpQscore = GetQSFromFile(pp->qsfd, ibp);
2277  }
2278 
 // A QScore parse failure drops the entry unless pp->ign_bad_qs is set,
 // in which case the failure is only reported.
2279  if (! QscoreToSeqAnnot(pEntry->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, false)) {
2280  if (pp->ign_bad_qs == false) {
2281  ibp->drop = true;
2282  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore. Entry dropped.");
2283  if (pp->segment == false) {
2284  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2285  }
2286  continue;
2287  }
2288  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore.");
2289  }
2290 
2291  pEntry->mpQscore.clear();
2292 
2293  /* add PatentSeqId if patent is found in reference
2294  */
2295  if (ibp->psip.NotEmpty()) {
2296  CRef<CSeq_id> id(new CSeq_id);
2297  id->SetPatent(*ibp->psip);
2298  bioseq->SetId().push_back(id);
2299  ibp->psip.Reset();
2300  }
2301 
 // Entries without references are dropped, except when pp->debug is set.
2302  if (no_reference(*bioseq) && pp->debug == false) {
2303  ibp->drop = true;
2304  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No reference for the entry, entry dropped");
2305  if (pp->segment == false) {
2306  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2307  }
2308  continue;
2309  }
2310 
2311  seq_entries.push_back(ebp->seq_entry);
2312  ebp->seq_entry.Reset();
2313 
 // Non-segmented input: output this entry right away. seq_long is set
 // when the entry exceeds pp->limit, except for htg values 1, 2 and 4,
 // which only get a warning; it is cleared again after the output call.
 // Segmented input: the entry stays in seq_entries and GetSeqExt is
 // called with the shared `locs`.
2314  if (pp->segment == false) {
2315  if (pp->limit != 0 && ibp->bases > (size_t)pp->limit) {
2316  if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2) {
2317  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
2318  } else
2319  seq_long = true;
2320  }
2321 
2322  if (! OutputEmblAsn(seq_long, pp, seq_entries))
2323  ibp->drop = true;
2324  else if (! ibp->drop)
2325  total++;
2326  seq_long = false;
2327  } else {
2328  GetSeqExt(pp, locs);
2329  }
2330  GetScope().ResetHistory();
2331  } /* if, not drop */
2332 
 // Per-entry status for non-segmented input. Entries that left the branch
 // above through `continue` never reach this point; they have already
 // posted their own skip message.
2333  if (pp->segment == false) {
2334  if (! ibp->drop) {
2335  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
2336  } else {
2337  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2338  }
2339  }
2340  } /* for, ascii block entries */
2341 
 // Segmented input is accepted or rejected as one unit.
2342  if (pp->segment) {
2343  /* reject the whole set if any one entry was rejected
2344  */
2345  for (reject_set = false, i = 0; i < imax; i++) {
2346  if (pp->entrylist[i]->drop) {
2347  reject_set = true;
2348  break;
2349  }
2350  }
 // Length check for the set: seq_long is set if any member exceeds
 // pp->limit and has an htg value other than 1, 2 or 4. If no member
 // does, the over-limit members with htg 1, 2 or 4 each get a warning.
2351  if (pp->limit != 0 && ! reject_set) {
2352  for (seq_long = false, i = 0; i < imax; i++) {
2353  ibp = pp->entrylist[i];
2354  if (ibp->bases > (size_t)pp->limit && ibp->htg != 1 &&
2355  ibp->htg != 2 && ibp->htg != 4) {
2356  seq_long = true;
2357  break;
2358  }
2359  }
2360  if (! seq_long) {
2361  for (i = 0; i < imax; i++) {
2362  ibp = pp->entrylist[i];
2363  if (ibp->bases > (size_t)pp->limit &&
2364  (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)) {
2365  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
2366  }
2367  }
2368  }
2369  }
2370  if (! reject_set) {
2371  // LCOV_EXCL_START
2372  // Excluded per Mark's request on 12/14/2016
2373  BuildBioSegHeader(pp, seq_entries, locs);
2374  // LCOV_EXCL_STOP
2375 
 // A failed output rejects the whole set as well.
2376  if (! OutputEmblAsn(seq_long, pp, seq_entries))
2377  reject_set = true;
2378  }
2379  if (! reject_set) {
2380  for (i = 0; i < imax; i++) {
2381  ibp = pp->entrylist[i];
2382  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
2383  }
2384  total = imax;
2385  } else {
2386  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set.");
2387  for (i = 0; i < imax; i++) {
2388  ibp = pp->entrylist[i];
2389  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
2390  }
2391  }
2392  }
2393 
 // NOTE(review): listing line 2394 is missing from this extract.
2395 
 // Summary: `total` entries succeeded, the remaining imax - total were
 // skipped.
2396  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d; SKIPPED = %d.", total, imax - total);
2397  return true;
2398 }
2399 
2400 /**********************************************************/
2401 const char* GetEmblDiv(Uint1 num)
2402 {
2403  if (num > 15)
2404  return nullptr;
2405  return ParFlat_Embl_DIV_array[num];
2406 }
2407 
2408 /**********************************************************/
/*
 * XMLGetEMBLBlock: builds the CEMBL_block for the entry currently being
 * processed (pp->entrylist[pp->curindx]) from its XML text.
 *
 * pp            parser state; curindx, source and allow_uwsec are read here
 * entry         text of the entry, handed to the XML* helper functions
 * mol_info      read and updated: its tech value is checked against the
 *               keywords and division and may be set or reset
 * gbdiv         output: set from ParFlat_GBDIV_array, then possibly cleared
 * bio_src       may be null; consulted only when gbdiv is "SYN"
 * dr_ena,
 * dr_biosample  handed on to GetEmblBlockXref
 *
 * Returns the filled block on success. On every rejection path it returns
 * `ret`, which is default-constructed and never assigned, after posting an
 * error message.
 *
 * NOTE(review): the embedded listing numbers skip 2483 and 2681, so two
 * lines appear to be missing from this extract (marked below).
 */
2409 CRef<CEMBL_block> XMLGetEMBLBlock(ParserPtr pp, const char* entry, CMolInfo& mol_info, string& gbdiv, CBioSource* bio_src, TStringList& dr_ena, TStringList& dr_biosample)
2410 {
2411  CRef<CEMBL_block> embl(new CEMBL_block),
2412  ret;
2413 
2414  IndexblkPtr ibp;
2415  char* bptr;
2416 
2417  CEMBL_block::EDiv div;
2418 
2419  bool pat_ref = false;
2420  bool est_kwd = false;
2421  bool sts_kwd = false;
2422  bool gss_kwd = false;
2423  bool htc_kwd = false;
2424  bool fli_kwd = false;
2425  bool wgs_kwd = false;
2426  bool tpa_kwd = false;
2427  bool env_kwd = false;
2428  bool mga_kwd = false;
2429  bool tsa_kwd = false;
2430  bool tls_kwd = false;
2431  bool cancelled;
2432 
2433  char* tempdiv;
2434  char* r;
2435  Int4 i;
2436  Char dataclass[4];
2437 
2438  ibp = pp->entrylist[pp->curindx];
2439 
2440  bool if_cds = XMLCheckCDS(entry, ibp->xip);
2441 
2442  if (ibp->psip.NotEmpty())
2443  pat_ref = true;
2444 
 // Keywords already collected on the index entry are moved into the
 // block; otherwise they are read from the XML.
2445  if (! ibp->keywords.empty()) {
2446  embl->SetKeywords().swap(ibp->keywords);
2447  ibp->keywords.clear();
2448  } else
2449  XMLGetKeywords(entry, ibp->xip, embl->SetKeywords());
2450 
 // Each keyword is passed to fta_keywords_check together with all the
 // *_kwd flags.
2451  for (const string& key : embl->GetKeywords()) {
2452  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
2453  }
2454 
2455  if (ibp->env_sample_qual == false && env_kwd) {
2456  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
2457  return ret;
2458  }
2459 
 // Division code: matched against ParFlat_Embl_DIV_array, and its first
 // three characters are kept in `dataclass` for the check at the end.
 // NOTE(review): bptr is passed to fta_StringMatch and to the "%s" below
 // before the null checks - confirm both tolerate a null pointer.
 // NOTE(review): bptr[3] is written unconditionally; if the saved string
 // can be shorter than three characters this writes beyond its
 // terminator - confirm against StringSave's allocation.
2460  bptr = StringSave(XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION));
2461  div = static_cast<CEMBL_block::TDiv>(fta_StringMatch(ParFlat_Embl_DIV_array, bptr));
2462  dataclass[0] = '\0';
2463  if (bptr) {
2464  bptr[3] = '\0';
2465  StringCpy(dataclass, bptr);
2466  }
 // A negative value is treated as "no matching division code"; the
 // message then shows the truncated copy of the code.
2467  if (div < 0) {
2468  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in Embl flatfile. Record rejected.", bptr);
2469  if (bptr)
2470  MemFree(bptr);
2471  return ret;
2472  }
2473 
2474  if (bptr)
2475  MemFree(bptr);
2476 
2477  /* Embl has recently (7-19-93, email) decided to change the name of
2478  * its "UNA"==10 division to "UNC"==16 (for "unclassified")
2479  */
2480  if (div == 16)
2481  div = CEMBL_block::eDiv_una;
2482 
 // NOTE(review): listing line 2483 is missing from this extract.
2484 
2485  /* 06-10-96 new HUM division replaces the PRI
2486  * it's temporarily mapped to 'other' in asn.1 embl-block.
2487  * Divisions GSS, HUM, HTG, CON, ENV and MUS are mapped to other.
2488  */
 // Code 18 uses the PRI entry of the GenBank division table; all other
 // codes index the table directly.
 // NOTE(review): no upper bound on thtg is visible here - presumably both
 // tables have the same number of entries; confirm.
2489  int thtg = (div == 18) ? CEMBL_block::eDiv_pri : div;
2490  gbdiv = ParFlat_GBDIV_array[thtg];
2491 
 // Only codes up to eDiv_sts are stored in the block itself.
2492  if (div <= CEMBL_block::eDiv_sts)
2493  embl->SetDiv(div);
2494 
2495  const char* p = gbdiv.c_str();
2496  if (ibp->is_tpa &&
2497  (StringEqu(p, "EST") || StringEqu(p, "GSS") ||
2498  StringEqu(p, "PAT") || StringEqu(p, "HTG"))) {
2499  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p);
2500  return ret;
2501  }
2502 
2503  if (ibp->is_tsa && ! StringEqu(p, "TSA")) {
2504  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p);
2505  return ret;
2506  }
2507 
2508  cancelled = IsCancelled(embl->GetKeywords());
2509 
 // tempdiv is allocated only for the HTG division and is released right
 // after CheckHTGDivision; there is no return between those two points.
2510  if (div == 19) /* HTG */
2511  {
2512  if (! HasHtg(embl->GetKeywords())) {
2513  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
2514  return ret;
2515  }
2516  tempdiv = StringSave("HTG");
2517  } else
2518  tempdiv = nullptr;
2519 
2520  fta_check_htg_kwds(embl->SetKeywords(), ibp, mol_info);
2521 
 // For htgs_0 / htgs_1 / htgs_2 records the GenBank division is cleared.
2522  XMLDefVsHTGKeywords(mol_info.GetTech(), entry, ibp->xip, cancelled);
2523  if ((mol_info.GetTech() == CMolInfo::eTech_htgs_0 || mol_info.GetTech() == CMolInfo::eTech_htgs_1 ||
2524  mol_info.GetTech() == CMolInfo::eTech_htgs_2) &&
2525  ! gbdiv.empty()) {
2526  gbdiv.clear();
2527  }
2528 
2529  CheckHTGDivision(tempdiv, mol_info.GetTech());
2530  if (tempdiv)
2531  MemFree(tempdiv);
2532 
 // `i` counts how many of the special keyword categories (plus
 // ibp->htg > 0) apply to this record.
2533  i = 0;
2534  if (est_kwd)
2535  i++;
2536  if (sts_kwd)
2537  i++;
2538  if (gss_kwd)
2539  i++;
2540  if (ibp->htg > 0)
2541  i++;
2542  if (htc_kwd)
2543  i++;
2544  if (fli_kwd)
2545  i++;
2546  if (wgs_kwd)
2547  i++;
2548  if (env_kwd)
2549  i++;
 // For MGA, TPA, TSA and TLS the keyword has to agree with the matching
 // flag on the index entry, in both directions; a mismatch rejects the
 // record.
2550  if (mga_kwd) {
2551  if (ibp->is_mga == false) {
2552  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
2553  return ret;
2554  }
2555  i++;
2556  } else if (ibp->is_mga) {
2557  ErrPostEx(SEV_REJECT, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
2558  return ret;
2559  }
2560 
 // A TPA keyword on a non-TPA record is tolerated when the source is
 // EMBL.
2561  if (tpa_kwd) {
2562  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
2563  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
2564  return ret;
2565  }
2566  i++;
2567  } else if (ibp->is_tpa) {
2568  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
2569  return ret;
2570  }
2571 
 // NOTE(review): the ERR_KEYWORD_MissingTSA message below names "TPA" as
 // the required keyword; this looks like a copy-paste slip for "TSA" -
 // confirm before changing the emitted text.
2572  if (tsa_kwd) {
2573  if (ibp->is_tsa == false) {
2574  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
2575  return ret;
2576  }
2577  i++;
2578  } else if (ibp->is_tsa) {
2579  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TPA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
2580  return ret;
2581  }
2582 
2583  if (tls_kwd) {
2584  if (ibp->is_tls == false) {
2585  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
2586  return ret;
2587  }
2588  i++;
2589  } else if (ibp->is_tls) {
2590  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
2591  return ret;
2592  }
2593 
 // More than one category: exactly HTG + ENV only draws a warning, and
 // exactly ENV plus one of EST / GSS / WGS passes silently; every other
 // combination rejects the record.
2594  if (i > 1) {
2595  if (i == 2 && ibp->htg > 0 && env_kwd)
2596  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
2597  else if (i != 2 || env_kwd == false ||
2598  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
2599  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
2600  return ret;
2601  }
2602  }
2603 
 // NOTE(review): this decrement cannot change the outcome of the check
 // below, which already requires wgs_kwd == false, and `i` is not read
 // again afterwards.
2604  if (wgs_kwd)
2605  i--;
2606  if (ibp->is_contig && i > 0 &&
2607  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
2608  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
2609  return ret;
2610  }
2611 
2612  CMolInfo::TTech tech = mol_info.GetTech();
2613  if (tech == CMolInfo::eTech_htgs_0 || tech == CMolInfo::eTech_htgs_1 ||
2614  tech == CMolInfo::eTech_htgs_2 || tech == CMolInfo::eTech_htgs_3) {
2615  RemoveHtgPhase(embl->SetKeywords());
2616  }
2617 
 // The concatenated keyword text is scanned for the substrings EST, STS
 // and GSS; a warning is posted when one occurs although the matching
 // *_kwd flag was not set.
2618  char* kw = StringSave(XMLConcatSubTags(entry, ibp->xip, INSDSEQ_KEYWORDS, ';'));
2619  if (kw) {
2620  if (! est_kwd && StringStr(kw, "EST")) {
2621  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw);
2622  }
2623  if (! sts_kwd && StringStr(kw, "STS")) {
2624  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw);
2625  }
2626  if (! gss_kwd && StringStr(kw, "GSS")) {
2627  ErrPostEx(SEV_WARNING, ERR_KEYWORD_GSSSubstring, "Keyword %s has substring GSS, but no official GSS keywords found", kw);
2628  }
2629  MemFree(kw);
2630  }
 // Non-contig records go through check_div, which may alter gbdiv and the
 // tech value and may request a drop. For contig records a "CON" division
 // is simply cleared. The inner `tech` shadows the one declared above.
2631  if (! ibp->is_contig) {
2632  bool drop = false;
2633  CMolInfo::TTech tech = mol_info.GetTech();
2634 
2635  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, gbdiv, &tech, ibp->bases, pp->source, drop);
2636  if (tech != CMolInfo::eTech_unknown)
2637  mol_info.SetTech(tech);
2638  else
2639  mol_info.ResetTech();
2640 
2641  if (drop) {
2642  return ret;
2643  }
2644  } else if (! gbdiv.empty() && StringEqu(gbdiv.c_str(), "CON")) {
2645  gbdiv.clear();
2646  }
2647 
 // The HTC division and the HTC keyword must occur together.
2648  bool is_htc_div = ! gbdiv.empty() && StringEqu(gbdiv.c_str(), "HTC");
2649  bool has_htc = HasHtc(embl->GetKeywords());
2650 
2651  if (is_htc_div && ! has_htc) {
2652  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
2653  return ret;
2654  }
2655  if (! is_htc_div && has_htc) {
2656  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
2657  return ret;
2658  }
2659 
 // HTC records: after skipping a leading 'm' or 'r', "pre-" or
 // "transcribed ", the molecule type must start with "RNA". A missing
 // molecule type is not treated as an error.
2660  if (is_htc_div) {
2661  r = StringSave(XMLFindTagValue(entry, ibp->xip, INSDSEQ_MOLTYPE));
2662  if (r) {
2663  p = r;
2664  if (*r == 'm' || *r == 'r')
2665  p = r + 1;
2666  else if (StringEquN(r, "pre-", 4))
2667  p = r + 4;
2668  else if (StringEquN(r, "transcribed ", 12))
2669  p = r + 12;
2670 
2671  if (! StringEquN(p, "RNA", 3)) {
2672  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
2673  MemFree(r);
2674  return ret;
2675  }
2676  MemFree(r);
2677  }
2678  }
2679 
 // NOTE(review): listing line 2681 is missing from this extract, so the
 // statement controlled by this `if` is not shown. As printed, the `if`
 // would govern the if/else chain that follows - do not rely on that
 // reading.
2680  if (fli_kwd)
2682 
2683  /* will be used in flat file database
2684  */
 // With a division left in gbdiv, the EST / STS / GSS / HTC flag on the
 // index entry and the matching tech are set from it (HTC also clears
 // gbdiv), and "SYN" is cleared when bio_src has a synthetic origin.
 // With no division left, the flags are derived from the tech already
 // set on mol_info.
2685  if (! gbdiv.empty()) {
2686  if (StringEqu(gbdiv.c_str(), "EST")) {
2687  ibp->EST = true;
2688  mol_info.SetTech(CMolInfo::eTech_est);
2689  } else if (StringEqu(gbdiv.c_str(), "STS")) {
2690  ibp->STS = true;
2691  mol_info.SetTech(CMolInfo::eTech_sts);
2692  } else if (StringEqu(gbdiv.c_str(), "GSS")) {
2693  ibp->GSS = true;
2694  mol_info.SetTech(CMolInfo::eTech_survey);
2695  } else if (StringEqu(gbdiv.c_str(), "HTC")) {
2696  ibp->HTC = true;
2697  mol_info.SetTech(CMolInfo::eTech_htc);
2698  gbdiv.clear();
2699  } else if (StringEqu(gbdiv.c_str(), "SYN") && bio_src &&
2700  bio_src->IsSetOrigin() && bio_src->GetOrigin() == CBioSource::eOrigin_synthetic) {
2701  gbdiv.clear();
2702  }
2703  } else if (mol_info.IsSetTech()) {
2704  if (mol_info.GetTech() == CMolInfo::eTech_est)
2705  ibp->EST = true;
2706  if (mol_info.GetTech() == CMolInfo::eTech_sts)
2707  ibp->STS = true;
2708  if (mol_info.GetTech() == CMolInfo::eTech_survey)
2709  ibp->GSS = true;
2710  if (mol_info.GetTech() == CMolInfo::eTech_htc)
2711  ibp->HTC = true;
2712  }
2713 
 // Keyword clean-up calls, chosen by the tech value and by the TPA / TSA /
 // TLS flags of the entry.
2714  if (mol_info.IsSetTech())
2715  fta_remove_keywords(mol_info.GetTech(), embl->SetKeywords());
2716 
2717  if (ibp->is_tpa)
2718  fta_remove_tpa_keywords(embl->SetKeywords());
2719 
2720  if (ibp->is_tsa)
2721  fta_remove_tsa_keywords(embl->SetKeywords(), pp->source);
2722 
2723  if (ibp->is_tls)
2724  fta_remove_tls_keywords(embl->SetKeywords(), pp->source);
2725 
2726  ibp->wgssec[0] = '\0';
2727  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, embl->SetExtra_acc());
2728 
2729 
 // Creation and update dates. Each `p` below is a new local that shadows
 // the outer `p`. When only a creation date is present it is also used as
 // the update date.
2730  CRef<CDate_std> std_creation_date, std_update_date;
2731  if (char* p = StringSave(XMLFindTagValue(entry, ibp->xip, INSDSEQ_CREATE_DATE))) {
2732  std_creation_date = GetUpdateDate(p, pp->source);
2733  embl->SetCreation_date().SetStd(*std_creation_date);
2734  MemFree(p);
2735  }
2736  if (char* p = StringSave(XMLFindTagValue(entry, ibp->xip, INSDSEQ_UPDATE_DATE))) {
2737  std_update_date = GetUpdateDate(p, pp->source);
2738  embl->SetUpdate_date().SetStd(*std_update_date);
2739  MemFree(p);
2740  }
2741 
2742  if (std_update_date.Empty() && std_creation_date.NotEmpty())
2743  embl->SetUpdate_date().SetStd(*std_creation_date);
2744 
 // An empty DataBlk is passed; the XML index and entry text are supplied
 // alongside it.
2745  GetEmblBlockXref(DataBlk(), ibp->xip, entry, dr_ena, dr_biosample, &ibp->drop, *embl);
2746 
 // Division codes "ANN" and "CON" with an 8-character accession that
 // starts with "CT": the tech becomes wgs if any extra accession starts
 // with 'C' or 'U' and fta_if_wgs_acc() returns 0 for it.
2747  if (StringEqu(dataclass, "ANN") || StringEqu(dataclass, "CON")) {
2748  if (StringLen(ibp->acnum) == 8 && StringEquN(ibp->acnum, "CT", 2)) {
2749  bool found = false;
2750  for (const string& acc : embl->SetExtra_acc()) {
2751  if (fta_if_wgs_acc(acc) == 0 &&
2752  (acc[0] == 'C' || acc[0] == 'U')) {
2753  found = true;
2754  break;
2755  }
2756  }
2757  if (found)
2758  mol_info.SetTech(CMolInfo::eTech_wgs);
2759  }
2760  }
2761 
2762  return embl;
2763 }
2764 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:220
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:505
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
Definition: add.cpp:2075
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:913
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2765
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:793
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:339
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1118
bool fta_if_valid_biosample(const Char *id, bool dblink)
Definition: add.cpp:1757
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2668
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
Definition: add.cpp:1610
bool check_cds(const DataBlk &entry, Parser::EFormat format)
Definition: add.cpp:258
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2790
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2720
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2687
bool fta_if_valid_sra(const Char *id, bool dblink)
Definition: add.cpp:1736
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2831
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
Definition: add.cpp:1453
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:387
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:302
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2552
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, const char *location, const char *name, bool iscon)
Definition: add.cpp:2299
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3406
void AddNIDSeqId(CBioseq &bioseq, const DataBlk &entry, Int2 type, Int2 coldata, Parser::ESource source)
Definition: asci_blk.cpp:2726
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3270
void DefVsHTGKeywords(CMolInfo::TTech tech, const DataBlk &entry, Int2 what, Int2 ori, bool cancelled)
Definition: asci_blk.cpp:2821
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3242
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3506
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3341
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2946
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1790
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3310
bool fta_orgref_has_taxid(const COrg_ref::TDb &dbtags)
Definition: asci_blk.cpp:3258
char * GetDescrComment(char *offset, size_t len, Uint2 col_data, bool is_htg, bool is_pat)
Definition: asci_blk.cpp:1159
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2810
void GetEmblSubBlock(size_t bases, Parser::ESource source, const DataBlk &entry)
Definition: asci_blk.cpp:740
char * GetEmblBlock(DataBlkPtr *chain, char *ptr, short *retkw, Parser::EFormat format, char *eptr)
Definition: asci_blk.cpp:545
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2473
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1678
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3113
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3185
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2889
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2498
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1321
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2569
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1074
void xFreeEntry(DataBlkPtr entry)
Definition: block.cpp:109
list< string > TStringList
Definition: cgictx.cpp:719
void ProcessCitations(TEntryList &seq_entries)
Definition: citation.cpp:307
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
Definition: Date_std.cpp:91
void GetDate(string *label, const string &format) const
Append a custom string representation of the date to the label.
Definition: Date_std.cpp:159
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Definition: Dbtag.hpp:53
CEMBL_block –.
Definition: EMBL_block.hpp:66
CEMBL_xref –.
Definition: EMBL_xref.hpp:66
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
const list< string > KeywordList() const
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:370
size_t len
Definition: ftablock.h:371
CFlatFileData * mpData
Definition: ftablock.h:369
DataBlk * mpNext
Definition: ftablock.h:374
int mType
Definition: ftablock.h:368
USING_SCOPE(objects)
static bool GetEmblInst(ParserPtr pp, const DataBlk &entry, unsigned char *dnaconv)
Definition: em_ascii.cpp:838
static bool CheckEmblContigEverywhere(const IndexblkPtr ibp, Parser::ESource source)
Definition: em_ascii.cpp:720
static const char * ParFlat_DRname_array[]
Definition: em_ascii.cpp:152
static bool OutputEmblAsn(bool seq_long, ParserPtr pp, TEntryList &seq_entries)
Definition: em_ascii.cpp:307
static const char * ParFlat_Embl_DIV_array[]
Definition: em_ascii.cpp:112
static void GetEmblDescr(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: em_ascii.cpp:1642
const char * GetEmblDiv(Uint1 num)
Definition: em_ascii.cpp:2401
static CRef< CUser_field > fta_create_user_field(const char *tag, TStringList &lst)
Definition: em_ascii.cpp:1487
static void fta_create_imgt_misc_feat(CBioseq &bioseq, CEMBL_block &embl_block, IndexblkPtr ibp)
Definition: em_ascii.cpp:1568
static bool s_DuplicatesBiosource(const CBioSource &biosource, const string &gbdiv)
Definition: em_ascii.cpp:1360
static CTextseq_id & SetTextIdRef(CSeq_id &id)
Definition: em_ascii.cpp:605
static void EmblGetDivisionNewID(IndexblkPtr ibp, const DataBlk &entry)
Definition: em_ascii.cpp:2105
static const char * ParFlat_DBname_array[]
Definition: em_ascii.cpp:131
bool GetEmblInstContig(const DataBlk &entry, CBioseq &bioseq, ParserPtr pp)
Definition: em_ascii.cpp:756
static void EmblGetDivision(IndexblkPtr ibp, const DataBlk &entry)
Definition: em_ascii.cpp:2084
void fta_build_ena_user_object(CSeq_descr::Tdata &descrs, TStringList &dr_ena, TStringList &dr_biosample, CRef< CUser_object > &dbuop)
Definition: em_ascii.cpp:1505
static CRef< CMolInfo > GetEmblMolInfo(ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: em_ascii.cpp:1415
static CRef< CGB_block > GetEmblGBBlock(ParserPtr pp, const DataBlk &entry, const string &gbdiv, CBioSource *bio_src)
Definition: em_ascii.cpp:1369
static const char * ParFlat_Embl_dataclass_array[]
Definition: em_ascii.cpp:105
static CRef< CEMBL_block > GetDescrEmblBlock(ParserPtr pp, const DataBlk &entry, CMolInfo &mol_info, string &gbdiv, const CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
Definition: em_ascii.cpp:937
static void FakeEmblBioSources(const DataBlk &entry, CBioseq &bioseq)
Definition: em_ascii.cpp:1995
bool EmblAscii(ParserPtr pp)
Definition: em_ascii.cpp:2140
static void SetXrefObjId(CEMBL_xref &xref, const string &str)
Definition: em_ascii.cpp:368
static const char * ParFlat_GBDIV_array[]
Definition: em_ascii.cpp:122
static void GetReleaseInfo(const DataBlk &entry)
Definition: em_ascii.cpp:641
static CRef< COrg_ref > GetEmblOrgRef(const DataBlkPtr dbp)
Definition: em_ascii.cpp:680
static bool s_HasTPAPrefix(const CTempString &line)
Definition: em_ascii.cpp:1631
static void GetEmblBlockXref(const DataBlk &entry, XmlIndexPtr xip, const char *chentry, TStringList &dr_ena, TStringList &dr_biosample, bool *drop, CEMBL_block &embl)
Definition: em_ascii.cpp:403
CRef< CEMBL_block > XMLGetEMBLBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, string &gbdiv, CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
Definition: em_ascii.cpp:2409
static void GetEmblDate(Parser::ESource source, const DataBlk &entry, CRef< CDate_std > &crdate, CRef< CDate_std > &update)
Definition: em_ascii.cpp:272
@ ParFlat_NI
Definition: embl.h:44
@ ParFlat_OC
Definition: embl.h:61
@ ParFlat_PR
Definition: embl.h:57
@ ParFlat_KW
Definition: embl.h:47
@ ParFlat_AH
Definition: embl.h:56
@ ParFlat_DT
Definition: embl.h:45
@ ParFlat_SQ
Definition: embl.h:53
@ ParFlat_DR
Definition: embl.h:50
@ ParFlat_OS
Definition: embl.h:48
@ ParFlat_CC
Definition: embl.h:51
@ ParFlat_CO
Definition: embl.h:55
@ ParFlat_OG
Definition: embl.h:62
@ ParFlat_DE
Definition: embl.h:46
@ ParFlat_ID
Definition: embl.h:42
@ ParFlatEM_END
Definition: embl.h:58
#define ParFlat_COL_DATA_EMBL
Definition: embl.h:38
DataBlkPtr LoadEntry(ParserPtr pp, size_t offset, size_t len)
Definition: entry.cpp:300
void FinalCleanup(TEntryList &seq_entries)
Definition: fcleanup.cpp:377
#define ERR_DRXREF_DuplicatedSRA
Definition: flat2err.h:600
#define ERR_SEQUENCE_BadData
Definition: flat2err.h:150
#define ERR_TPA_TpaSpansMissing
Definition: flat2err.h:593
#define ERR_ENTRY_LongSequence
Definition: flat2err.h:82
#define ERR_FORMAT_MissingContigFeature
Definition: flat2err.h:43
#define ERR_KEYWORD_ShouldNotBeTPA
Definition: flat2err.h:208
#define ERR_DIVISION_BadTSADivcode
Definition: flat2err.h:261
#define ERR_FORMAT_MissingSequenceData
Definition: flat2err.h:41
#define ERR_DIVISION_InvalidHTCKeyword
Definition: flat2err.h:254
#define ERR_DRXREF_InvalidSRA
Definition: flat2err.h:599
#define ERR_KEYWORD_IllegalForCON
Definition: flat2err.h:210
#define ERR_DIVISION_MissingHTGKeywords
Definition: flat2err.h:249
#define ERR_QSCORE_FailedToParse
Definition: flat2err.h:577
#define ERR_ENTRY_LongHTGSSequence
Definition: flat2err.h:86
#define ERR_KEYWORD_MissingTSA
Definition: flat2err.h:216
#define ERR_DIVISION_BadTPADivcode
Definition: flat2err.h:257
#define ERR_DRXREF_InvalidBioSample
Definition: flat2err.h:597
#define ERR_LOCUS_WrongTopology
Definition: flat2err.h:180
#define ERR_REFERENCE_No_references
Definition: flat2err.h:289
#define ERR_KEYWORD_ShouldNotBeTLS
Definition: flat2err.h:218
#define ERR_ENTRY_GBBlock_not_Empty
Definition: flat2err.h:85
#define ERR_KEYWORD_HTGPlusENV
Definition: flat2err.h:217
#define ERR_DEFINITION_MissingTPA
Definition: flat2err.h:269
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_DEFINITION_MissingTLS
Definition: flat2err.h:273
#define ERR_KEYWORD_ESTSubstring
Definition: flat2err.h:204
#define ERR_KEYWORD_ConflictingKeywords
Definition: flat2err.h:207
#define ERR_DIVISION_ConDivLacksContig
Definition: flat2err.h:252
#define ERR_LOCATION_ContigHasNull
Definition: flat2err.h:397
#define ERR_KEYWORD_ENV_NoMatchingQualifier
Definition: flat2err.h:214
#define ERR_KEYWORD_ShouldNotBeTSA
Definition: flat2err.h:215
#define ERR_KEYWORD_STSSubstring
Definition: flat2err.h:205
#define ERR_DIVISION_UnknownDivCode
Definition: flat2err.h:222
#define ERR_KEYWORD_MissingTLS
Definition: flat2err.h:219
#define ERR_DEFINITION_ShouldNotBeTSA
Definition: flat2err.h:270
#define ERR_SEGMENT_Rejected
Definition: flat2err.h:166
#define ERR_DIVISION_MissingHTCKeyword
Definition: flat2err.h:253
#define ERR_DIVISION_MappedtoCON
Definition: flat2err.h:248
#define ERR_FORMAT_ContigWithSequenceData
Definition: flat2err.h:42
#define ERR_DRXREF_UnknownDBname
Definition: flat2err.h:596
#define ERR_DRXREF_DuplicatedBioSamples
Definition: flat2err.h:598
#define ERR_KEYWORD_NoGeneExpressionKeywords
Definition: flat2err.h:213
#define ERR_DEFINITION_MissingTSA
Definition: flat2err.h:271
#define ERR_KEYWORD_GSSSubstring
Definition: flat2err.h:206
#define ERR_DEFINITION_ShouldNotBeTPA
Definition: flat2err.h:268
#define ERR_FORMAT_MissingEnd
Definition: flat2err.h:39
#define ERR_KEYWORD_MissingTPA
Definition: flat2err.h:209
#define ERR_DIVISION_ConDivInSegset
Definition: flat2err.h:251
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_ORGANISM_NoOrganism
Definition: flat2err.h:184
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_DIVISION_HTCWrongMolType
Definition: flat2err.h:255
#define ERR_KEYWORD_ShouldNotBeCAGE
Definition: flat2err.h:211
#define ERR_DEFINITION_ShouldNotBeTLS
Definition: flat2err.h:272
#define ERR_TSA_UnexpectedPrimaryAccession
Definition: flat2err.h:609
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
#define INSDSEQ_MOLTYPE
Definition: fta_xml.h:45
void XMLGetKeywords(const char *entry, const XmlIndex *xip, TKeywordList &keywords)
Definition: xm_index.cpp:1524
#define INSDSEQ_KEYWORDS
Definition: fta_xml.h:58
#define INSDSEQ_DATABASE_REFERENCE
Definition: fta_xml.h:67
#define INSDSEQ_CREATE_DATE
Definition: fta_xml.h:49
#define INSDSEQ_DIVISION
Definition: fta_xml.h:47
#define INSDSEQ_UPDATE_DATE
Definition: fta_xml.h:48
unique_ptr< string > XMLConcatSubTags(const char *entry, const XmlIndex *xip, Int4 tag, Char sep)
Definition: xm_index.cpp:1550
unique_ptr< string > XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:214
std::list< std::string > TKeywordList
Definition: ftablock.h:204
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:89
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * StringNew(size_t sz)
Definition: ftacpp.hpp:43
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:346
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:753
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
Definition: genref.cpp:2957
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
void ResetDataAndHistory(void)
Clear all information in the scope except added data loaders.
Definition: scope.cpp:331
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty, i.e. it points to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
@ fAllowTrailingSymbols
Ignore trailing non-numeric characters.
Definition: ncbistr.hpp:298
list< CRef< CObject_id > > TId
Definition: EMBL_xref_.hpp:91
const TXref & GetXref(void) const
Get the Xref member data.
TXref & SetXref(void)
Assign a value to Xref data member.
TId & SetId(void)
Assign a value to Id data member.
Definition: EMBL_xref_.hpp:245
bool IsSetXref(void) const
Check if a value has been assigned to Xref data member.
list< CRef< CEMBL_xref > > TXref
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
@ eOrigin_synthetic
purely synthetic
Definition: BioSource_.hpp:134
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
void SetYear(TYear value)
Assign a value to Year data member.
Definition: Date_std_.hpp:435
void SetMonth(TMonth value)
Assign a value to Month data member.
Definition: Date_std_.hpp:482
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
void SetDay(TDay value)
Assign a value to Day data member.
Definition: Date_std_.hpp:529
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void SetLabel(TLabel &value)
Assign a value to Label data member.
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
TYear GetYear(void) const
Get the Year member data.
Definition: Date_std_.hpp:426
TMonth GetMonth(void) const
Get the Month member data.
Definition: Date_std_.hpp:473
TDay GetDay(void) const
Get the Day member data.
Definition: Date_std_.hpp:520
bool IsSetDb(void) const
IDs in taxonomic or culture databases. Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
void SetCommon(const TCommon &value)
Assign a value to Common data member.
Definition: Org_ref_.hpp:428
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool IsSetDiv(void) const
GenBank division code. Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
vector< CRef< CDbtag > > TDbxref
Definition: Seq_feat_.hpp:123
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetKey(const TKey &value)
Assign a value to Key data member.
Definition: Imp_feat_.hpp:268
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetTech(void)
Reset Tech data member.
Definition: MolInfo_.hpp:484
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seqdesc_.cpp:456
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
EStrand
strandedness in living organism
Definition: Seq_inst_.hpp:133
list< CRef< CSeq_annot > > TAnnot
Definition: Bioseq_.hpp:97
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TCreate_date & SetCreate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:478
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_REF_END
Definition: index.h:60
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:609
int fta_if_wgs_acc(string_view accession)
Definition: indx_blk.cpp:1191
int CheckSTRAND(const string &str)
Definition: indx_blk.cpp:465
int i
int len
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: loadfeat.cpp:4966
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: loadfeat.cpp:4660
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
std::list< SeqLoc > TSeqLocList
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2661
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Uint2 col_data)
Definition: ref.cpp:2426
DataBlkPtr chain
Definition: ftablock.h:382
CRef< objects::CSeq_entry > seq_entry
Definition: ftablock.h:384
Char acnum[200]
Definition: ftablock.h:207
CRef< objects::CPatent_seq_id > psip
Definition: ftablock.h:231
Char division[4]
Definition: ftablock.h:212
bool assembly
Definition: ftablock.h:282
bool is_mga
Definition: ftablock.h:240
bool tsa_allowed
Definition: ftablock.h:252
Int2 htg
Definition: ftablock.h:237
bool is_tls
Definition: ftablock.h:249
Int2 vernum
Definition: ftablock.h:208
bool is_tpa
Definition: ftablock.h:247
TKeywordList keywords
Definition: ftablock.h:281
bool embl_new_ID
Definition: ftablock.h:259
bool is_wgs
Definition: ftablock.h:246
bool origin
Definition: ftablock.h:242
bool is_contig
Definition: ftablock.h:238
bool STS
Definition: ftablock.h:234
bool is_pat
Definition: ftablock.h:243
bool HTC
Definition: ftablock.h:236
bool drop
Definition: ftablock.h:223
bool experimental
Definition: ftablock.h:288
size_t bases
Definition: ftablock.h:213
bool inferential
Definition: ftablock.h:286
bool is_tsa
Definition: ftablock.h:248
bool EST
Definition: ftablock.h:233
size_t len
Definition: ftablock.h:225
GapFeatsPtr gaps
Definition: ftablock.h:255
string wgssec
Definition: ftablock.h:277
size_t offset
Definition: ftablock.h:209
bool specialist_db
Definition: ftablock.h:284
Uint2 segnum
Definition: ftablock.h:214
Char locusname[200]
Definition: ftablock.h:211
bool env_sample_qual
Definition: ftablock.h:260
XmlIndexPtr xip
Definition: ftablock.h:258
size_t qslength
Definition: ftablock.h:271
bool GSS
Definition: ftablock.h:235
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
optional< string > buf
CKeywordParser & KeywordParser()
char *(* ff_get_qscore)(const char *accession, Int2 v)
TEntryList entries
Definition: inftrees.h:24
else result
Definition: token2.c:20
CScope & GetScope()
bool GetGenomeInfo(CBioSource &bsp, string_view bptr)
Definition: utilfeat.cpp:225
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:435
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1605
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1634
char * SrchTheChar(char *bptr, char *eptr, Char letter)
Definition: utilfun.cpp:789
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1167
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1619
string GetBlkDataReplaceNewLine(string_view instr, Uint2 indent)
Definition: utilfun.cpp:674
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1308
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1294
void CleanTailNoneAlphaCharInString(string &str)
Definition: utilfun.cpp:713
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1263
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1323
char * xSrchNodeType(const DataBlk &entry, Int4 type, size_t *len)
Definition: utilfun.cpp:994
Int2 fta_StringMatch(const Char **array, string_view text)
Definition: utilfun.cpp:516
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1226
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1527
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:1025
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1594
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1133
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1338
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:809
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1051
char * PointToNextToken(char *ptr)
Definition: utilfun.cpp:737
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1466
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)
Definition: xutils.cpp:91
Modified on Mon May 13 04:36:19 2024 by modify_doxy.py rev. 669887