NCBI C++ ToolKit
xm_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: xm_ascii.cpp 102143 2024-04-09 12:51:41Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: xm_ascii.cpp
27  *
28  * Author: Sergey Bazhin
29  *
30  * File Description:
31  * Parse INSDSEQ from blocks to asn.
32  * Build XML format entry block.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <objects/seq/Seq_inst.hpp>
42 #include <objects/seq/Seq_ext.hpp>
48 #include <objmgr/scope.hpp>
49 #include <objects/seq/MolInfo.hpp>
55 #include <objects/seq/Pubdesc.hpp>
56 
57 
58 #include "index.h"
59 
60 #include "ftanet.h"
63 
64 #include "ftaerr.hpp"
65 #include "indx_blk.h"
66 #include "asci_blk.h"
67 #include "utilref.h"
68 #include "utilfeat.h"
69 #include "loadfeat.h"
70 #include "add.h"
71 #include "gb_ascii.h"
72 #include "nucprot.h"
73 #include "fta_qscore.h"
74 #include "em_ascii.h"
75 #include "citation.h"
76 #include "fcleanup.h"
77 #include "utilfun.h"
78 #include "ref.h"
79 #include "xgbparint.h"
80 #include "xutils.h"
81 #include "fta_xml.h"
82 
83 #ifdef THIS_FILE
84 # undef THIS_FILE
85 #endif
86 #define THIS_FILE "xm_ascii.cpp"
87 
90 
91 /**********************************************************/
93 {
94  if (! ibp)
95  return;
96 
97  bool condiv = (NStr::CompareNocase(ibp->division, "CON") == 0);
98 
99  if (condiv && ibp->segnum != 0) {
100  ErrPostEx(SEV_ERROR, ERR_DIVISION_ConDivInSegset, "Use of the CON division is not allowed for members of segmented set : %s|%s. Entry dropped.", ibp->locusname, ibp->acnum);
101  ibp->drop = true;
102  return;
103  }
104 
105  if (! condiv && ibp->is_contig == false && ibp->origin == false) {
106  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingSequenceData, "Required sequence data is absent. Entry dropped.");
107  ibp->drop = true;
108  } else if (! condiv && ibp->is_contig && ibp->origin == false) {
109  ErrPostEx(SEV_WARNING, ERR_DIVISION_MappedtoCON, "Division [%s] mapped to CON based on the existence of <INSDSeq_contig> line.", ibp->division);
110  } else if (ibp->is_contig && ibp->origin) {
112  ErrPostEx(SEV_INFO, ERR_FORMAT_ContigWithSequenceData, "The <INSDSeq_contig> linetype and sequence data are both present. Ignoring sequence data.");
113  } else {
114  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigWithSequenceData, "The <INSDSeq_contig> linetype and sequence data may not both be present in a sequence record.");
115  ibp->drop = true;
116  }
117  } else if (condiv && ibp->is_contig == false && ibp->origin == false) {
118  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingContigFeature, "No <INSDSeq_contig> data in XML format file. Entry dropped.");
119  ibp->drop = true;
120  } else if (condiv && ibp->is_contig == false && ibp->origin) {
121  ErrPostEx(SEV_WARNING, ERR_DIVISION_ConDivLacksContig, "Division is CON, but <INSDSeq_contig> data have not been found.");
122  }
123 }
124 
125 /**********************************************************/
126 static bool XMLGetInstContig(XmlIndexPtr xip, DataBlkPtr dbp, CBioseq& bioseq, ParserPtr pp)
127 {
128  char* p;
129  char* q;
130  char* r;
131  bool locmap;
132  bool allow_crossdb_featloc;
133  Int4 i;
134  int numerr;
135 
136  p = XMLFindTagValue(dbp->mOffset, xip, INSDSEQ_CONTIG);
137  if (! p)
138  return false;
139 
140  for (q = p, r = p; *q != '\0'; q++)
141  if (*q != '\n' && *q != '\t' && *q != ' ')
142  *r++ = *q;
143  *r = '\0';
144 
145  for (q = p; *q != '\0'; q++)
146  if ((q[0] == ',' && q[1] == ',') || (q[0] == '(' && q[1] == ',') ||
147  (q[0] == ',' && q[1] == ')'))
148  break;
149  if (*q != '\0') {
150  ErrPostEx(SEV_REJECT, ERR_LOCATION_ContigHasNull, "The join() statement for this record's contig line contains one or more comma-delimited components which are null.");
151  MemFree(p);
152  return false;
153  }
154 
155  if (pp->buf)
156  MemFree(pp->buf);
157  pp->buf = nullptr;
158 
159  CRef<CSeq_loc> loc = xgbparseint_ver(p, locmap, numerr, bioseq.GetId(), pp->accver);
160 
161  if (loc.Empty()) {
162  MemFree(p);
163  return true;
164  }
165 
166  allow_crossdb_featloc = pp->allow_crossdb_featloc;
167  pp->allow_crossdb_featloc = true;
168 
169  TSeqLocList locs;
170  locs.push_back(loc);
171  i = fta_fix_seq_loc_id(locs, pp, p, nullptr, true);
172  if (i > 999)
174 
175  pp->allow_crossdb_featloc = allow_crossdb_featloc;
176 
177  if (loc->IsMix()) {
178  XGappedSeqLocsToDeltaSeqs(loc->GetMix(), bioseq.SetInst().SetExt().SetDelta().Set());
179  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
180  } else
181  bioseq.SetInst().ResetExt();
182 
183  MemFree(p);
184 
185  return true;
186 }
187 
188 /**********************************************************/
189 bool XMLGetInst(ParserPtr pp, DataBlkPtr dbp, unsigned char* dnaconv, CBioseq& bioseq)
190 {
191  IndexblkPtr ibp;
192  XmlIndexPtr xip;
193  Int2 topology;
194  Int2 strand;
195  char* topstr;
196  char* strandstr;
197 
198  ibp = pp->entrylist[pp->curindx];
199  topstr = nullptr;
200  strandstr = nullptr;
201  for (xip = ibp->xip; xip; xip = xip->next) {
202  if (xip->tag == INSDSEQ_TOPOLOGY && ! topstr)
203  topstr = XMLGetTagValue(dbp->mOffset, xip);
204  else if (xip->tag == INSDSEQ_STRANDEDNESS && ! strandstr)
205  strandstr = XMLGetTagValue(dbp->mOffset, xip);
206  }
207  if (! topstr)
208  topstr = StringSave(" ");
209  if (! strandstr)
210  strandstr = StringSave(" ");
211 
212  CSeq_inst& inst = bioseq.SetInst();
214 
215  /* get linear, circular, tandem topology, blank is linear which = 1
216  */
217  topology = XMLCheckTPG(topstr);
218  if (topology > 1)
219  inst.SetTopology(static_cast<CSeq_inst::ETopology>(topology));
220 
221  strand = XMLCheckSTRAND(strandstr);
222  if (strand > 0)
223  inst.SetStrand(static_cast<CSeq_inst::EStrand>(strand));
224 
225  if (topstr)
226  MemFree(topstr);
227  if (strandstr)
228  MemFree(strandstr);
229 
230  if (! GetSeqData(pp, *dbp, bioseq, 0, dnaconv, ibp->is_prot ? eSeq_code_type_iupacaa : eSeq_code_type_iupacna))
231  return false;
232 
233  if (ibp->is_contig && ! XMLGetInstContig(ibp->xip, dbp, bioseq, pp))
234  return false;
235 
236  return true;
237 }
238 
239 /**********************************************************/
240 static CRef<CGB_block> XMLGetGBBlock(ParserPtr pp, const char* entry, CMolInfo& mol_info, CBioSource* bio_src)
241 {
242  CRef<CGB_block> gbb(new CGB_block),
243  ret;
244 
245  IndexblkPtr ibp;
246  char* bptr;
247  char* str;
248  char msg[4];
249  char* kw;
250  char* kwp;
251  Int2 div;
252  bool if_cds;
253 
254  bool pat_ref = false;
255  bool est_kwd = false;
256  bool sts_kwd = false;
257  bool gss_kwd = false;
258  bool htc_kwd = false;
259  bool fli_kwd = false;
260  bool wgs_kwd = false;
261  bool tpa_kwd = false;
262  bool tsa_kwd = false;
263  bool tls_kwd = false;
264  bool env_kwd = false;
265  bool mga_kwd = false;
266 
267  bool cancelled;
268  bool drop;
269  char* tempdiv;
270  Int2 thtg;
271  char* p;
272  Int4 i;
273 
274  ibp = pp->entrylist[pp->curindx];
275 
276  ibp->wgssec[0] = '\0';
277 
278  str = XMLFindTagValue(entry, ibp->xip, INSDSEQ_SOURCE);
279  if (str) {
280  p = StringRChr(str, '.');
281  if (p && p > str && p[1] == '\0' && *(p - 1) == '.')
282  *p = '\0';
283 
284  gbb->SetSource(str);
285  MemFree(str);
286  }
287 
288  if (! ibp->keywords.empty()) {
289  gbb->SetKeywords().swap(ibp->keywords);
290  ibp->keywords.clear();
291  } else
292  XMLGetKeywords(entry, ibp->xip, gbb->SetKeywords());
293 
294  if (ibp->is_mga && ! fta_check_mga_keywords(mol_info, gbb->GetKeywords())) {
295  return ret;
296  }
297 
298  if (ibp->is_tpa && ! fta_tpa_keywords_check(gbb->SetKeywords())) {
299  return ret;
300  }
301 
302  if (ibp->is_tsa && ! fta_tsa_keywords_check(gbb->SetKeywords(), pp->source)) {
303  return ret;
304  }
305 
306  if (ibp->is_tls && ! fta_tls_keywords_check(gbb->SetKeywords(), pp->source)) {
307  return ret;
308  }
309 
310  for (const string& key : gbb->GetKeywords()) {
311  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
312  }
313 
314  if (ibp->env_sample_qual == false && env_kwd) {
315  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
316  return ret;
317  }
318 
319  bptr = XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION);
320  if (bptr) {
321  if_cds = XMLCheckCDS(entry, ibp->xip);
322  div = CheckDIV(bptr);
323  if (div != -1) {
324  string div_str(bptr, bptr + 3);
325  gbb->SetDiv(div_str);
326 
327  if (div == 16) /* "ORG" replaced by "UNA" */
328  gbb->SetDiv("UNA");
329 
330  /* preserve the division code for later use
331  */
332  const char* p_div = gbb->GetDiv().c_str();
333  StringCpy(ibp->division, p_div);
334 
335  if (ibp->psip.NotEmpty())
336  pat_ref = true;
337 
338  if (ibp->is_tpa &&
339  (StringEqu(p_div, "EST") || StringEqu(p_div, "GSS") ||
340  StringEqu(p_div, "PAT") || StringEqu(p_div, "HTG"))) {
341  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p_div);
342  return ret;
343  }
344 
345  if (ibp->is_tsa && ! StringEqu(p_div, "TSA")) {
346  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p_div);
347  return ret;
348  }
349 
350  cancelled = IsCancelled(gbb->GetKeywords());
351 
352  if (StringEqu(p_div, "HTG")) {
353  if (! HasHtg(gbb->GetKeywords())) {
354  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
355  return ret;
356  }
357  }
358 
359  tempdiv = StringSave(gbb->GetDiv());
360 
361  if (fta_check_htg_kwds(gbb->SetKeywords(), pp->entrylist[pp->curindx], mol_info))
362  gbb->SetDiv("");
363 
364  XMLDefVsHTGKeywords(mol_info.GetTech(), entry, ibp->xip, cancelled);
365 
366  CheckHTGDivision(tempdiv, mol_info.GetTech());
367  if (tempdiv)
368  MemFree(tempdiv);
369 
370  i = 0;
371  if (est_kwd)
372  i++;
373  if (sts_kwd)
374  i++;
375  if (gss_kwd)
376  i++;
377  if (ibp->htg > 0)
378  i++;
379  if (htc_kwd)
380  i++;
381  if (fli_kwd)
382  i++;
383  if (wgs_kwd)
384  i++;
385  if (env_kwd)
386  i++;
387  if (mga_kwd) {
388  if (ibp->is_mga == false) {
389  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
390  return ret;
391  }
392  i++;
393  } else if (ibp->is_mga) {
394  ErrPostEx(SEV_REJECT, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
395  return ret;
396  }
397  if (tpa_kwd) {
398  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
399  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
400  return ret;
401  }
402  i++;
403  } else if (ibp->is_tpa) {
404  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
405  return ret;
406  }
407  if (tsa_kwd) {
408  if (ibp->is_tsa == false) {
409  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
410  return ret;
411  }
412  i++;
413  } else if (ibp->is_tsa) {
414  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
415  return ret;
416  }
417  if (tls_kwd) {
418  if (ibp->is_tls == false) {
419  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
420  return ret;
421  }
422  i++;
423  } else if (ibp->is_tls) {
424  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
425  return ret;
426  }
427  if (i > 1) {
428  if (i == 2 && ibp->htg > 0 && env_kwd)
429  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
430  else if (i != 2 || env_kwd == false ||
431  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
432  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
433  return ret;
434  }
435  }
436 
437  if (wgs_kwd)
438  i--;
439  if (ibp->is_contig && i > 0 &&
440  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
441  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
442  return ret;
443  }
444 
445  thtg = mol_info.GetTech();
446  if (thtg == CMolInfo::eTech_htgs_0 || thtg == CMolInfo::eTech_htgs_1 ||
447  thtg == CMolInfo::eTech_htgs_2 || thtg == CMolInfo::eTech_htgs_3) {
448  RemoveHtgPhase(gbb->SetKeywords());
449  }
450 
451  kw = XMLConcatSubTags(entry, ibp->xip, INSDSEQ_KEYWORDS, ';');
452  if (kw) {
453  kwp = StringStr(kw, "EST");
454  if (kwp && est_kwd == false) {
455  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw);
456  }
457  kwp = StringStr(kw, "STS");
458  if (kwp && sts_kwd == false) {
459  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw);
460  }
461  MemFree(kw);
462  }
463 
464  if (! ibp->is_contig) {
465  drop = false;
466  CMolInfo::TTech tech = mol_info.GetTech();
467  string p_div = gbb->GetDiv();
468 
469  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, p_div, &tech, ibp->bases, pp->source, drop);
470 
471  if (tech != CMolInfo::eTech_unknown)
472  mol_info.SetTech(tech);
473  else
474  mol_info.ResetTech();
475 
476  if (! p_div.empty())
477  gbb->SetDiv(p_div);
478  else
479  gbb->SetDiv("");
480 
481  if (drop) {
482  MemFree(bptr);
483  return ret;
484  }
485  } else if (gbb->GetDiv() == "CON") {
486  gbb->SetDiv("");
487  }
488  } else {
489  MemCpy(msg, bptr, 3);
490  msg[3] = '\0';
491  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in GenBank flatfile. Record rejected.", msg);
492  MemFree(bptr);
493  return ret;
494  }
495 
496  if (IsNewAccessFormat(ibp->acnum) == 0 && *ibp->acnum == 'T' &&
497  gbb->GetDiv() != "EST") {
498  ErrPostStr(SEV_INFO, ERR_DIVISION_MappedtoEST, "Leading T in accession number.");
499  mol_info.SetTech(CMolInfo::eTech_est);
500 
501  gbb->SetDiv("");
502  }
503 
504  MemFree(bptr);
505  }
506 
507  bool is_htc_div = gbb->GetDiv() == "HTC",
508  has_htc = HasHtc(gbb->GetKeywords());
509 
510  if (is_htc_div && ! has_htc) {
511  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
512  return ret;
513  }
514 
515  if (! is_htc_div && has_htc) {
516  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
517  return ret;
518  }
519 
520  if (is_htc_div) {
521  str = XMLFindTagValue(entry, ibp->xip, INSDSEQ_MOLTYPE);
522  if (str) {
523  p = str;
524  if (*str == 'm' || *str == 'r')
525  p = str + 1;
526  else if (StringEquN(str, "pre-", 4))
527  p = str + 4;
528  else if (StringEquN(str, "transcribed ", 12))
529  p = str + 12;
530 
531  if (! StringEquN(p, "RNA", 3)) {
532  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
533  MemFree(str);
534  return ret;
535  }
536  MemFree(str);
537  }
538  }
539 
540  if (fli_kwd)
542 
543  /* will be used in flat file database
544  */
545  if (! gbb->GetDiv().empty()) {
546  if (gbb->GetDiv() == "EST") {
547  ibp->EST = true;
548  mol_info.SetTech(CMolInfo::eTech_est);
549  gbb->SetDiv("");
550  } else if (gbb->GetDiv() == "STS") {
551  ibp->STS = true;
552  mol_info.SetTech(CMolInfo::eTech_sts);
553  gbb->SetDiv("");
554  } else if (gbb->GetDiv() == "GSS") {
555  ibp->GSS = true;
557  gbb->SetDiv("");
558  } else if (gbb->GetDiv() == "HTC") {
559  ibp->HTC = true;
560  mol_info.SetTech(CMolInfo::eTech_htc);
561  gbb->SetDiv("");
562  } else if (gbb->GetDiv() == "SYN" && bio_src && bio_src->IsSetOrigin() &&
563  bio_src->GetOrigin() == 5) /* synthetic */
564  {
565  gbb->SetDiv("");
566  }
567  } else if (mol_info.IsSetTech()) {
568  if (mol_info.GetTech() == CMolInfo::eTech_est)
569  ibp->EST = true;
570  if (mol_info.GetTech() == CMolInfo::eTech_sts)
571  ibp->STS = true;
572  if (mol_info.GetTech() == CMolInfo::eTech_survey)
573  ibp->GSS = true;
574  if (mol_info.GetTech() == CMolInfo::eTech_htc)
575  ibp->HTC = true;
576  }
577 
578  if (mol_info.IsSetTech())
579  fta_remove_keywords(mol_info.GetTech(), gbb->SetKeywords());
580 
581  if (ibp->is_tpa)
582  fta_remove_tpa_keywords(gbb->SetKeywords());
583 
584  if (ibp->is_tsa)
585  fta_remove_tsa_keywords(gbb->SetKeywords(), pp->source);
586 
587  if (ibp->is_tls)
588  fta_remove_tls_keywords(gbb->SetKeywords(), pp->source);
589 
590  if (bio_src && bio_src->IsSetSubtype()) {
591  for (const auto& subtype : bio_src->GetSubtype()) {
592  if (subtype->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
593  fta_remove_env_keywords(gbb->SetKeywords());
594  break;
595  }
596  }
597  }
598 
599  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, gbb->SetExtra_accessions());
600 
601  if (gbb->IsSetDiv() &&
602  bio_src &&
603  bio_src->IsSetOrg() &&
604  bio_src->GetOrg().IsSetOrgname() &&
605  bio_src->GetOrg().GetOrgname().IsSetDiv() &&
606  bio_src->GetOrg().GetOrgname().GetDiv() == gbb->GetDiv()) {
607  gbb->ResetDiv();
608  }
609 
610  return gbb;
611 }
612 
613 /**********************************************************/
615 {
616  IndexblkPtr ibp;
617 
618  char* div;
619  char* molstr;
620 
621  ibp = pp->entrylist[pp->curindx];
622 
623  CRef<CMolInfo> mol_info(new CMolInfo);
624 
625  molstr = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_MOLTYPE);
626  div = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_DIVISION);
627 
628  if (StringEquN(div, "EST", 3))
629  mol_info->SetTech(CMolInfo::eTech_est);
630  else if (StringEquN(div, "STS", 3))
631  mol_info->SetTech(CMolInfo::eTech_sts);
632  else if (StringEquN(div, "GSS", 3))
633  mol_info->SetTech(CMolInfo::eTech_survey);
634  else if (StringEquN(div, "HTG", 3))
635  mol_info->SetTech(CMolInfo::eTech_htgs_1);
636  else if (ibp->is_wgs) {
637  if (ibp->is_tsa)
638  mol_info->SetTech(CMolInfo::eTech_tsa);
639  else if (ibp->is_tls)
640  mol_info->SetTech(CMolInfo::eTech_targeted);
641  else
642  mol_info->SetTech(CMolInfo::eTech_wgs);
643  } else if (ibp->is_tsa)
644  mol_info->SetTech(CMolInfo::eTech_tsa);
645  else if (ibp->is_tls)
646  mol_info->SetTech(CMolInfo::eTech_targeted);
647 
648  MemFree(div);
649  GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), molstr, pp, *entry, org_ref);
650  if (mol_info->GetBiomol() == CMolInfo::eBiomol_unknown) // not set
651  mol_info->ResetBiomol();
652 
653  if (molstr)
654  MemFree(molstr);
655 
656  return mol_info;
657 }
658 
659 /**********************************************************/
660 static void XMLFakeBioSources(XmlIndexPtr xip, const char* entry, CBioseq& bioseq, Parser::ESource source)
661 {
662  char* organism = nullptr;
663  char* taxonomy = nullptr;
664 
665  char* p;
666  char* q;
667 
668  for (; xip; xip = xip->next) {
669  if (xip->tag == INSDSEQ_ORGANISM && ! organism)
670  organism = XMLGetTagValue(entry, xip);
671  else if (xip->tag == INSDSEQ_TAXONOMY && ! taxonomy)
672  taxonomy = XMLGetTagValue(entry, xip);
673  }
674 
675  if (! organism) {
676  ErrPostStr(SEV_WARNING, ERR_ORGANISM_NoOrganism, "No <INSDSeq_organism> data in XML format file.");
677  if (taxonomy)
678  MemFree(taxonomy);
679  return;
680  }
681 
682  CRef<CBioSource> bio_src(new CBioSource);
683 
684  p = organism;
685  if (GetGenomeInfo(*bio_src, p) && bio_src->GetGenome() != CBioSource::eGenome_plasmid) {
686  while (*p != ' ' && *p != '\0')
687  p++;
688  while (*p == ' ')
689  p++;
690  }
691 
692  COrg_ref& org_ref = bio_src->SetOrg();
693 
694  if (source == Parser::ESource::EMBL) {
695  q = StringChr(p, '(');
696  if (q && q > p) {
697  for (q--; *q == ' ' || *q == '\t'; q--)
698  if (q == p)
699  break;
700  if (*q != ' ' && *q != '\t')
701  q++;
702  if (q > p) {
703  *q = '\0';
704  org_ref.SetCommon(p);
705  }
706  }
707  }
708 
709  org_ref.SetTaxname(p);
710  MemFree(organism);
711 
712  if (org_ref.GetTaxname() == "Unknown.") {
713  string& taxname = org_ref.SetTaxname();
714  taxname = taxname.substr(0, taxname.size() - 1);
715  }
716 
717  if (taxonomy) {
718  org_ref.SetOrgname().SetLineage(taxonomy);
719  }
720 
721  CRef<CSeqdesc> descr(new CSeqdesc);
722  descr->SetSource(*bio_src);
723  bioseq.SetDescr().Set().push_back(descr);
724 }
725 
726 /**********************************************************/
727 static void XMLGetDescrComment(char* offset)
728 {
729  char* p;
730  char* q;
731 
732  for (p = offset; *p == '\n' || *p == ' ';)
733  p++;
734  if (p > offset)
735  fta_StringCpy(offset, p);
736 
737  for (p = offset, q = offset; *p != '\0';) {
738  if (*p != '\n') {
739  *q++ = *p++;
740  continue;
741  }
742 
743  *q++ = '~';
744  for (p++; *p == ' ';)
745  p++;
746  }
747  *q = '\0';
748 
749  for (p = offset;;) {
750  p = StringStr(p, "; ");
751  if (! p)
752  break;
753  for (p += 2, q = p; *q == ' ';)
754  q++;
755  if (q > p)
756  fta_StringCpy(p, q);
757  }
758 
759  for (p = offset; *p == ' ';)
760  p++;
761  if (p > offset)
762  fta_StringCpy(offset, p);
763  for (p = offset; *p != '\0';)
764  p++;
765 
766  if (p > offset) {
767  for (p--;; p--) {
768  if (*p == ' ' || *p == '\t' || *p == ';' || *p == ',' ||
769  *p == '.' || *p == '~') {
770  if (p > offset)
771  continue;
772  *p = '\0';
773  }
774  break;
775  }
776  if (*p != '\0') {
777  p++;
778  if (StringEquN(p, "...", 3))
779  p[3] = '\0';
780  else if (StringChr(p, '.')) {
781  *p = '.';
782  p[1] = '\0';
783  } else
784  *p = '\0';
785  }
786  }
787 }
788 
789 /**********************************************************/
790 static void XMLGetDescr(ParserPtr pp, DataBlkPtr entry, CBioseq& bioseq)
791 {
792  IndexblkPtr ibp;
793 
794  DataBlkPtr dbp;
795  DataBlkPtr dbpnext;
796 
797  char* crdate;
798  char* update;
799  char* offset;
800  char* str;
801  char* p;
802  char* q;
803  string gbdiv;
804 
805  ibp = pp->entrylist[pp->curindx];
806 
807  CBioSource* bio_src = nullptr;
808  COrg_ref* org_ref = nullptr;
809 
810  /* ORGANISM
811  */
812  for (auto& descr : bioseq.SetDescr().Set()) {
813  if (descr->IsSource()) {
814  bio_src = &(descr->SetSource());
815  if (bio_src->IsSetOrg())
816  org_ref = &bio_src->SetOrg();
817  break;
818  }
819  }
820 
821  /* MolInfo from LOCUS line
822  */
823  CRef<CMolInfo> mol_info = XMLGetMolInfo(pp, entry, org_ref);
824 
825  /* DEFINITION data ==> descr_title
826  */
828  string title;
829 
830  if (str) {
831  for (p = str; *p == ' ';)
832  p++;
833  if (p > str)
834  fta_StringCpy(str, p);
835  if (pp->xml_comp && pp->source != Parser::ESource::EMBL) {
836  p = StringRChr(str, '.');
837  if (! p || p[1] != '\0') {
838  string s = str;
839  s += '.';
840  MemFree(str);
841  str = StringSave(s);
842  p = nullptr;
843  }
844  }
845 
846  title = str;
847  MemFree(str);
848  str = nullptr;
849 
850  CRef<CSeqdesc> descr(new CSeqdesc);
851  descr->SetTitle(title);
852  bioseq.SetDescr().Set().push_back(descr);
853 
854  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL &&
855  StringEquN(title.c_str(), "TPA:", 4)) {
856  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA:\" prefix is present on its definition line. Entry dropped.");
857  ibp->drop = true;
858  return;
859  }
860 
861  if (ibp->is_tsa == false && StringEquN(title.c_str(), "TSA:", 4)) {
862  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA:\" prefix is present on its definition line. Entry dropped.");
863  ibp->drop = true;
864  return;
865  }
866 
867  if (ibp->is_tls == false && StringEquN(title.c_str(), "TLS:", 4)) {
868  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS:\" prefix is present on its definition line. Entry dropped.");
869  ibp->drop = true;
870  return;
871  }
872  }
873 
874  if (ibp->is_tpa &&
875  (title.empty() || ! StringEquN(title.c_str(), "TPA:", 4))) {
876  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA:\" prefix on its definition line. Entry dropped.");
877  ibp->drop = true;
878  return;
879  }
880 
881  if (ibp->is_tsa &&
882  (title.empty() || ! StringEquN(title.c_str(), "TSA:", 4))) {
883  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA:\" prefix on its definition line. Entry dropped.");
884  ibp->drop = true;
885  return;
886  }
887 
888  if (ibp->is_tls &&
889  (title.empty() || ! StringEquN(title.c_str(), "TLS:", 4))) {
890  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS:\" prefix on its definition line. Entry dropped.");
891  ibp->drop = true;
892  return;
893  }
894 
895  /* REFERENCE
896  */
897  /* pub should be before GBblock because we need patent ref
898  */
899  dbp = XMLBuildRefDataBlk(entry->mOffset, ibp->xip, ParFlat_REF_END);
900  for (; dbp; dbp = dbpnext) {
901  dbpnext = dbp->mpNext;
902 
903  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, 0);
904  if (pubdesc.NotEmpty()) {
905  CRef<CSeqdesc> descr(new CSeqdesc);
906  descr->SetPub(*pubdesc);
907  bioseq.SetDescr().Set().push_back(descr);
908  }
909 
910  dbp->SimpleDelete();
911  }
912 
914  for (; dbp; dbp = dbpnext) {
915  dbpnext = dbp->mpNext;
916 
917  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, 0);
918  if (pubdesc.NotEmpty()) {
919  CRef<CSeqdesc> descr(new CSeqdesc);
920  descr->SetPub(*pubdesc);
921  bioseq.SetDescr().Set().push_back(descr);
922  }
923 
924  dbp->SimpleDelete();
925  }
926 
927  TStringList dr_ena,
928  dr_biosample;
929 
930  CRef<CEMBL_block> embl;
931  CRef<CGB_block> gbb;
932 
933  if (pp->source == Parser::ESource::EMBL)
934  embl = XMLGetEMBLBlock(pp, entry->mOffset, *mol_info, gbdiv, bio_src, dr_ena, dr_biosample);
935  else
936  gbb = XMLGetGBBlock(pp, entry->mOffset, *mol_info, bio_src);
937 
938  CRef<CUser_object> dbuop;
939  if (! dr_ena.empty() || ! dr_biosample.empty())
940  fta_build_ena_user_object(bioseq.SetDescr().Set(), dr_ena, dr_biosample, dbuop);
941 
942  if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
943  CRef<CSeqdesc> descr(new CSeqdesc);
944  descr->SetMolinfo(*mol_info);
945  bioseq.SetDescr().Set().push_back(descr);
946  }
947 
948  if (pp->source == Parser::ESource::EMBL) {
949  if (embl.Empty()) {
950  ibp->drop = true;
951  return;
952  }
953  } else if (gbb.Empty()) {
954  ibp->drop = true;
955  return;
956  }
957 
958  if (pp->source == Parser::ESource::EMBL) {
959  if (StringEquNI(ibp->division, "CON", 3))
960  fta_add_hist(pp, bioseq, embl->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, true, ibp->acnum);
961  else
962  fta_add_hist(pp, bioseq, embl->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, false, ibp->acnum);
963 
964  if (embl->GetExtra_acc().empty())
965  embl->ResetExtra_acc();
966  } else {
967  if (StringEquNI(ibp->division, "CON", 3))
968  fta_add_hist(pp, bioseq, gbb->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, true, ibp->acnum);
969  else
970  fta_add_hist(pp, bioseq, gbb->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, false, ibp->acnum);
971  }
972 
973  if (pp->source == Parser::ESource::EMBL) {
974  if (! gbdiv.empty()) {
975  gbb.Reset(new CGB_block);
976  gbb->SetDiv(gbdiv);
977  gbdiv.clear();
978  }
979 
980  CRef<CSeqdesc> descr(new CSeqdesc);
981  descr->SetEmbl(*embl);
982  bioseq.SetDescr().Set().push_back(descr);
983  }
984 
986  if (! offset && ibp->is_tpa && ibp->is_wgs == false) {
987  if (ibp->inferential || ibp->experimental) {
988  if (! fta_dblink_has_sra(dbuop)) {
989  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA:%s record lacks both AH/PRIMARY linetype and Sequence Read Archive links. Entry dropped.", (ibp->inferential == false) ? "experimental" : "inferential");
990  ibp->drop = true;
991  return;
992  }
993  } else if (ibp->specialist_db == false) {
994  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA record lacks required AH/PRIMARY linetype. Entry dropped.");
995  ibp->drop = true;
996  return;
997  }
998  }
999 
1000  if (offset) {
1001  if (! fta_parse_tpa_tsa_block(bioseq, offset, ibp->acnum, ibp->vernum, 10, 0, ibp->is_tpa)) {
1002  ibp->drop = true;
1003  MemFree(offset);
1004  return;
1005  }
1006  MemFree(offset);
1007  }
1008 
1009  if (gbb.NotEmpty()) {
1010  if (pp->taxserver == 1 && gbb->IsSetDiv())
1011  fta_fix_orgref_div(bioseq.SetAnnot(), org_ref, *gbb);
1012 
1013  CRef<CSeqdesc> descr(new CSeqdesc);
1014  descr->SetGenbank(*gbb);
1015  bioseq.SetDescr().Set().push_back(descr);
1016  }
1017 
1018  /* COMMENT data
1019  */
1021  if (offset) {
1022  bool bad = false;
1023  TUserObjVector user_objs;
1024 
1025  fta_parse_structured_comment(offset, bad, user_objs);
1026 
1027  if (bad) {
1028  ibp->drop = true;
1029  MemFree(offset);
1030  return;
1031  }
1032 
1033  for (auto& user_obj : user_objs) {
1034  CRef<CSeqdesc> descr(new CSeqdesc);
1035  descr->SetUser(*user_obj);
1036  bioseq.SetDescr().Set().push_back(descr);
1037  }
1038 
1040  if (pp->xml_comp) {
1041  for (q = offset, p = q; *p != '\0';) {
1042  if (*p == ';' && (p[1] == ' ' || p[1] == '~'))
1043  *p = ' ';
1044  if (*p == '~' || *p == ' ') {
1045  *q++ = ' ';
1046  for (p++; *p == ' ' || *p == '~';)
1047  p++;
1048  } else
1049  *q++ = *p++;
1050  }
1051  *q = '\0';
1052  }
1053 
1054  if (offset[0] != 0) {
1055  CRef<CSeqdesc> descr(new CSeqdesc);
1056  descr->SetComment(offset);
1057  bioseq.SetDescr().Set().push_back(descr);
1058  }
1059  MemFree(offset);
1060  }
1061 
1062  /* DATE
1063  */
1064  if (pp->no_date) /* -N in command line means no date */
1065  return;
1066 
1067  CRef<CDate_std> std_upd_date,
1068  std_cre_date;
1069 
1070  if (pp->date) /* -L in command line means replace
1071  date */
1072  {
1073  CTime cur_time(CTime::eCurrent);
1074 
1075  std_upd_date.Reset(new CDate_std);
1076  std_upd_date->SetToTime(cur_time);
1077 
1078  std_cre_date.Reset(new CDate_std);
1079  std_cre_date->SetToTime(cur_time);
1080 
1081  update = nullptr;
1082  crdate = nullptr;
1083  } else {
1084  update = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_UPDATE_DATE);
1085  if (update)
1086  std_upd_date = GetUpdateDate(update, pp->source);
1087 
1088  crdate = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_CREATE_DATE);
1089  if (crdate)
1090  std_cre_date = GetUpdateDate(crdate, pp->source);
1091  }
1092 
1093  if (std_upd_date.NotEmpty()) {
1094  CRef<CSeqdesc> descr(new CSeqdesc);
1095  descr->SetUpdate_date().SetStd(*std_upd_date);
1096  bioseq.SetDescr().Set().push_back(descr);
1097 
1098  if (std_cre_date.NotEmpty() && std_cre_date->Compare(*std_upd_date) == CDate::eCompare_after) {
1099  ErrPostEx(SEV_ERROR, ERR_DATE_IllegalDate, "Update-date \"%s\" precedes create-date \"%s\".", update, crdate);
1100  }
1101  }
1102 
1103  if (std_cre_date.NotEmpty()) {
1104  if (pp->xml_comp == false || pp->source == Parser::ESource::EMBL) {
1105  CRef<CSeqdesc> descr(new CSeqdesc);
1106  descr->SetCreate_date().SetStd(*std_cre_date);
1107  bioseq.SetDescr().Set().push_back(descr);
1108  }
1109  }
1110 
1111  if (update)
1112  MemFree(update);
1113  if (crdate)
1114  MemFree(crdate);
1115 }
1116 
1117 /**********************************************************/
1118 static void XMLGetDivision(const char* entry, IndexblkPtr ibp)
1119 {
1120  char* div;
1121 
1122  if (! ibp || ! entry)
1123  return;
1124 
1125  div = XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION);
1126  if (! div)
1127  return;
1128  div[3] = '\0';
1129  StringCpy(ibp->division, div);
1130  MemFree(div);
1131 }
1132 
1133 /**********************************************************/
/* NOTE(review): the line carrying this function's signature is missing from
 * this listing (its numbering jumps from 1133 to 1135). The body below uses
 * a parser context "pp" and returns bool.
 *
 * Walks pp->entrylist[0 .. pp->indx), builds a Bioseq for every indexed
 * entry and moves the finished Seq-entries to pp->entries. Returns false as
 * soon as XMLLoadEntry() fails for an entry, true otherwise. The numbers of
 * parsed, over-limit and skipped entries are reported at the end.
 */
1135 {
1136  Int4 i;
1137  Int4 imax;
1138  Int4 j;
1139  Int4 segindx;
1140  Int4 total = 0;
1141  Int4 total_long = 0;
1142  Int4 total_dropped = 0;
1143  char* div;
1144  char* entry;
1145  EntryBlkPtr ebp;
1146 
1147  TEntryList seq_entries;
1148 
1149  CSeq_loc locs;
1150 
1151  bool seq_long = false;
1152  IndexblkPtr ibp;
1153  IndexblkPtr tibp;
1154  DataBlkPtr dbp;
1155 
1156  /* set up sequence alphabets
1157  */
1158  auto dnaconv = GetDNAConv();
1159  auto protconv = GetProteinConv();
1160 
      /* segindx is the entrylist index at which the current segmented set
       * starts: it is set when an entry has segnum == 1, and again below for
       * sets with fewer than two members.
       */
1161  segindx = -1;
1162 
1163  for (imax = pp->indx, i = 0; i < imax; i++) {
1164  pp->curindx = i;
1165  ibp = pp->entrylist[i];
1166 
1167  err_install(ibp, pp->accver);
1168 
1169  if (ibp->segnum == 1)
1170  segindx = i;
1171 
      /* Only a stand-alone entry (segnum == 0) is skipped right away. A
       * dropped member of a segmented set is still processed, so that the
       * whole set can be rejected once its last member is reached.
       */
1172  if (ibp->drop && ibp->segnum == 0) {
1173  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1174  total_dropped++;
1175  continue;
1176  }
1177 
      /* NOTE(review): the listing's numbering skips a line inside the branch
       * below (1179 -> 1181), so one statement before the return is not
       * visible here.
       */
1178  entry = XMLLoadEntry(pp, false);
1179  if (! entry) {
1181  return false;
1182  }
1183 
1184  XMLGetDivision(entry, ibp);
1185 
1186  if (StringEqu(ibp->division, "TSA")) {
1187  if (ibp->tsa_allowed == false)
1188  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
1189  ibp->is_tsa = true;
1190  }
1191 
1192  XMLCheckContigEverywhere(ibp, pp->source);
1193  if (ibp->drop && ibp->segnum == 0) {
1194  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1195  MemFree(entry);
1196  total_dropped++;
1197  continue;
1198  }
1199 
      /* Create an empty Bioseq, wrap it in a Seq-entry held by the entry
       * block and register it with the scope. dbp points at both the entry
       * block and the raw entry text.
       */
1200  ebp = new EntryBlk();
1201 
1202  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
1203  ebp->seq_entry.Reset(new CSeq_entry);
1204  ebp->seq_entry->SetSeq(*bioseq);
1205  GetScope().AddBioseq(*bioseq);
1206 
1207  dbp = new DataBlk();
1208  dbp->mpData = ebp;
1209  dbp->mOffset = entry;
1210  dbp->len = StringLen(entry);
1211 
      /* The conversion table passed to XMLGetInst is chosen by ibp->is_prot.
       */
1212  if (! XMLGetInst(pp, dbp, ibp->is_prot ? protconv.get() : dnaconv.get(), *bioseq)) {
1213  ibp->drop = true;
1214  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data. Entry dropped.");
1215  if (ibp->segnum == 0) {
1216  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1217  delete dbp;
1218  MemFree(entry);
1219  total_dropped++;
1220  continue;
1221  }
1222  }
1223 
1224  XMLFakeBioSources(ibp->xip, dbp->mOffset, *bioseq, pp->source);
1225  LoadFeat(pp, *dbp, *bioseq);
1226 
1227  if (! bioseq->IsSetAnnot() && ibp->drop && ibp->segnum == 0) {
1228  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1229  delete dbp;
1230  MemFree(entry);
1231  total_dropped++;
1232  continue;
1233  }
1234 
1235  XMLGetDescr(pp, dbp, *bioseq);
1236 
1237  if (ibp->drop && ibp->segnum == 0) {
1238  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1239  delete dbp;
1240  MemFree(entry);
1241  total_dropped++;
1242  continue;
1243  }
1244 
1245  fta_set_molinfo_completeness(*bioseq, ibp);
1246 
1247  if (ibp->is_tsa)
1248  fta_tsa_tls_comment_dblink_check(*bioseq, true);
1249 
1250  if (ibp->is_tls)
1251  fta_tsa_tls_comment_dblink_check(*bioseq, false);
1252 
      /* Nucleotide sequences only: convert to delta form according to the
       * sequence representation, ibp->gaps and ibp->htg.
       */
1253  if (bioseq->GetInst().IsNa()) {
1254  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
1255  if (ibp->gaps)
1256  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1257  else if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
1258  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
1259  SeqToDelta(*bioseq, ibp->htg);
1260  } else if (ibp->gaps)
1261  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1262  }
1263 
1264  if (no_date(pp->format, bioseq->GetDescr().Get()) &&
1265  pp->debug == false && pp->no_date == false &&
1266  pp->xml_comp == false && pp->source != Parser::ESource::USPTO) {
1267  ibp->drop = true;
1268  ErrPostStr(SEV_ERROR, ERR_DATE_IllegalDate, "Illegal create date. Entry dropped.");
1269  if (ibp->segnum == 0) {
1270  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1271  delete dbp;
1272  MemFree(entry);
1273  total_dropped++;
1274  continue;
1275  }
1276  }
1277 
      /* Quality scores are fetched only in accession.version mode and only
       * if none are attached yet: the two callbacks are tried first, then
       * the quality-score file.
       */
1278  if (dbp->mpQscore.empty() && pp->accver) {
1279  if (pp->ff_get_qscore)
1280  dbp->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
1281  else if (pp->ff_get_qscore_pp)
1282  dbp->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
1283  else if (pp->qsfd && ibp->qslength > 0)
1284  dbp->mpQscore = GetQSFromFile(pp->qsfd, ibp);
1285  }
1286 
      /* A quality-score parsing failure drops the entry unless
       * pp->ign_bad_qs is set, in which case it is only reported.
       */
1287  if (! QscoreToSeqAnnot(dbp->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, true)) {
1288  if (pp->ign_bad_qs == false) {
1289  ibp->drop = true;
1290  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore. Entry dropped.");
1291  if (ibp->segnum == 0) {
1292  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1293  delete dbp;
1294  MemFree(entry);
1295  total_dropped++;
1296  continue;
1297  }
1298  } else {
1299  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore.");
1300  }
1301  }
1302 
1303  dbp->mpQscore.clear();
1304 
1305  if (ibp->psip.NotEmpty()) {
1306  CRef<CSeq_id> id(new CSeq_id);
1307  id->SetPatent(*ibp->psip);
1308  bioseq->SetId().push_back(id);
1309  ibp->psip.Reset();
1310  }
1311 
1312  /* add PatentSeqId if patent is found in reference
1313  */
      /* Entries without references are dropped, except for FlyBase entries,
       * RefSeq "NW_" accessions and WGS entries, which are only reported.
       */
1314  if (no_reference(*bioseq) && ! pp->debug) {
1315  if (pp->source == Parser::ESource::Flybase) {
1316  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for entry from FlyBase. Continue anyway.");
1317  } else if (pp->source == Parser::ESource::Refseq &&
1318  StringEquN(ibp->acnum, "NW_", 3)) {
1319  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for RefSeq's NW_ entry. Continue anyway.");
1320  } else if (ibp->is_wgs) {
1321  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for WGS entry. Continue anyway.");
1322  } else {
1323  ibp->drop = true;
1324  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references. Entry dropped.");
1325  if (ibp->segnum == 0) {
1326  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1327  delete dbp;
1328  MemFree(entry);
1329  total_dropped++;
1330  continue;
1331  }
1332  }
1333  }
1334 
      /* This branch finishes the accumulated seq_entries. The code inside
       * distinguishes stand-alone entries (segnum == 0) from members of a
       * segmented set (segnum != 0). The else branch at the bottom only
       * queues the entry in seq_entries.
       */
1335  if (ibp->segnum == ibp->segtotal) {
1336  seq_entries.push_back(ebp->seq_entry);
1337  ebp->seq_entry.Reset();
1338 
1339  if (ibp->segnum < 2) {
1340  if (ibp->segnum != 0) {
1341  ErrPostEx(SEV_WARNING, ERR_SEGMENT_OnlyOneMember, "Segmented set contains only one member.");
1342  }
1343  segindx = i;
1344  } else {
1345  GetSeqExt(pp, locs);
1346  // LCOV_EXCL_START
1347  // Excluded per Mark's request on 12/14/2016
1348  BuildBioSegHeader(pp, seq_entries, locs);
1349  // LCOV_EXCL_STOP
1350  }
1351 
1352  /* reject the whole set if any one entry was rejected
1353  */
1354  if (ibp->segnum != 0) {
1355  div = pp->entrylist[segindx]->division;
1356  for (j = segindx; j <= i; j++) {
1357  tibp = pp->entrylist[j];
1358  err_install(tibp, pp->accver);
1359  if (! StringEqu(div, tibp->division)) {
1360  ErrPostEx(SEV_WARNING, ERR_DIVISION_Mismatch, "Division different in segmented set: %s: %s", div, tibp->division);
1361  }
1362  if (tibp->drop) {
1363  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set");
1364  break;
1365  }
1366  }
      /* j <= i here means the scan above stopped early on a dropped member.
       */
1367  if (j <= i) {
1368  for (j = segindx; j <= i; j++) {
1369  tibp = pp->entrylist[j];
1370  err_install(tibp, pp->accver);
1371  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1372  total_dropped++;
1373  }
1374 
1375  seq_entries.clear();
1376 
1377  delete dbp;
1378  MemFree(entry);
1379  GetScope().ResetHistory();
1380  continue;
1381  }
1382  }
1383 
1384  if (pp->source == Parser::ESource::USPTO) {
1385  GeneRefFeats gene_refs;
1386  gene_refs.valid = false;
1387  ProcNucProt(pp, seq_entries, gene_refs);
1388  } else
1389  DealWithGenes(seq_entries, pp);
1390 
      /* An empty seq_entries list after the step above is treated as a
       * rejection of the entry or of the whole segmented set.
       */
1391  if (seq_entries.empty()) {
1392  if (ibp->segnum != 0) {
1393  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set.");
1394  for (j = segindx; j <= i; j++) {
1395  tibp = pp->entrylist[j];
1396  err_install(tibp, pp->accver);
1397  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1398  total_dropped++;
1399  }
1400  } else {
1401  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1402  total_dropped++;
1403  }
1404  delete dbp;
1405  MemFree(entry);
1406  GetScope().ResetHistory();
1407  continue;
1408  }
1409 
1410  /* remove out all the features if their seqloc has
1411  * "join" or "order" among other segments, to the annot
1412  * which in class = parts
1413  */
1414  if (ibp->segnum != 0)
1415  // LCOV_EXCL_START
1416  // Excluded per Mark's request on 12/14/2016
1417  CheckFeatSeqLoc(seq_entries);
1418  // LCOV_EXCL_STOP
1419 
1420  fta_find_pub_explore(pp, seq_entries);
1421 
1422  /* change qual "citation" on features to SeqFeat.cit
1423  * find citation in the list by serial_number.
1424  * If serial number not found remove /citation
1425  */
1426  ProcessCitations(seq_entries);
1427 
1428  /* check for long sequences in each segment
1429  */
      /* pp->limit == 0 disables the check. Entries with htg 1, 2 or 4 only
       * get a warning; any other over-limit sequence sets seq_long.
       */
1430  if (pp->limit != 0) {
1431  if (ibp->segnum != 0) {
1432  for (j = segindx; j <= i; j++) {
1433  tibp = pp->entrylist[j];
1434  err_install(tibp, pp->accver);
1435  if (tibp->bases <= (size_t)pp->limit)
1436  continue;
1437 
1438  if (tibp->htg == 1 || tibp->htg == 2 || tibp->htg == 4) {
1439  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", tibp->locusname, tibp->acnum, pp->limit);
1440  } else {
1441  seq_long = true;
1442  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", tibp->locusname, tibp->acnum, pp->limit);
1443  }
1444  }
1445  } else if (ibp->bases > (size_t)pp->limit) {
1446  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4) {
1447  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
1448  } else {
1449  seq_long = true;
1450  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", ibp->locusname, ibp->acnum, pp->limit);
1451  }
1452  }
1453  }
1454 
1455  if (pp->convert) {
1456  if (pp->cleanup <= 1) {
1457  FinalCleanup(seq_entries);
1458 
1459  if (pp->qamode && ! seq_entries.empty())
1460  fta_remove_cleanup_user_object(*(*seq_entries.begin()));
1461  }
1462 
1463  MaybeCutGbblockSource(seq_entries);
1464  }
1465 
1466  EntryCheckDivCode(seq_entries, pp);
1467 
1468  if (pp->xml_comp)
1469  fta_set_strandedness(seq_entries);
1470 
1471  if (fta_EntryCheckGBBlock(seq_entries)) {
1472  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty, "Attention: GBBlock is not empty");
1473  }
1474 
1475  /* check for identical features
1476  */
1477  if (pp->qamode) {
1478  fta_sort_descr(seq_entries);
1479  fta_sort_seqfeat_cit(seq_entries);
1480  }
1481 
1482  if (pp->citat) {
1483  StripSerialNumbers(seq_entries);
1484  }
1485 
1486  PackEntries(seq_entries);
1487  CheckDupDates(seq_entries);
1488 
1489  if (ibp->segnum != 0)
1490  for (j = segindx; j <= i; j++)
1491  err_install(pp->entrylist[j], pp->accver);
1492 
      /* Over-limit entries are counted in total_long and are not moved to
       * pp->entries; everything else is spliced into pp->entries and counted
       * in total.
       * NOTE(review): the summary message at the end describes the LONG
       * count as included in SUCCEEDED, but total is not incremented on the
       * seq_long path, and the "parsed successfully" message below is posted
       * on both paths - confirm this is intended.
       */
1493  if (seq_long) {
1494  seq_long = false;
1495  if (ibp->segnum != 0)
1496  total_long += (i - segindx + 1);
1497  else
1498  total_long++;
1499  } else {
1500  pp->entries.splice(pp->entries.end(), seq_entries);
1501 
1502  if (ibp->segnum != 0)
1503  total += (i - segindx + 1);
1504  else
1505  total++;
1506  }
1507 
1508  if (ibp->segnum != 0) {
1509  for (j = segindx; j <= i; j++) {
1510  tibp = pp->entrylist[j];
1511  err_install(tibp, pp->accver);
1512  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", tibp->locusname, tibp->acnum);
1513  }
1514  } else {
1515  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
1516  }
1517 
1518  seq_entries.clear();
1519  } else {
1520  GetSeqExt(pp, locs);
1521 
1522  seq_entries.push_back(ebp->seq_entry);
1523  ebp->seq_entry.Reset();
1524  }
1525 
      /* Normal end-of-iteration cleanup. The early "continue" paths that come
       * before the segnum == segtotal branch release dbp and entry but do
       * not call ResetHistory().
       */
1526  delete dbp;
1527  MemFree(entry);
1528  GetScope().ResetHistory();
1529 
1530  } /* for, ascii block entries */
1531 
      /* NOTE(review): the listing's numbering skips line 1532 here, so one
       * statement before the summary message is not visible.
       */
1533 
1534  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d (including: LONG ones = %d); SKIPPED = %d.", total, total_long, total_dropped);
1535 
1536  return true;
1537 }
1538 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:220
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:505
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:913
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2796
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:793
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:339
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1118
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2699
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2821
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2751
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2718
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2862
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:387
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:302
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, char *location, const char *name, bool iscon)
Definition: add.cpp:2321
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
Definition: add.cpp:190
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2583
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3372
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3236
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3208
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3472
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3307
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2912
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1744
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3276
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1772
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2776
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2439
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1632
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3079
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3151
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2855
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2464
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1274
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2535
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1020
list< string > TStringList
Definition: cgictx.cpp:719
void ProcessCitations(TEntryList &seq_entries)
Definition: citation.cpp:307
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
Definition: Date_std.cpp:91
void SetToTime(const CTime &time, CDate::EPrecision prec=CDate::ePrecision_second)
Definition: Date_std.cpp:59
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CTime –.
Definition: ncbitime.hpp:296
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
string mpQscore
Definition: ftablock.h:334
void SimpleDelete()
Definition: ftablock.h:323
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
void fta_build_ena_user_object(CSeq_descr::Tdata &descrs, TStringList &dr_ena, TStringList &dr_biosample, CRef< CUser_object > &dbuop)
Definition: em_ascii.cpp:1507
CRef< CEMBL_block > XMLGetEMBLBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, string &gbdiv, CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
Definition: em_ascii.cpp:2411
void FinalCleanup(TEntryList &seq_entries)
Definition: fcleanup.cpp:377
#define ERR_SEQUENCE_BadData
Definition: flat2err.h:150
#define ERR_TPA_TpaSpansMissing
Definition: flat2err.h:593
#define ERR_ENTRY_LongSequence
Definition: flat2err.h:82
#define ERR_FORMAT_MissingContigFeature
Definition: flat2err.h:43
#define ERR_KEYWORD_ShouldNotBeTPA
Definition: flat2err.h:208
#define ERR_DIVISION_BadTSADivcode
Definition: flat2err.h:261
#define ERR_FORMAT_MissingSequenceData
Definition: flat2err.h:41
#define ERR_DIVISION_InvalidHTCKeyword
Definition: flat2err.h:254
#define ERR_KEYWORD_IllegalForCON
Definition: flat2err.h:210
#define ERR_DIVISION_MissingHTGKeywords
Definition: flat2err.h:249
#define ERR_QSCORE_FailedToParse
Definition: flat2err.h:577
#define ERR_ENTRY_LongHTGSSequence
Definition: flat2err.h:86
#define ERR_KEYWORD_MissingTSA
Definition: flat2err.h:216
#define ERR_DIVISION_BadTPADivcode
Definition: flat2err.h:257
#define ERR_REFERENCE_No_references
Definition: flat2err.h:289
#define ERR_KEYWORD_ShouldNotBeTLS
Definition: flat2err.h:218
#define ERR_ENTRY_GBBlock_not_Empty
Definition: flat2err.h:85
#define ERR_KEYWORD_HTGPlusENV
Definition: flat2err.h:217
#define ERR_DEFINITION_MissingTPA
Definition: flat2err.h:269
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_DEFINITION_MissingTLS
Definition: flat2err.h:273
#define ERR_KEYWORD_ESTSubstring
Definition: flat2err.h:204
#define ERR_KEYWORD_ConflictingKeywords
Definition: flat2err.h:207
#define ERR_DIVISION_ConDivLacksContig
Definition: flat2err.h:252
#define ERR_LOCATION_ContigHasNull
Definition: flat2err.h:397
#define ERR_SEGMENT_OnlyOneMember
Definition: flat2err.h:165
#define ERR_KEYWORD_ENV_NoMatchingQualifier
Definition: flat2err.h:214
#define ERR_KEYWORD_ShouldNotBeTSA
Definition: flat2err.h:215
#define ERR_KEYWORD_STSSubstring
Definition: flat2err.h:205
#define ERR_DIVISION_UnknownDivCode
Definition: flat2err.h:222
#define ERR_KEYWORD_MissingTLS
Definition: flat2err.h:219
#define ERR_DEFINITION_ShouldNotBeTSA
Definition: flat2err.h:270
#define ERR_SEGMENT_Rejected
Definition: flat2err.h:166
#define ERR_DIVISION_MissingHTCKeyword
Definition: flat2err.h:253
#define ERR_DIVISION_MappedtoCON
Definition: flat2err.h:248
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_FORMAT_ContigWithSequenceData
Definition: flat2err.h:42
#define ERR_KEYWORD_NoGeneExpressionKeywords
Definition: flat2err.h:213
#define ERR_DEFINITION_MissingTSA
Definition: flat2err.h:271
#define ERR_DEFINITION_ShouldNotBeTPA
Definition: flat2err.h:268
#define ERR_KEYWORD_MissingTPA
Definition: flat2err.h:209
#define ERR_DIVISION_ConDivInSegset
Definition: flat2err.h:251
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_DIVISION_Mismatch
Definition: flat2err.h:226
#define ERR_ORGANISM_NoOrganism
Definition: flat2err.h:184
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_DIVISION_HTCWrongMolType
Definition: flat2err.h:255
#define ERR_KEYWORD_ShouldNotBeCAGE
Definition: flat2err.h:211
#define ERR_DEFINITION_ShouldNotBeTLS
Definition: flat2err.h:272
#define ERR_TSA_UnexpectedPrimaryAccession
Definition: flat2err.h:609
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
#define INSDSEQ_TOPOLOGY
Definition: fta_xml.h:46
#define INSDSEQ_MOLTYPE
Definition: fta_xml.h:45
#define INSDSEQ_DEFINITION
Definition: fta_xml.h:52
DataBlkPtr XMLBuildRefDataBlk(char *entry, const XmlIndex *xip, int type)
Definition: xm_index.cpp:1490
char * XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:214
#define INSDSEQ_STRANDEDNESS
Definition: fta_xml.h:44
char * XMLGetTagValue(const char *entry, const XmlIndex *xip)
Definition: xm_index.cpp:202
#define INSDSEQ_COMMENT
Definition: fta_xml.h:64
#define INSDSEQ_ORGANISM
Definition: fta_xml.h:61
void XMLGetKeywords(const char *entry, const XmlIndex *xip, TKeywordList &keywords)
Definition: xm_index.cpp:1523
char * XMLLoadEntry(ParserPtr pp, bool err)
Definition: xm_index.cpp:968
#define INSDSEQ_KEYWORDS
Definition: fta_xml.h:58
#define INSDSEQ_TAXONOMY
Definition: fta_xml.h:62
#define INSDSEQ_CREATE_DATE
Definition: fta_xml.h:49
#define INSDSEQ_DIVISION
Definition: fta_xml.h:47
#define INSDSEQ_UPDATE_DATE
Definition: fta_xml.h:48
#define INSDSEQ_SOURCE
Definition: fta_xml.h:60
#define INSDSEQ_CONTIG
Definition: fta_xml.h:70
char * XMLConcatSubTags(const char *entry, const XmlIndex *xip, Int4 tag, Char sep)
Definition: xm_index.cpp:1549
#define INSDSEQ_PRIMARY
Definition: fta_xml.h:65
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:125
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:115
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:105
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:83
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:87
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:344
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:762
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
void CheckFeatSeqLoc(TEntryList &seq_entries)
Definition: gb_ascii.cpp:2377
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
Definition: genref.cpp:2957
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
void SetCommon(const TCommon &value)
Assign a value to Common data member.
Definition: Org_ref_.hpp:428
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetTech(void)
Reset Tech data member.
Definition: MolInfo_.hpp:484
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
ETopology
topology of molecule
Definition: Seq_inst_.hpp:121
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seqdesc_.cpp:456
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
EStrand
strandedness in living organism
Definition: Seq_inst_.hpp:133
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TCreate_date & SetCreate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:478
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_REF_END
Definition: index.h:60
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:611
Int2 XMLCheckSTRAND(const char *str)
Definition: indx_blk.cpp:485
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:995
Int2 CheckDIV(const char *str)
Definition: indx_blk.cpp:532
Int2 XMLCheckTPG(const char *str)
Definition: indx_blk.cpp:491
int i
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: loadfeat.cpp:5094
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: loadfeat.cpp:4788
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
std::list< SeqLoc > TSeqLocList
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2673
void ProcNucProt(ParserPtr pp, TEntryList &seq_entries, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:2519
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data)
Definition: ref.cpp:2423
CRef< objects::CSeq_entry > seq_entry
Definition: ftablock.h:346
bool valid
Definition: nucprot.h:64
Char acnum[200]
Definition: ftablock.h:169
CRef< objects::CPatent_seq_id > psip
Definition: ftablock.h:193
Char division[4]
Definition: ftablock.h:174
bool is_mga
Definition: ftablock.h:202
bool tsa_allowed
Definition: ftablock.h:214
Int2 htg
Definition: ftablock.h:199
bool is_tls
Definition: ftablock.h:211
Int2 vernum
Definition: ftablock.h:170
bool is_tpa
Definition: ftablock.h:209
TKeywordList keywords
Definition: ftablock.h:243
bool is_prot
Definition: ftablock.h:225
bool is_wgs
Definition: ftablock.h:208
bool origin
Definition: ftablock.h:204
bool is_contig
Definition: ftablock.h:200
bool STS
Definition: ftablock.h:196
bool is_pat
Definition: ftablock.h:205
bool HTC
Definition: ftablock.h:198
bool drop
Definition: ftablock.h:185
bool experimental
Definition: ftablock.h:250
size_t bases
Definition: ftablock.h:175
bool inferential
Definition: ftablock.h:248
Uint2 segtotal
Definition: ftablock.h:178
bool is_tsa
Definition: ftablock.h:210
bool EST
Definition: ftablock.h:195
GapFeatsPtr gaps
Definition: ftablock.h:217
string wgssec
Definition: ftablock.h:239
bool specialist_db
Definition: ftablock.h:246
Uint2 segnum
Definition: ftablock.h:176
Char locusname[200]
Definition: ftablock.h:173
bool env_sample_qual
Definition: ftablock.h:222
XmlIndexPtr xip
Definition: ftablock.h:220
size_t qslength
Definition: ftablock.h:233
bool GSS
Definition: ftablock.h:197
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
char *(* ff_get_qscore)(const char *accession, Int2 v)
TEntryList entries
XmlIndex * next
Definition: ftablock.h:161
Int4 tag
Definition: ftablock.h:153
CScope & GetScope()
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:435
bool GetGenomeInfo(CBioSource &bsp, const Char *bptr)
Definition: utilfeat.cpp:225
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1663
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1692
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1225
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1677
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1366
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1352
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1321
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1381
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1284
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1585
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1652
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1191
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1396
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1109
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1536
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1466
USING_SCOPE(objects)
bool XMLAscii(ParserPtr pp)
Definition: xm_ascii.cpp:1134
bool XMLGetInst(ParserPtr pp, DataBlkPtr dbp, unsigned char *dnaconv, CBioseq &bioseq)
Definition: xm_ascii.cpp:189
static bool XMLGetInstContig(XmlIndexPtr xip, DataBlkPtr dbp, CBioseq &bioseq, ParserPtr pp)
Definition: xm_ascii.cpp:126
static void XMLGetDescr(ParserPtr pp, DataBlkPtr entry, CBioseq &bioseq)
Definition: xm_ascii.cpp:790
static CRef< CGB_block > XMLGetGBBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, CBioSource *bio_src)
Definition: xm_ascii.cpp:240
static CRef< CMolInfo > XMLGetMolInfo(ParserPtr pp, DataBlkPtr entry, COrg_ref *org_ref)
Definition: xm_ascii.cpp:614
static void XMLCheckContigEverywhere(IndexblkPtr ibp, Parser::ESource source)
Definition: xm_ascii.cpp:92
static void XMLGetDivision(const char *entry, IndexblkPtr ibp)
Definition: xm_ascii.cpp:1118
static void XMLFakeBioSources(XmlIndexPtr xip, const char *entry, CBioseq &bioseq, Parser::ESource source)
Definition: xm_ascii.cpp:660
static void XMLGetDescrComment(char *offset)
Definition: xm_ascii.cpp:727
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)
Definition: xutils.cpp:91
Modified on Sat Apr 13 11:45:21 2024 by modify_doxy.py rev. 669887