NCBI C++ ToolKit
xm_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: xm_ascii.cpp 99335 2023-03-13 13:48:10Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: xm_ascii.cpp
27  *
28  * Author: Sergey Bazhin
29  *
30  * File Description:
31  * Parse INSDSEQ from blocks to asn.
32  * Build XML format entry block.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <objects/seq/Seq_inst.hpp>
42 #include <objects/seq/Seq_ext.hpp>
48 #include <objmgr/scope.hpp>
49 #include <objects/seq/MolInfo.hpp>
55 #include <objects/seq/Pubdesc.hpp>
56 
57 
58 #include "index.h"
59 
60 #include "ftanet.h"
63 
64 #include "ftaerr.hpp"
65 #include "indx_blk.h"
66 #include "asci_blk.h"
67 #include "utilref.h"
68 #include "utilfeat.h"
69 #include "loadfeat.h"
70 #include "add.h"
71 #include "gb_ascii.h"
72 #include "nucprot.h"
73 #include "fta_qscore.h"
74 #include "em_ascii.h"
75 #include "citation.h"
76 #include "fcleanup.h"
77 #include "utilfun.h"
78 #include "ref.h"
79 #include "xgbparint.h"
80 #include "xutils.h"
81 #include "fta_xml.h"
82 
83 #ifdef THIS_FILE
84 # undef THIS_FILE
85 #endif
86 #define THIS_FILE "xm_ascii.cpp"
87 
90 
91 /**********************************************************/
93 {
94  if (! ibp)
95  return;
96 
97  bool condiv = (NStr::CompareNocase(ibp->division, "CON") == 0);
98 
99  if (condiv && ibp->segnum != 0) {
100  ErrPostEx(SEV_ERROR, ERR_DIVISION_ConDivInSegset, "Use of the CON division is not allowed for members of segmented set : %s|%s. Entry dropped.", ibp->locusname, ibp->acnum);
101  ibp->drop = true;
102  return;
103  }
104 
105  if (! condiv && ibp->is_contig == false && ibp->origin == false) {
106  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingSequenceData, "Required sequence data is absent. Entry dropped.");
107  ibp->drop = true;
108  } else if (! condiv && ibp->is_contig && ibp->origin == false) {
109  ErrPostEx(SEV_WARNING, ERR_DIVISION_MappedtoCON, "Division [%s] mapped to CON based on the existence of <INSDSeq_contig> line.", ibp->division);
110  } else if (ibp->is_contig && ibp->origin) {
112  ErrPostEx(SEV_INFO, ERR_FORMAT_ContigWithSequenceData, "The <INSDSeq_contig> linetype and sequence data are both present. Ignoring sequence data.");
113  } else {
114  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigWithSequenceData, "The <INSDSeq_contig> linetype and sequence data may not both be present in a sequence record.");
115  ibp->drop = true;
116  }
117  } else if (condiv && ibp->is_contig == false && ibp->origin == false) {
118  ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingContigFeature, "No <INSDSeq_contig> data in XML format file. Entry dropped.");
119  ibp->drop = true;
120  } else if (condiv && ibp->is_contig == false && ibp->origin) {
121  ErrPostEx(SEV_WARNING, ERR_DIVISION_ConDivLacksContig, "Division is CON, but <INSDSeq_contig> data have not been found.");
122  }
123 }
124 
125 /**********************************************************/
// XMLGetInstContig: fetches the INSDSEQ_CONTIG tag value, removes all
// newlines, tabs and blanks from it, rejects it when it contains an empty
// comma-delimited component, and parses the rest into a Seq-loc. When the
// parsed location is a mix it is converted to delta-seqs on the Bioseq and the
// representation is set to delta; otherwise the Seq-ext is reset.
// Returns false when the tag value is not found or contains a null component;
// returns true in every other case, including when parsing yields no location.
126 static bool XMLGetInstContig(XmlIndexPtr xip, DataBlkPtr dbp, CBioseq& bioseq, ParserPtr pp)
127 {
128  char* p;
129  char* q;
130  char* r;
131  bool locmap;
132  bool allow_crossdb_featloc;
133  Int4 i;
134  int numerr;
135 
136  p = XMLFindTagValue(dbp->mOffset, xip, INSDSEQ_CONTIG);
137  if (! p)
138  return false;
139 
     // Compact the string in place: copy everything except '\n', '\t' and ' '.
140  for (q = p, r = p; *q != '\0'; q++)
141  if (*q != '\n' && *q != '\t' && *q != ' ')
142  *r++ = *q;
143  *r = '\0';
144 
     // Stop at the first ",,", "(," or ",)" pair, i.e. an empty component.
145  for (q = p; *q != '\0'; q++)
146  if ((q[0] == ',' && q[1] == ',') || (q[0] == '(' && q[1] == ',') ||
147  (q[0] == ',' && q[1] == ')'))
148  break;
149  if (*q != '\0') {
150  ErrPostEx(SEV_REJECT, ERR_LOCATION_ContigHasNull, "The join() statement for this record's contig line contains one or more comma-delimited components which are null.");
151  MemFree(p);
152  return false;
153  }
154 
155  if (pp->buf)
156  MemFree(pp->buf);
157  pp->buf = nullptr;
158 
159  CRef<CSeq_loc> loc = xgbparseint_ver(p, locmap, numerr, bioseq.GetId(), pp->accver);
160 
     // An unparsable location is not treated as a failure here.
161  if (loc.Empty()) {
162  MemFree(p);
163  return true;
164  }
165 
     // Save the parser flag and force it on for the duration of the id fix-up.
166  allow_crossdb_featloc = pp->allow_crossdb_featloc;
167  pp->allow_crossdb_featloc = true;
168 
169  TSeqLocList locs;
170  locs.push_back(loc);
171  i = fta_fix_seq_loc_id(locs, pp, p, nullptr, true);
     // NOTE(review): the listing's numbering jumps from 172 to 174 here, so the
     // statement this `if` controls appears to be missing from this copy. As
     // shown, the `if` would govern the flag restore below - confirm against
     // the repository before relying on this text.
172  if (i > 999)
174 
175  pp->allow_crossdb_featloc = allow_crossdb_featloc;
176 
177  if (loc->IsMix()) {
178  XGappedSeqLocsToDeltaSeqs(loc->GetMix(), bioseq.SetInst().SetExt().SetDelta().Set());
179  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
180  } else
181  bioseq.SetInst().ResetExt();
182 
183  MemFree(p);
184 
185  return true;
186 }
187 
188 /**********************************************************/
// XMLGetInst: fills the Seq-inst of `bioseq` for the current entry
// (pp->entrylist[pp->curindx]). Topology and strandedness are taken from the
// first INSDSEQ_TOPOLOGY / INSDSEQ_STRANDEDNESS tags found, the sequence data
// is loaded through GetSeqData, and for contig entries XMLGetInstContig is
// called as well.
// Returns false when GetSeqData fails, or when the entry is a contig and
// XMLGetInstContig fails; true otherwise.
189 bool XMLGetInst(ParserPtr pp, DataBlkPtr dbp, unsigned char* dnaconv, CBioseq& bioseq)
190 {
191  IndexblkPtr ibp;
192  XmlIndexPtr xip;
193  Int2 topology;
194  Int2 strand;
195  char* topstr;
196  char* strandstr;
197 
198  ibp = pp->entrylist[pp->curindx];
199  topstr = nullptr;
200  strandstr = nullptr;
     // Only the first occurrence of each tag is read.
201  for (xip = ibp->xip; xip; xip = xip->next) {
202  if (xip->tag == INSDSEQ_TOPOLOGY && ! topstr)
203  topstr = XMLGetTagValue(dbp->mOffset, xip);
204  else if (xip->tag == INSDSEQ_STRANDEDNESS && ! strandstr)
205  strandstr = XMLGetTagValue(dbp->mOffset, xip);
206  }
     // A missing tag is replaced by a freshly allocated blank string, so both
     // pointers are non-null from here on.
207  if (! topstr)
208  topstr = StringSave(" ");
209  if (! strandstr)
210  strandstr = StringSave(" ");
211 
212  CSeq_inst& inst = bioseq.SetInst();
     // NOTE(review): listing line 213 is absent from this copy; a statement
     // may be missing here - confirm against the repository.
214 
215  /* get linear, circular, tandem topology, blank is linear which = 1
216  */
217  topology = XMLCheckTPG(topstr);
218  if (topology > 1)
219  inst.SetTopology(static_cast<CSeq_inst::ETopology>(topology));
220 
     // Strand is only recorded when XMLCheckSTRAND returns a positive value.
221  strand = XMLCheckSTRAND(strandstr);
222  if (strand > 0)
223  inst.SetStrand(static_cast<CSeq_inst::EStrand>(strand));
224 
225  if (topstr)
226  MemFree(topstr);
227  if (strandstr)
228  MemFree(strandstr);
229 
     // The sequence alphabet depends on whether the entry is a protein.
230  if (! GetSeqData(pp, *dbp, bioseq, 0, dnaconv, ibp->is_prot ? eSeq_code_type_iupacaa : eSeq_code_type_iupacna))
231  return false;
232 
233  if (ibp->is_contig && ! XMLGetInstContig(ibp->xip, dbp, bioseq, pp))
234  return false;
235 
236  return true;
237 }
238 
239 /**********************************************************/
// XMLGetGBBlock: builds the GB-block for the current entry
// (pp->entrylist[pp->curindx]) from the XML tag values: source, keywords,
// division and extra accessions. While doing so it checks the special
// keywords against the entry flags and the division code, may change the
// tech value of `mol_info`, and sets the EST/STS/GSS/HTC flags on the index
// block.
// `bio_src` may be null; it is only consulted after a null check.
// Returns the populated block, or the empty CRef `ret` as soon as any check
// rejects the entry.
240 static CRef<CGB_block> XMLGetGBBlock(ParserPtr pp, const char* entry, CMolInfo& mol_info, CBioSource* bio_src)
241 {
242  CRef<CGB_block> gbb(new CGB_block),
243  ret;
244 
245  IndexblkPtr ibp;
246  char* bptr;
247  char* str;
248  char msg[4];
249  char* kw;
250  char* kwp;
251  Int2 div;
252  bool if_cds;
253 
254  bool pat_ref = false;
255  bool est_kwd = false;
256  bool sts_kwd = false;
257  bool gss_kwd = false;
258  bool htc_kwd = false;
259  bool fli_kwd = false;
260  bool wgs_kwd = false;
261  bool tpa_kwd = false;
262  bool tsa_kwd = false;
263  bool tls_kwd = false;
264  bool env_kwd = false;
265  bool mga_kwd = false;
266 
267  bool cancelled;
268  bool drop;
269  char* tempdiv;
270  Int2 thtg;
271  char* p;
272  Int4 i;
273 
274  ibp = pp->entrylist[pp->curindx];
275 
276  ibp->wgssec[0] = '\0';
277 
278  str = XMLFindTagValue(entry, ibp->xip, INSDSEQ_SOURCE);
279  if (str) {
     // When the source text ends in two periods, drop the last one.
280  p = StringRChr(str, '.');
281  if (p && p > str && p[1] == '\0' && *(p - 1) == '.')
282  *p = '\0';
283 
284  gbb->SetSource(str);
285  MemFree(str);
286  }
287 
     // Keywords already stored on the index block are moved into the GB-block;
     // otherwise they are read from the XML.
288  if (! ibp->keywords.empty()) {
289  gbb->SetKeywords().swap(ibp->keywords);
290  ibp->keywords.clear();
291  } else
292  XMLGetKeywords(entry, ibp->xip, gbb->SetKeywords());
293 
294  if (ibp->is_mga && ! fta_check_mga_keywords(mol_info, gbb->GetKeywords())) {
295  return ret;
296  }
297 
298  if (ibp->is_tpa && ! fta_tpa_keywords_check(gbb->SetKeywords())) {
299  return ret;
300  }
301 
302  if (ibp->is_tsa && ! fta_tsa_keywords_check(gbb->SetKeywords(), pp->source)) {
303  return ret;
304  }
305 
306  if (ibp->is_tls && ! fta_tls_keywords_check(gbb->SetKeywords(), pp->source)) {
307  return ret;
308  }
309 
     // Each keyword is passed to fta_keywords_check together with the *_kwd flags.
310  for (const string& key : gbb->GetKeywords()) {
311  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
312  }
313 
314  if (ibp->env_sample_qual == false && env_kwd) {
315  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
316  return ret;
317  }
318 
319  bptr = XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION);
320  if (bptr) {
321  if_cds = XMLCheckCDS(entry, ibp->xip);
322  div = CheckDIV(bptr);
323  if (div != -1) {
     // Only the first three characters of the division value are kept.
324  string div_str(bptr, bptr + 3);
325  gbb->SetDiv(div_str);
326 
327  if (div == 16) /* "ORG" replaced by "UNA" */
328  gbb->SetDiv("UNA");
329 
330  /* preserve the division code for later use
331  */
332  const char* p_div = gbb->GetDiv().c_str();
333  StringCpy(ibp->division, p_div);
334 
335  if (ibp->psip.NotEmpty())
336  pat_ref = true;
337 
338  if (ibp->is_tpa &&
339  (StringEqu(p_div, "EST") || StringEqu(p_div, "GSS") ||
340  StringEqu(p_div, "PAT") || StringEqu(p_div, "HTG"))) {
341  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p_div);
     // NOTE(review): this and the following early returns inside the
     // `if (bptr)` block do not call MemFree(bptr), unlike the `drop` and
     // unknown-division paths further down - confirm whether this leak is
     // known/intended.
342  return ret;
343  }
344 
345  if (ibp->is_tsa && ! StringEqu(p_div, "TSA")) {
346  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p_div);
347  return ret;
348  }
349 
350  cancelled = IsCancelled(gbb->GetKeywords());
351 
352  if (StringEqu(p_div, "HTG")) {
353  if (! HasHtg(gbb->GetKeywords())) {
354  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
355  return ret;
356  }
357  }
358 
     // Keep a copy of the division, because SetDiv("") below may clear it
     // before CheckHTGDivision is called.
359  tempdiv = StringSave(gbb->GetDiv().c_str());
360 
361  if (fta_check_htg_kwds(gbb->SetKeywords(), pp->entrylist[pp->curindx], mol_info))
362  gbb->SetDiv("");
363 
364  XMLDefVsHTGKeywords(mol_info.GetTech(), entry, ibp->xip, cancelled);
365 
366  CheckHTGDivision(tempdiv, mol_info.GetTech());
367  if (tempdiv)
368  MemFree(tempdiv);
369 
     // Count how many special keyword categories are present (i).
370  i = 0;
371  if (est_kwd)
372  i++;
373  if (sts_kwd)
374  i++;
375  if (gss_kwd)
376  i++;
377  if (ibp->htg > 0)
378  i++;
379  if (htc_kwd)
380  i++;
381  if (fli_kwd)
382  i++;
383  if (wgs_kwd)
384  i++;
385  if (env_kwd)
386  i++;
387  if (mga_kwd) {
388  if (ibp->is_mga == false) {
389  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
390  return ret;
391  }
392  i++;
393  } else if (ibp->is_mga) {
394  ErrPostEx(SEV_REJECT, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
395  return ret;
396  }
397  if (tpa_kwd) {
398  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
399  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
400  return ret;
401  }
402  i++;
403  } else if (ibp->is_tpa) {
404  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
405  return ret;
406  }
407  if (tsa_kwd) {
408  if (ibp->is_tsa == false) {
409  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
410  return ret;
411  }
412  i++;
413  } else if (ibp->is_tsa) {
414  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
415  return ret;
416  }
417  if (tls_kwd) {
418  if (ibp->is_tls == false) {
419  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
420  return ret;
421  }
422  i++;
423  } else if (ibp->is_tls) {
424  ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
425  return ret;
426  }
     // More than one category: HTG+ENV only warns; ENV together with EST, GSS
     // or WGS (exactly two categories) is accepted; anything else is rejected.
427  if (i > 1) {
428  if (i == 2 && ibp->htg > 0 && env_kwd)
429  ErrPostEx(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
430  else if (i != 2 || env_kwd == false ||
431  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
432  ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
433  return ret;
434  }
435  }
436 
     // The WGS keyword is taken back out of the count before the CON check.
437  if (wgs_kwd)
438  i--;
439  if (ibp->is_contig && i > 0 &&
440  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
441  ErrPostEx(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
442  return ret;
443  }
444 
445  thtg = mol_info.GetTech();
446  if (thtg == CMolInfo::eTech_htgs_0 || thtg == CMolInfo::eTech_htgs_1 ||
447  thtg == CMolInfo::eTech_htgs_2 || thtg == CMolInfo::eTech_htgs_3) {
448  RemoveHtgPhase(gbb->SetKeywords());
449  }
450 
     // Warn when the concatenated keyword text contains "EST"/"STS" as a
     // substring although no official keyword of that kind was recognized.
451  kw = XMLConcatSubTags(entry, ibp->xip, INSDSEQ_KEYWORDS, ';');
452  if (kw) {
453  kwp = StringStr(kw, "EST");
454  if (kwp && est_kwd == false) {
455  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw);
456  }
457  kwp = StringStr(kw, "STS");
458  if (kwp && sts_kwd == false) {
459  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw);
460  }
461  MemFree(kw);
462  }
463 
464  if (! ibp->is_contig) {
465  drop = false;
466  CMolInfo::TTech tech = mol_info.GetTech();
467  string p_div = gbb->GetDiv();
468 
     // check_div receives tech by pointer and p_div/drop by name; their values
     // are written back to mol_info and gbb below.
469  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, p_div, &tech, ibp->bases, pp->source, drop);
470 
471  if (tech != CMolInfo::eTech_unknown)
472  mol_info.SetTech(tech);
473  else
474  mol_info.ResetTech();
475 
476  if (! p_div.empty())
477  gbb->SetDiv(p_div);
478  else
479  gbb->SetDiv("");
480 
481  if (drop) {
482  MemFree(bptr);
483  return ret;
484  }
485  } else if (gbb->GetDiv() == "CON") {
486  gbb->SetDiv("");
487  }
488  } else {
     // Unknown division: report its first three characters and reject.
489  MemCpy(msg, bptr, 3);
490  msg[3] = '\0';
491  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in GenBank flatfile. Record rejected.", msg);
492  MemFree(bptr);
493  return ret;
494  }
495 
496  if (IsNewAccessFormat(ibp->acnum) == 0 && *ibp->acnum == 'T' &&
497  gbb->GetDiv() != "EST") {
498  ErrPostStr(SEV_INFO, ERR_DIVISION_MappedtoEST, "Leading T in accession number.");
499  mol_info.SetTech(CMolInfo::eTech_est);
500 
501  gbb->SetDiv("");
502  }
503 
504  MemFree(bptr);
505  }
506 
     // The HTC division and the HTC keyword must appear together.
507  bool is_htc_div = gbb->GetDiv() == "HTC",
508  has_htc = HasHtc(gbb->GetKeywords());
509 
510  if (is_htc_div && ! has_htc) {
511  ErrPostEx(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
512  return ret;
513  }
514 
515  if (! is_htc_div && has_htc) {
516  ErrPostEx(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
517  return ret;
518  }
519 
520  if (is_htc_div) {
521  str = XMLFindTagValue(entry, ibp->xip, INSDSEQ_MOLTYPE);
522  if (str) {
     // Skip a leading 'm' or 'r', "pre-" or "transcribed " before testing
     // for "RNA".
523  p = str;
524  if (*str == 'm' || *str == 'r')
525  p = str + 1;
526  else if (StringEquN(str, "pre-", 4))
527  p = str + 4;
528  else if (StringEquN(str, "transcribed ", 12))
529  p = str + 12;
530 
531  if (! StringEquN(p, "RNA", 3)) {
532  ErrPostEx(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
533  MemFree(str);
534  return ret;
535  }
536  MemFree(str);
537  }
538  }
539 
     // NOTE(review): listing line 541 is absent from this copy, so the
     // statement controlled by this `if` appears to be missing; as shown the
     // `if` would govern the following `if` statement - confirm against the
     // repository.
540  if (fli_kwd)
542 
543  /* will be used in flat file database
544  */
545  if (! gbb->GetDiv().empty()) {
546  if (gbb->GetDiv() == "EST") {
547  ibp->EST = true;
548  mol_info.SetTech(CMolInfo::eTech_est);
549  gbb->SetDiv("");
550  } else if (gbb->GetDiv() == "STS") {
551  ibp->STS = true;
552  mol_info.SetTech(CMolInfo::eTech_sts);
553  gbb->SetDiv("");
554  } else if (gbb->GetDiv() == "GSS") {
555  ibp->GSS = true;
     // NOTE(review): listing line 556 is absent from this copy; the sibling
     // branches set a tech value at this position - confirm against the
     // repository.
557  gbb->SetDiv("");
558  } else if (gbb->GetDiv() == "HTC") {
559  ibp->HTC = true;
560  mol_info.SetTech(CMolInfo::eTech_htc);
561  gbb->SetDiv("");
562  } else if (gbb->GetDiv() == "SYN" && bio_src && bio_src->IsSetOrigin() &&
563  bio_src->GetOrigin() == 5) /* synthetic */
564  {
565  gbb->SetDiv("");
566  }
567  } else if (mol_info.IsSetTech()) {
     // No division left: derive the index flags from the tech value instead.
568  if (mol_info.GetTech() == CMolInfo::eTech_est)
569  ibp->EST = true;
570  if (mol_info.GetTech() == CMolInfo::eTech_sts)
571  ibp->STS = true;
572  if (mol_info.GetTech() == CMolInfo::eTech_survey)
573  ibp->GSS = true;
574  if (mol_info.GetTech() == CMolInfo::eTech_htc)
575  ibp->HTC = true;
576  }
577 
578  if (mol_info.IsSetTech())
579  fta_remove_keywords(mol_info.GetTech(), gbb->SetKeywords());
580 
581  if (ibp->is_tpa)
582  fta_remove_tpa_keywords(gbb->SetKeywords());
583 
584  if (ibp->is_tsa)
585  fta_remove_tsa_keywords(gbb->SetKeywords(), pp->source);
586 
587  if (ibp->is_tls)
588  fta_remove_tls_keywords(gbb->SetKeywords(), pp->source);
589 
     // ENV keywords are removed once any environmental_sample subsource is seen.
590  if (bio_src && bio_src->IsSetSubtype()) {
591  for (const auto& subtype : bio_src->GetSubtype()) {
592  if (subtype->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
593  fta_remove_env_keywords(gbb->SetKeywords());
594  break;
595  }
596  }
597  }
598 
599  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, gbb->SetExtra_accessions());
600 
     // Drop the GB-block division when it equals the OrgName division.
601  if (gbb->IsSetDiv() &&
602  bio_src &&
603  bio_src->IsSetOrg() &&
604  bio_src->GetOrg().IsSetOrgname() &&
605  bio_src->GetOrg().GetOrgname().IsSetDiv() &&
606  bio_src->GetOrg().GetOrgname().GetDiv() == gbb->GetDiv()) {
607  gbb->ResetDiv();
608  }
609 
610  return gbb;
611 }
612 
613 /**********************************************************/
615 {
616  IndexblkPtr ibp;
617 
618  char* div;
619  char* molstr;
620 
621  ibp = pp->entrylist[pp->curindx];
622 
623  CRef<CMolInfo> mol_info(new CMolInfo);
624 
625  molstr = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_MOLTYPE);
626  div = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_DIVISION);
627 
628  if (StringEquN(div, "EST", 3))
629  mol_info->SetTech(CMolInfo::eTech_est);
630  else if (StringEquN(div, "STS", 3))
631  mol_info->SetTech(CMolInfo::eTech_sts);
632  else if (StringEquN(div, "GSS", 3))
633  mol_info->SetTech(CMolInfo::eTech_survey);
634  else if (StringEquN(div, "HTG", 3))
635  mol_info->SetTech(CMolInfo::eTech_htgs_1);
636  else if (ibp->is_wgs) {
637  if (ibp->is_tsa)
638  mol_info->SetTech(CMolInfo::eTech_tsa);
639  else if (ibp->is_tls)
640  mol_info->SetTech(CMolInfo::eTech_targeted);
641  else
642  mol_info->SetTech(CMolInfo::eTech_wgs);
643  } else if (ibp->is_tsa)
644  mol_info->SetTech(CMolInfo::eTech_tsa);
645  else if (ibp->is_tls)
646  mol_info->SetTech(CMolInfo::eTech_targeted);
647 
648  MemFree(div);
649  GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), molstr, pp, *entry, org_ref);
650  if (mol_info->GetBiomol() == CMolInfo::eBiomol_unknown) // not set
651  mol_info->ResetBiomol();
652 
653  if (molstr)
654  MemFree(molstr);
655 
656  return mol_info;
657 }
658 
659 /**********************************************************/
660 static void XMLFakeBioSources(XmlIndexPtr xip, const char* entry, CBioseq& bioseq, Parser::ESource source)
661 {
662  char* organism = nullptr;
663  char* taxonomy = nullptr;
664 
665  char* p;
666  char* q;
667 
668  for (; xip; xip = xip->next) {
669  if (xip->tag == INSDSEQ_ORGANISM && ! organism)
670  organism = XMLGetTagValue(entry, xip);
671  else if (xip->tag == INSDSEQ_TAXONOMY && ! taxonomy)
672  taxonomy = XMLGetTagValue(entry, xip);
673  }
674 
675  if (! organism) {
676  ErrPostStr(SEV_WARNING, ERR_ORGANISM_NoOrganism, "No <INSDSeq_organism> data in XML format file.");
677  if (taxonomy)
678  MemFree(taxonomy);
679  return;
680  }
681 
682  CRef<CBioSource> bio_src(new CBioSource);
683 
684  p = organism;
685  if (GetGenomeInfo(*bio_src, p) && bio_src->GetGenome() != 9) /* ! Plasmid */
686  {
687  while (*p != ' ' && *p != '\0')
688  p++;
689  while (*p == ' ')
690  p++;
691  }
692 
693  COrg_ref& org_ref = bio_src->SetOrg();
694 
695  if (source == Parser::ESource::EMBL) {
696  q = StringChr(p, '(');
697  if (q && q > p) {
698  for (q--; *q == ' ' || *q == '\t'; q--)
699  if (q == p)
700  break;
701  if (*q != ' ' && *q != '\t')
702  q++;
703  if (q > p) {
704  *q = '\0';
705  org_ref.SetCommon(p);
706  }
707  }
708  }
709 
710  org_ref.SetTaxname(p);
711  MemFree(organism);
712 
713  if (org_ref.GetTaxname() == "Unknown.") {
714  string& taxname = org_ref.SetTaxname();
715  taxname = taxname.substr(0, taxname.size() - 1);
716  }
717 
718  if (taxonomy) {
719  org_ref.SetOrgname().SetLineage(taxonomy);
720  }
721 
722  CRef<CSeqdesc> descr(new CSeqdesc);
723  descr->SetSource(*bio_src);
724  bioseq.SetDescr().Set().push_back(descr);
725 }
726 
727 /**********************************************************/
728 static void XMLGetDescrComment(char* offset)
729 {
730  char* p;
731  char* q;
732 
733  for (p = offset; *p == '\n' || *p == ' ';)
734  p++;
735  if (p > offset)
736  fta_StringCpy(offset, p);
737 
738  for (p = offset, q = offset; *p != '\0';) {
739  if (*p != '\n') {
740  *q++ = *p++;
741  continue;
742  }
743 
744  *q++ = '~';
745  for (p++; *p == ' ';)
746  p++;
747  }
748  *q = '\0';
749 
750  for (p = offset;;) {
751  p = StringStr(p, "; ");
752  if (! p)
753  break;
754  for (p += 2, q = p; *q == ' ';)
755  q++;
756  if (q > p)
757  fta_StringCpy(p, q);
758  }
759 
760  for (p = offset; *p == ' ';)
761  p++;
762  if (p > offset)
763  fta_StringCpy(offset, p);
764  for (p = offset; *p != '\0';)
765  p++;
766 
767  if (p > offset) {
768  for (p--;; p--) {
769  if (*p == ' ' || *p == '\t' || *p == ';' || *p == ',' ||
770  *p == '.' || *p == '~') {
771  if (p > offset)
772  continue;
773  *p = '\0';
774  }
775  break;
776  }
777  if (*p != '\0') {
778  p++;
779  if (StringEquN(p, "...", 3))
780  p[3] = '\0';
781  else if (StringChr(p, '.')) {
782  *p = '.';
783  p[1] = '\0';
784  } else
785  *p = '\0';
786  }
787  }
788 }
789 
790 /**********************************************************/
// XMLGetDescr: fills the descriptor list of `bioseq` for the current entry
// (pp->entrylist[pp->curindx]): title, publications, EMBL or GenBank block,
// MolInfo, TPA/TSA block, structured and plain comments, and update/create
// dates. Whenever a check fails it sets ibp->drop and returns early.
// NOTE(review): several listing lines (828, 914, 986, 1021, 1040) are absent
// from this copy; the assignments to `str`, `dbp` and `offset` that the code
// below relies on are not visible here - confirm against the repository.
791 static void XMLGetDescr(ParserPtr pp, DataBlkPtr entry, CBioseq& bioseq)
792 {
793  IndexblkPtr ibp;
794 
795  DataBlkPtr dbp;
796  DataBlkPtr dbpnext;
797 
798  char* crdate;
799  char* update;
800  char* offset;
801  char* str;
802  char* p;
803  char* q;
804  string gbdiv;
805 
806  ibp = pp->entrylist[pp->curindx];
807 
808  CBioSource* bio_src = nullptr;
809  COrg_ref* org_ref = nullptr;
810 
811  /* ORGANISM
812  */
     // Pick up the first source descriptor already present on the Bioseq.
813  for (auto& descr : bioseq.SetDescr().Set()) {
814  if (descr->IsSource()) {
815  bio_src = &(descr->SetSource());
816  if (bio_src->IsSetOrg())
817  org_ref = &bio_src->SetOrg();
818  break;
819  }
820  }
821 
822  /* MolInfo from LOCUS line
823  */
824  CRef<CMolInfo> mol_info = XMLGetMolInfo(pp, entry, org_ref);
825 
826  /* DEFINITION data ==> descr_title
827  */
     // NOTE(review): listing line 828 is absent; `str` is presumably assigned
     // there - as shown it is read without being set.
829  string title;
830 
831  if (str) {
     // Drop leading blanks from the definition text.
832  for (p = str; *p == ' ';)
833  p++;
834  if (p > str)
835  fta_StringCpy(str, p);
     // In xml_comp mode (non-EMBL) make sure the title ends with a period.
836  if (pp->xml_comp && pp->source != Parser::ESource::EMBL) {
837  p = StringRChr(str, '.');
838  if (! p || p[1] != '\0') {
839  string s = str;
840  s += '.';
841  MemFree(str);
842  str = StringSave(s.c_str());
843  p = nullptr;
844  }
845  }
846 
847  title = str;
848  MemFree(str);
849  str = nullptr;
850 
851  CRef<CSeqdesc> descr(new CSeqdesc);
852  descr->SetTitle(title);
853  bioseq.SetDescr().Set().push_back(descr);
854 
     // A "TPA:", "TSA:" or "TLS:" prefix on a record not flagged as such
     // drops the entry.
855  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL &&
856  StringEquN(title.c_str(), "TPA:", 4)) {
857  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA:\" prefix is present on its definition line. Entry dropped.");
858  ibp->drop = true;
859  return;
860  }
861 
862  if (ibp->is_tsa == false && StringEquN(title.c_str(), "TSA:", 4)) {
863  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA:\" prefix is present on its definition line. Entry dropped.");
864  ibp->drop = true;
865  return;
866  }
867 
868  if (ibp->is_tls == false && StringEquN(title.c_str(), "TLS:", 4)) {
869  ErrPostEx(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS:\" prefix is present on its definition line. Entry dropped.");
870  ibp->drop = true;
871  return;
872  }
873  }
874 
     // Conversely, flagged records must carry the matching title prefix.
875  if (ibp->is_tpa &&
876  (title.empty() || ! StringEquN(title.c_str(), "TPA:", 4))) {
877  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA:\" prefix on its definition line. Entry dropped.");
878  ibp->drop = true;
879  return;
880  }
881 
882  if (ibp->is_tsa &&
883  (title.empty() || ! StringEquN(title.c_str(), "TSA:", 4))) {
884  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA:\" prefix on its definition line. Entry dropped.");
885  ibp->drop = true;
886  return;
887  }
888 
889  if (ibp->is_tls &&
890  (title.empty() || ! StringEquN(title.c_str(), "TLS:", 4))) {
891  ErrPostEx(SEV_REJECT, ERR_DEFINITION_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS:\" prefix on its definition line. Entry dropped.");
892  ibp->drop = true;
893  return;
894  }
895 
896  /* REFERENCE
897  */
898  /* pub should be before GBblock because we need patent ref
899  */
900  dbp = XMLBuildRefDataBlk(entry->mOffset, ibp->xip, ParFlat_REF_END);
     // Each reference block becomes a pub descriptor and is then deleted;
     // the next pointer is saved first because the block is destroyed.
901  for (; dbp; dbp = dbpnext) {
902  dbpnext = dbp->mpNext;
903 
904  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, 0);
905  if (pubdesc.NotEmpty()) {
906  CRef<CSeqdesc> descr(new CSeqdesc);
907  descr->SetPub(*pubdesc);
908  bioseq.SetDescr().Set().push_back(descr);
909  }
910 
911  dbp->SimpleDelete();
912  }
913 
     // NOTE(review): listing line 914 is absent; `dbp` is presumably
     // re-assigned there - as shown this second loop starts with dbp == null
     // and never runs.
915  for (; dbp; dbp = dbpnext) {
916  dbpnext = dbp->mpNext;
917 
918  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, 0);
919  if (pubdesc.NotEmpty()) {
920  CRef<CSeqdesc> descr(new CSeqdesc);
921  descr->SetPub(*pubdesc);
922  bioseq.SetDescr().Set().push_back(descr);
923  }
924 
925  dbp->SimpleDelete();
926  }
927 
928  TStringList dr_ena,
929  dr_biosample;
930 
931  CRef<CEMBL_block> embl;
932  CRef<CGB_block> gbb;
933 
     // EMBL sources get an EMBL block, all others a GenBank block.
934  if (pp->source == Parser::ESource::EMBL)
935  embl = XMLGetEMBLBlock(pp, entry->mOffset, *mol_info, gbdiv, bio_src, dr_ena, dr_biosample);
936  else
937  gbb = XMLGetGBBlock(pp, entry->mOffset, *mol_info, bio_src);
938 
939  CRef<CUser_object> dbuop;
940  if (! dr_ena.empty() || ! dr_biosample.empty())
941  fta_build_ena_user_object(bioseq.SetDescr().Set(), dr_ena, dr_biosample, dbuop);
942 
943  if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
944  CRef<CSeqdesc> descr(new CSeqdesc);
945  descr->SetMolinfo(*mol_info);
946  bioseq.SetDescr().Set().push_back(descr);
947  }
948 
     // An empty block from either builder means the entry is dropped.
949  if (pp->source == Parser::ESource::EMBL) {
950  if (embl.Empty()) {
951  ibp->drop = true;
952  return;
953  }
954  } else if (gbb.Empty()) {
955  ibp->drop = true;
956  return;
957  }
958 
     // The two calls in each branch differ only in the boolean argument,
     // which is true when the division starts with "CON" (case-insensitive).
959  if (pp->source == Parser::ESource::EMBL) {
960  if (StringEquNI(ibp->division, "CON", 3))
961  fta_add_hist(pp, bioseq, embl->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, true, ibp->acnum);
962  else
963  fta_add_hist(pp, bioseq, embl->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, false, ibp->acnum);
964 
965  if (embl->GetExtra_acc().empty())
966  embl->ResetExtra_acc();
967  } else {
968  if (StringEquNI(ibp->division, "CON", 3))
969  fta_add_hist(pp, bioseq, gbb->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, true, ibp->acnum);
970  else
971  fta_add_hist(pp, bioseq, gbb->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, false, ibp->acnum);
972  }
973 
974  if (pp->source == Parser::ESource::EMBL) {
     // For EMBL, a GenBank block is created only to carry gbdiv.
975  if (! gbdiv.empty()) {
976  gbb.Reset(new CGB_block);
977  gbb->SetDiv(gbdiv);
978  gbdiv.clear();
979  }
980 
981  CRef<CSeqdesc> descr(new CSeqdesc);
982  descr->SetEmbl(*embl);
983  bioseq.SetDescr().Set().push_back(descr);
984  }
985 
     // NOTE(review): listing line 986 is absent; `offset` is presumably
     // assigned there - as shown it is read without being set.
987  if (! offset && ibp->is_tpa && ibp->is_wgs == false) {
988  if (ibp->inferential || ibp->experimental) {
989  if (! fta_dblink_has_sra(dbuop)) {
990  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA:%s record lacks both AH/PRIMARY linetype and Sequence Read Archive links. Entry dropped.", (ibp->inferential == false) ? "experimental" : "inferential");
991  ibp->drop = true;
992  return;
993  }
994  } else if (ibp->specialist_db == false) {
995  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA record lacks required AH/PRIMARY linetype. Entry dropped.");
996  ibp->drop = true;
997  return;
998  }
999  }
1000 
1001  if (offset) {
1002  if (! fta_parse_tpa_tsa_block(bioseq, offset, ibp->acnum, ibp->vernum, 10, 0, ibp->is_tpa)) {
1003  ibp->drop = true;
1004  MemFree(offset);
1005  return;
1006  }
1007  MemFree(offset);
1008  }
1009 
1010  if (gbb.NotEmpty()) {
1011  if (pp->taxserver == 1 && gbb->IsSetDiv())
1012  fta_fix_orgref_div(bioseq.SetAnnot(), org_ref, *gbb);
1013 
1014  CRef<CSeqdesc> descr(new CSeqdesc);
1015  descr->SetGenbank(*gbb);
1016  bioseq.SetDescr().Set().push_back(descr);
1017  }
1018 
1019  /* COMMENT data
1020  */
     // NOTE(review): listing line 1021 is absent; `offset` is presumably
     // re-assigned there - as shown it still holds the pointer freed above.
1022  if (offset) {
1023  bool bad = false;
1024  TUserObjVector user_objs;
1025 
1026  fta_parse_structured_comment(offset, bad, user_objs);
1027 
1028  if (bad) {
1029  ibp->drop = true;
1030  MemFree(offset);
1031  return;
1032  }
1033 
1034  for (auto& user_obj : user_objs) {
1035  CRef<CSeqdesc> descr(new CSeqdesc);
1036  descr->SetUser(*user_obj);
1037  bioseq.SetDescr().Set().push_back(descr);
1038  }
1039 
     // NOTE(review): listing line 1040 is absent from this copy - confirm
     // against the repository what happens to `offset` here.
1041  if (pp->xml_comp) {
     // In place: turn ';' followed by ' ' or '~' into a blank, then
     // collapse every run of blanks and '~' into a single blank.
1042  for (q = offset, p = q; *p != '\0';) {
1043  if (*p == ';' && (p[1] == ' ' || p[1] == '~'))
1044  *p = ' ';
1045  if (*p == '~' || *p == ' ') {
1046  *q++ = ' ';
1047  for (p++; *p == ' ' || *p == '~';)
1048  p++;
1049  } else
1050  *q++ = *p++;
1051  }
1052  *q = '\0';
1053  }
1054 
1055  if (offset[0] != 0) {
1056  CRef<CSeqdesc> descr(new CSeqdesc);
1057  descr->SetComment(offset);
1058  bioseq.SetDescr().Set().push_back(descr);
1059  }
1060  MemFree(offset);
1061  }
1062 
1063  /* DATE
1064  */
1065  if (pp->no_date) /* -N in command line means no date */
1066  return;
1067 
1068  CRef<CDate_std> std_upd_date,
1069  std_cre_date;
1070 
1071  if (pp->date) /* -L in command line means replace
1072  date */
1073  {
     // Both dates are set to the current time; no tag text is read.
1074  CTime cur_time(CTime::eCurrent);
1075 
1076  std_upd_date.Reset(new CDate_std);
1077  std_upd_date->SetToTime(cur_time);
1078 
1079  std_cre_date.Reset(new CDate_std);
1080  std_cre_date->SetToTime(cur_time);
1081 
1082  update = nullptr;
1083  crdate = nullptr;
1084  } else {
1085  update = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_UPDATE_DATE);
1086  if (update)
1087  std_upd_date = GetUpdateDate(update, pp->source);
1088 
1089  crdate = XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_CREATE_DATE);
1090  if (crdate)
1091  std_cre_date = GetUpdateDate(crdate, pp->source);
1092  }
1093 
1094  if (std_upd_date.NotEmpty()) {
1095  CRef<CSeqdesc> descr(new CSeqdesc);
1096  descr->SetUpdate_date().SetStd(*std_upd_date);
1097  bioseq.SetDescr().Set().push_back(descr);
1098 
     // Report a create-date that compares as later than the update-date.
1099  if (std_cre_date.NotEmpty() && std_cre_date->Compare(*std_upd_date) == CDate::eCompare_after) {
1100  ErrPostEx(SEV_ERROR, ERR_DATE_IllegalDate, "Update-date \"%s\" precedes create-date \"%s\".", update, crdate);
1101  }
1102  }
1103 
1104  if (std_cre_date.NotEmpty()) {
     // The create-date descriptor is skipped in xml_comp mode unless the
     // source is EMBL.
1105  if (pp->xml_comp == false || pp->source == Parser::ESource::EMBL) {
1106  CRef<CSeqdesc> descr(new CSeqdesc);
1107  descr->SetCreate_date().SetStd(*std_cre_date);
1108  bioseq.SetDescr().Set().push_back(descr);
1109  }
1110  }
1111 
1112  if (update)
1113  MemFree(update);
1114  if (crdate)
1115  MemFree(crdate);
1116 }
1117 
1118 /**********************************************************/
1119 static void XMLGetDivision(const char* entry, IndexblkPtr ibp)
1120 {
1121  char* div;
1122 
1123  if (! ibp || ! entry)
1124  return;
1125 
1126  div = XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION);
1127  if (! div)
1128  return;
1129  div[3] = '\0';
1130  StringCpy(ibp->division, div);
1131  MemFree(div);
1132 }
1133 
1134 /**********************************************************/
1136 {
      /* Per-entry driver: walks pp->entrylist[0 .. pp->indx), builds a Bioseq
       * for each entry (instance data, features, descriptors, quality
       * scores), and - once the last member of a set has been reached
       * (segnum == segtotal) - post-processes the collected seq_entries and
       * splices them into pp->entries.
       *
       * Returns false as soon as XMLLoadEntry() fails for an entry, true
       * otherwise (after posting the SUCCEEDED/LONG/SKIPPED summary).
       *
       * Every early "continue" below releases what the iteration allocated
       * so far: the entry text (MemFree) and, once created, dbp (delete).
       *
       * NOTE(review): the declaration line of this function is not part of
       * this listing - confirm its name and signature against the full file.
       */
1137  Int4 i;
1138  Int4 imax;
1139  Int4 j;
1140  Int4 segindx; /* index of the first member of the current set */
1141  Int4 total = 0; /* entries appended to pp->entries */
1142  Int4 total_long = 0; /* entries withheld because of pp->limit */
1143  Int4 total_dropped = 0; /* entries reported as "Entry skipped" */
1144  char* div;
1145  char* entry;
1146  EntryBlkPtr ebp;
1147 
1148  TEntryList seq_entries; /* entries of the set being assembled */
1149 
1150  CSeq_loc locs;
1151 
1152  bool seq_long = false; /* current entry/set exceeded pp->limit */
1153  IndexblkPtr ibp;
1154  IndexblkPtr tibp;
1155  DataBlkPtr dbp;
1156 
1157  /* set up sequence alphabets
1158  */
1159  auto dnaconv = GetDNAConv();
1160  auto protconv = GetProteinConv();
1161 
1162  segindx = -1;
1163 
1164  for (imax = pp->indx, i = 0; i < imax; i++) {
1165  pp->curindx = i;
1166  ibp = pp->entrylist[i];
1167 
      /* presumably makes ibp the context of subsequent error messages -
       * confirm against err_install() */
1168  err_install(ibp, pp->accver);
1169 
      /* first member of a segmented set: remember where the set starts */
1170  if (ibp->segnum == 1)
1171  segindx = i;
1172 
      /* stand-alone entry (segnum == 0) already marked as dropped:
       * skip it before loading anything */
1173  if (ibp->drop && ibp->segnum == 0) {
1174  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1175  total_dropped++;
1176  continue;
1177  }
1178 
      /* a load failure aborts the whole run, not just this entry */
1179  entry = XMLLoadEntry(pp, false);
1180  if (! entry) {
1182  return false;
1183  }
1184 
      /* fills ibp->division from the entry's division tag */
1185  XMLGetDivision(entry, ibp);
1186 
      /* a "TSA" division marks the record as TSA; warn when that is
       * unexpected for this accession (tsa_allowed == false) */
1187  if (StringEqu(ibp->division, "TSA")) {
1188  if (ibp->tsa_allowed == false)
1189  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
1190  ibp->is_tsa = true;
1191  }
1192 
1193  XMLCheckContigEverywhere(ibp, pp->source);
1194  if (ibp->drop && ibp->segnum == 0) {
1195  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1196  MemFree(entry);
1197  total_dropped++;
1198  continue;
1199  }
1200 
      /* NOTE(review): ebp is never deleted directly in this function;
       * presumably it is released through dbp (dbp->mpData) - confirm
       * against the DataBlk destructor */
1201  ebp = new EntryBlk();
1202 
      /* new Bioseq wrapped in a Seq-entry and registered with the scope */
1203  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
1204  ebp->seq_entry.Reset(new CSeq_entry);
1205  ebp->seq_entry->SetSeq(*bioseq);
1206  GetScope().AddBioseq(*bioseq);
1207 
      /* dbp bundles the entry block with the entry text and its length */
1208  dbp = new DataBlk();
1209  dbp->mpData = ebp;
1210  dbp->mOffset = entry;
1211  dbp->len = StringLen(entry);
1212 
      /* instance data, using the protein or DNA conversion table depending
       * on ibp->is_prot; on failure a stand-alone entry is skipped right
       * away, while a member of a set only gets its drop flag set and the
       * set is rejected later as a whole */
1213  if (! XMLGetInst(pp, dbp, ibp->is_prot ? protconv.get() : dnaconv.get(), *bioseq)) {
1214  ibp->drop = true;
1215  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data. Entry dropped.");
1216  if (ibp->segnum == 0) {
1217  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1218  delete dbp;
1219  MemFree(entry);
1220  total_dropped++;
1221  continue;
1222  }
1223  }
1224 
1225  XMLFakeBioSources(ibp->xip, dbp->mOffset, *bioseq, pp->source);
1226  LoadFeat(pp, *dbp, *bioseq);
1227 
      /* dropped stand-alone entry that ended up without annotations */
1228  if (! bioseq->IsSetAnnot() && ibp->drop && ibp->segnum == 0) {
1229  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1230  delete dbp;
1231  MemFree(entry);
1232  total_dropped++;
1233  continue;
1234  }
1235 
      /* descriptors; XMLGetDescr() may set ibp->drop itself */
1236  XMLGetDescr(pp, dbp, *bioseq);
1237 
1238  if (ibp->drop && ibp->segnum == 0) {
1239  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1240  delete dbp;
1241  MemFree(entry);
1242  total_dropped++;
1243  continue;
1244  }
1245 
1246  fta_set_molinfo_completeness(*bioseq, ibp);
1247 
      /* second argument selects the TSA (true) or TLS (false) variant */
1248  if (ibp->is_tsa)
1249  fta_tsa_tls_comment_dblink_check(*bioseq, true);
1250 
1251  if (ibp->is_tls)
1252  fta_tsa_tls_comment_dblink_check(*bioseq, false);
1253 
      /* nucleotide only: raw sequences go through GapsToDelta() when gap
       * features exist, otherwise through SeqToDelta() for htg 4/1/2 or
       * DDBJ patent records; non-raw sequences with gaps go through
       * AssemblyGapsToDelta() */
1254  if (bioseq->GetInst().IsNa()) {
1255  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
1256  if (ibp->gaps)
1257  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1258  else if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
1259  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
1260  SeqToDelta(*bioseq, ibp->htg);
1261  } else if (ibp->gaps)
1262  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1263  }
1264 
      /* a missing date drops the entry, except in debug, no_date and
       * xml_comp modes and for USPTO input */
1265  if (no_date(pp->format, bioseq->GetDescr().Get()) &&
1266  pp->debug == false && pp->no_date == false &&
1267  pp->xml_comp == false && pp->source != Parser::ESource::USPTO) {
1268  ibp->drop = true;
1269  ErrPostStr(SEV_ERROR, ERR_DATE_IllegalDate, "Illegal create date. Entry dropped.");
1270  if (ibp->segnum == 0) {
1271  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1272  delete dbp;
1273  MemFree(entry);
1274  total_dropped++;
1275  continue;
1276  }
1277  }
1278 
      /* quality scores, if not set yet: first available source wins -
       * ff_get_qscore callback, ff_get_qscore_pp callback, then the
       * quality score file */
1279  if (dbp->mpQscore.empty() && pp->accver) {
1280  if (pp->ff_get_qscore)
1281  dbp->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
1282  else if (pp->ff_get_qscore_pp)
1283  dbp->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
1284  else if (pp->qsfd && ibp->qslength > 0)
1285  dbp->mpQscore = GetQSFromFile(pp->qsfd, ibp);
1286  }
1287 
      /* unparsable quality scores drop the entry unless pp->ign_bad_qs
       * is set, in which case only the error is reported */
1288  if (! QscoreToSeqAnnot(dbp->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, true)) {
1289  if (pp->ign_bad_qs == false) {
1290  ibp->drop = true;
1291  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore. Entry dropped.");
1292  if (ibp->segnum == 0) {
1293  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1294  delete dbp;
1295  MemFree(entry);
1296  total_dropped++;
1297  continue;
1298  }
1299  } else {
1300  ErrPostEx(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore.");
1301  }
1302  }
1303 
1304  dbp->mpQscore.clear();
1305 
      /* add the patent Seq-id kept in ibp->psip, then release it */
1306  if (ibp->psip.NotEmpty()) {
1307  CRef<CSeq_id> id(new CSeq_id);
1308  id->SetPatent(*ibp->psip);
1309  bioseq->SetId().push_back(id);
1310  ibp->psip.Reset();
1311  }
1312 
1313  /* outside debug mode an entry without references is dropped;
      * FlyBase, RefSeq NW_ and WGS records only get an error message
1314  */
1315  if (no_reference(*bioseq) && ! pp->debug) {
1316  if (pp->source == Parser::ESource::Flybase) {
1317  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for entry from FlyBase. Continue anyway.");
1318  } else if (pp->source == Parser::ESource::Refseq &&
1319  StringEquN(ibp->acnum, "NW_", 3)) {
1320  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for RefSeq's NW_ entry. Continue anyway.");
1321  } else if (ibp->is_wgs) {
1322  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for WGS entry. Continue anyway.");
1323  } else {
1324  ibp->drop = true;
1325  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references. Entry dropped.");
1326  if (ibp->segnum == 0) {
1327  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1328  delete dbp;
1329  MemFree(entry);
1330  total_dropped++;
1331  continue;
1332  }
1333  }
1334  }
1335 
      /* last member of a set - or, presumably, a stand-alone entry with
       * segnum == segtotal == 0: finish the set; any other member is
       * only accumulated (else branch at the bottom) */
1336  if (ibp->segnum == ibp->segtotal) {
1337  seq_entries.push_back(ebp->seq_entry);
1338  ebp->seq_entry.Reset();
1339 
1340  if (ibp->segnum < 2) {
1341  if (ibp->segnum != 0) {
1342  ErrPostEx(SEV_WARNING, ERR_SEGMENT_OnlyOneMember, "Segmented set contains only one member.");
1343  }
1344  segindx = i;
1345  } else {
1346  GetSeqExt(pp, locs);
1347  // LCOV_EXCL_START
1348  // Excluded per Mark's request on 12/14/2016
1349  BuildBioSegHeader(pp, seq_entries, locs);
1350  // LCOV_EXCL_STOP
1351  }
1352 
1353  /* reject the whole set if any one entry was rejected
1354  */
1355  if (ibp->segnum != 0) {
1356  div = pp->entrylist[segindx]->division;
1357  for (j = segindx; j <= i; j++) {
1358  tibp = pp->entrylist[j];
1359  err_install(tibp, pp->accver);
1360  if (! StringEqu(div, tibp->division)) {
1361  ErrPostEx(SEV_WARNING, ERR_DIVISION_Mismatch, "Division different in segmented set: %s: %s", div, tibp->division);
1362  }
1363  if (tibp->drop) {
1364  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set");
1365  break;
1366  }
1367  }
      /* j <= i means the scan above stopped early at a dropped member:
       * report every member of the set as skipped */
1368  if (j <= i) {
1369  for (j = segindx; j <= i; j++) {
1370  tibp = pp->entrylist[j];
1371  err_install(tibp, pp->accver);
1372  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1373  total_dropped++;
1374  }
1375 
1376  seq_entries.clear();
1377 
1378  delete dbp;
1379  MemFree(entry);
1380  GetScope().ResetHistory();
1381  continue;
1382  }
1383  }
1384 
      /* USPTO input goes through ProcNucProt() with gene references
       * marked invalid; every other source through DealWithGenes() */
1385  if (pp->source == Parser::ESource::USPTO) {
1386  GeneRefFeats gene_refs;
1387  gene_refs.valid = false;
1388  ProcNucProt(pp, seq_entries, gene_refs);
1389  } else
1390  DealWithGenes(seq_entries, pp);
1391 
      /* the step above left nothing to output: count the entry, or every
       * member of the set, as skipped */
1392  if (seq_entries.empty()) {
1393  if (ibp->segnum != 0) {
1394  ErrPostEx(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set.");
1395  for (j = segindx; j <= i; j++) {
1396  tibp = pp->entrylist[j];
1397  err_install(tibp, pp->accver);
1398  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1399  total_dropped++;
1400  }
1401  } else {
1402  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1403  total_dropped++;
1404  }
1405  delete dbp;
1406  MemFree(entry);
1407  GetScope().ResetHistory();
1408  continue;
1409  }
1410 
1411  /* remove out all the features if their seqloc has
1412  * "join" or "order" among other segments, to the annot
1413  * which in class = parts
1414  */
      /* the if below has no braces: the CheckFeatSeqLoc() call following
       * the two LCOV comment lines is its body */
1415  if (ibp->segnum != 0)
1416  // LCOV_EXCL_START
1417  // Excluded per Mark's request on 12/14/2016
1418  CheckFeatSeqLoc(seq_entries);
1419  // LCOV_EXCL_STOP
1420 
1421  fta_find_pub_explore(pp, seq_entries);
1422 
1423  /* change qual "citation' on features to SeqFeat.cit
1424  * find citation in the list by serial_number.
1425  * If serial number not found remove /citation
1426  */
1427  ProcessCitations(seq_entries);
1428 
1429  /* check for long sequences in each segment
1430  */
      /* pp->limit == 0 disables the check; records with htg 1, 2 or 4
       * only get a warning, any other over-long record sets seq_long */
1431  if (pp->limit != 0) {
1432  if (ibp->segnum != 0) {
1433  for (j = segindx; j <= i; j++) {
1434  tibp = pp->entrylist[j];
1435  err_install(tibp, pp->accver);
1436  if (tibp->bases <= (size_t)pp->limit)
1437  continue;
1438 
1439  if (tibp->htg == 1 || tibp->htg == 2 || tibp->htg == 4) {
1440  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", tibp->locusname, tibp->acnum, pp->limit);
1441  } else {
1442  seq_long = true;
1443  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", tibp->locusname, tibp->acnum, pp->limit);
1444  }
1445  }
1446  } else if (ibp->bases > (size_t)pp->limit) {
1447  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4) {
1448  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
1449  } else {
1450  seq_long = true;
1451  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", ibp->locusname, ibp->acnum, pp->limit);
1452  }
1453  }
1454  }
1455 
      /* cleanup runs only when converting and pp->cleanup <= 1; in QA
       * mode the cleanup user object is removed from the first entry */
1456  if (pp->convert) {
1457  if (pp->cleanup <= 1) {
1458  FinalCleanup(seq_entries);
1459 
1460  if (pp->qamode && ! seq_entries.empty())
1461  fta_remove_cleanup_user_object(*(*seq_entries.begin()));
1462  }
1463 
1464  MaybeCutGbblockSource(seq_entries);
1465  }
1466 
1467  EntryCheckDivCode(seq_entries, pp);
1468 
1469  if (pp->xml_comp)
1470  fta_set_strandedness(seq_entries);
1471 
1472  if (fta_EntryCheckGBBlock(seq_entries)) {
1473  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty, "Attention: GBBlock is not empty");
1474  }
1475 
1476  /* QA mode only: sort descriptors and feature citations
1477  */
1478  if (pp->qamode) {
1479  fta_sort_descr(seq_entries);
1480  fta_sort_seqfeat_cit(seq_entries);
1481  }
1482 
1483  if (pp->citat) {
1484  StripSerialNumbers(seq_entries);
1485  }
1486 
1487  PackEntries(seq_entries);
1488  CheckDupDates(seq_entries);
1489 
1490  if (ibp->segnum != 0)
1491  for (j = segindx; j <= i; j++)
1492  err_install(pp->entrylist[j], pp->accver);
1493 
      /* over-long entries are counted in total_long and are NOT moved to
       * pp->entries; everything else is spliced in and counted in total.
       * NOTE(review): the summary message at the end words LONG as being
       * included in SUCCEEDED, yet total is not incremented on the long
       * path - confirm which one is intended */
1494  if (seq_long) {
1495  seq_long = false;
1496  if (ibp->segnum != 0)
1497  total_long += (i - segindx + 1);
1498  else
1499  total_long++;
1500  } else {
1501  pp->entries.splice(pp->entries.end(), seq_entries);
1502 
1503  if (ibp->segnum != 0)
1504  total += (i - segindx + 1);
1505  else
1506  total++;
1507  }
1508 
1509  if (ibp->segnum != 0) {
1510  for (j = segindx; j <= i; j++) {
1511  tibp = pp->entrylist[j];
1512  err_install(tibp, pp->accver);
1513  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", tibp->locusname, tibp->acnum);
1514  }
1515  } else {
1516  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
1517  }
1518 
1519  seq_entries.clear();
1520  } else {
      /* not the last member yet: keep the entry for the set in progress */
1521  GetSeqExt(pp, locs);
1522 
1523  seq_entries.push_back(ebp->seq_entry);
1524  ebp->seq_entry.Reset();
1525  }
1526 
      /* normal end of an iteration: release per-entry resources */
1527  delete dbp;
1528  MemFree(entry);
1529  GetScope().ResetHistory();
1530 
1531  } /* for, ascii block entries */
1532 
1534 
1535  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d (including: LONG ones = %d); SKIPPED = %d.", total, total_long, total_dropped);
1536 
1537  return true;
1538 }
1539 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:208
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:493
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:885
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2770
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:781
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:327
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1090
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2673
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2795
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2725
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2692
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2864
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:375
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:290
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, char *location, const char *name, bool iscon)
Definition: add.cpp:2293
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
Definition: add.cpp:178
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2555
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3377
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3241
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3213
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3477
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3312
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2917
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1744
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3281
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1772
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2776
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2439
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1632
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3084
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3156
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2860
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2464
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1274
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2535
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1020
list< string > TStringList
Definition: cgictx.cpp:719
void ProcessCitations(TEntryList &seq_entries)
Definition: citation.cpp:307
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
Definition: Date_std.cpp:91
void SetToTime(const CTime &time, CDate::EPrecision prec=CDate::ePrecision_second)
Definition: Date_std.cpp:59
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CTime –.
Definition: ncbitime.hpp:296
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
string mpQscore
Definition: ftablock.h:334
void SimpleDelete()
Definition: ftablock.h:323
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
void fta_build_ena_user_object(CSeq_descr::Tdata &descrs, TStringList &dr_ena, TStringList &dr_biosample, CRef< CUser_object > &dbuop)
Definition: em_ascii.cpp:1513
CRef< CEMBL_block > XMLGetEMBLBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, string &gbdiv, CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
Definition: em_ascii.cpp:2427
void FinalCleanup(TEntryList &seq_entries)
Definition: fcleanup.cpp:377
#define ERR_SEQUENCE_BadData
Definition: flat2err.h:150
#define ERR_TPA_TpaSpansMissing
Definition: flat2err.h:593
#define ERR_ENTRY_LongSequence
Definition: flat2err.h:82
#define ERR_FORMAT_MissingContigFeature
Definition: flat2err.h:43
#define ERR_KEYWORD_ShouldNotBeTPA
Definition: flat2err.h:208
#define ERR_DIVISION_BadTSADivcode
Definition: flat2err.h:261
#define ERR_FORMAT_MissingSequenceData
Definition: flat2err.h:41
#define ERR_DIVISION_InvalidHTCKeyword
Definition: flat2err.h:254
#define ERR_KEYWORD_IllegalForCON
Definition: flat2err.h:210
#define ERR_DIVISION_MissingHTGKeywords
Definition: flat2err.h:249
#define ERR_QSCORE_FailedToParse
Definition: flat2err.h:577
#define ERR_ENTRY_LongHTGSSequence
Definition: flat2err.h:86
#define ERR_KEYWORD_MissingTSA
Definition: flat2err.h:216
#define ERR_DIVISION_BadTPADivcode
Definition: flat2err.h:257
#define ERR_REFERENCE_No_references
Definition: flat2err.h:289
#define ERR_KEYWORD_ShouldNotBeTLS
Definition: flat2err.h:218
#define ERR_ENTRY_GBBlock_not_Empty
Definition: flat2err.h:85
#define ERR_KEYWORD_HTGPlusENV
Definition: flat2err.h:217
#define ERR_DEFINITION_MissingTPA
Definition: flat2err.h:269
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_DEFINITION_MissingTLS
Definition: flat2err.h:273
#define ERR_KEYWORD_ESTSubstring
Definition: flat2err.h:204
#define ERR_KEYWORD_ConflictingKeywords
Definition: flat2err.h:207
#define ERR_DIVISION_ConDivLacksContig
Definition: flat2err.h:252
#define ERR_LOCATION_ContigHasNull
Definition: flat2err.h:397
#define ERR_SEGMENT_OnlyOneMember
Definition: flat2err.h:165
#define ERR_KEYWORD_ENV_NoMatchingQualifier
Definition: flat2err.h:214
#define ERR_KEYWORD_ShouldNotBeTSA
Definition: flat2err.h:215
#define ERR_KEYWORD_STSSubstring
Definition: flat2err.h:205
#define ERR_DIVISION_UnknownDivCode
Definition: flat2err.h:222
#define ERR_KEYWORD_MissingTLS
Definition: flat2err.h:219
#define ERR_DEFINITION_ShouldNotBeTSA
Definition: flat2err.h:270
#define ERR_SEGMENT_Rejected
Definition: flat2err.h:166
#define ERR_DIVISION_MissingHTCKeyword
Definition: flat2err.h:253
#define ERR_DIVISION_MappedtoCON
Definition: flat2err.h:248
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_FORMAT_ContigWithSequenceData
Definition: flat2err.h:42
#define ERR_KEYWORD_NoGeneExpressionKeywords
Definition: flat2err.h:213
#define ERR_DEFINITION_MissingTSA
Definition: flat2err.h:271
#define ERR_DEFINITION_ShouldNotBeTPA
Definition: flat2err.h:268
#define ERR_KEYWORD_MissingTPA
Definition: flat2err.h:209
#define ERR_DIVISION_ConDivInSegset
Definition: flat2err.h:251
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_DIVISION_Mismatch
Definition: flat2err.h:226
#define ERR_ORGANISM_NoOrganism
Definition: flat2err.h:184
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_DIVISION_HTCWrongMolType
Definition: flat2err.h:255
#define ERR_KEYWORD_ShouldNotBeCAGE
Definition: flat2err.h:211
#define ERR_DEFINITION_ShouldNotBeTLS
Definition: flat2err.h:272
#define ERR_TSA_UnexpectedPrimaryAccession
Definition: flat2err.h:609
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
#define INSDSEQ_TOPOLOGY
Definition: fta_xml.h:46
#define INSDSEQ_MOLTYPE
Definition: fta_xml.h:45
#define INSDSEQ_DEFINITION
Definition: fta_xml.h:52
DataBlkPtr XMLBuildRefDataBlk(char *entry, const XmlIndex *xip, int type)
Definition: xm_index.cpp:1489
char * XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:213
#define INSDSEQ_STRANDEDNESS
Definition: fta_xml.h:44
char * XMLGetTagValue(const char *entry, const XmlIndex *xip)
Definition: xm_index.cpp:197
#define INSDSEQ_COMMENT
Definition: fta_xml.h:64
#define INSDSEQ_ORGANISM
Definition: fta_xml.h:61
void XMLGetKeywords(const char *entry, const XmlIndex *xip, TKeywordList &keywords)
Definition: xm_index.cpp:1522
char * XMLLoadEntry(ParserPtr pp, bool err)
Definition: xm_index.cpp:967
#define INSDSEQ_KEYWORDS
Definition: fta_xml.h:58
#define INSDSEQ_TAXONOMY
Definition: fta_xml.h:62
#define INSDSEQ_CREATE_DATE
Definition: fta_xml.h:49
#define INSDSEQ_DIVISION
Definition: fta_xml.h:47
#define INSDSEQ_UPDATE_DATE
Definition: fta_xml.h:48
#define INSDSEQ_SOURCE
Definition: fta_xml.h:60
#define INSDSEQ_CONTIG
Definition: fta_xml.h:70
char * XMLConcatSubTags(const char *entry, const XmlIndex *xip, Int4 tag, Char sep)
Definition: xm_index.cpp:1548
#define INSDSEQ_PRIMARY
Definition: fta_xml.h:65
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:61
char * StringSave(const char *s)
Definition: ftacpp.hpp:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:116
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:106
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:96
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:74
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:78
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:344
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:762
void CheckFeatSeqLoc(TEntryList &seq_entries)
Definition: gb_ascii.cpp:2403
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
Definition: genref.cpp:2981
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
void SetCommon(const TCommon &value)
Assign a value to Common data member.
Definition: Org_ref_.hpp:428
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetTech(void)
Reset Tech data member.
Definition: MolInfo_.hpp:484
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
ETopology
topology of molecule
Definition: Seq_inst_.hpp:121
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seqdesc_.cpp:456
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
EStrand
strandedness in living organism
Definition: Seq_inst_.hpp:133
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TCreate_date & SetCreate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:478
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_REF_END
Definition: index.h:60
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:611
Int2 XMLCheckSTRAND(const char *str)
Definition: indx_blk.cpp:485
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:995
Int2 CheckDIV(const char *str)
Definition: indx_blk.cpp:532
Int2 XMLCheckTPG(const char *str)
Definition: indx_blk.cpp:491
int i
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: loadfeat.cpp:5131
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: loadfeat.cpp:4825
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
std::list< SeqLoc > TSeqLocList
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2691
void ProcNucProt(ParserPtr pp, TEntryList &seq_entries, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:2537
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data)
Definition: ref.cpp:2445
int offset
Definition: replacements.h:160
static const char * str(char *buf, int n)
Definition: stats.c:84
CRef< objects::CSeq_entry > seq_entry
Definition: ftablock.h:346
bool valid
Definition: nucprot.h:64
Char acnum[200]
Definition: ftablock.h:169
CRef< objects::CPatent_seq_id > psip
Definition: ftablock.h:193
Char division[4]
Definition: ftablock.h:174
bool is_mga
Definition: ftablock.h:202
bool tsa_allowed
Definition: ftablock.h:214
Int2 htg
Definition: ftablock.h:199
bool is_tls
Definition: ftablock.h:211
Int2 vernum
Definition: ftablock.h:170
bool is_tpa
Definition: ftablock.h:209
TKeywordList keywords
Definition: ftablock.h:243
bool is_prot
Definition: ftablock.h:225
bool is_wgs
Definition: ftablock.h:208
bool origin
Definition: ftablock.h:204
bool is_contig
Definition: ftablock.h:200
bool STS
Definition: ftablock.h:196
bool is_pat
Definition: ftablock.h:205
bool HTC
Definition: ftablock.h:198
bool drop
Definition: ftablock.h:185
bool experimental
Definition: ftablock.h:250
size_t bases
Definition: ftablock.h:175
bool inferential
Definition: ftablock.h:248
Uint2 segtotal
Definition: ftablock.h:178
bool is_tsa
Definition: ftablock.h:210
bool EST
Definition: ftablock.h:195
GapFeatsPtr gaps
Definition: ftablock.h:217
string wgssec
Definition: ftablock.h:239
bool specialist_db
Definition: ftablock.h:246
Uint2 segnum
Definition: ftablock.h:176
Char locusname[200]
Definition: ftablock.h:173
bool env_sample_qual
Definition: ftablock.h:222
XmlIndexPtr xip
Definition: ftablock.h:220
size_t qslength
Definition: ftablock.h:233
bool GSS
Definition: ftablock.h:197
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
char *(* ff_get_qscore)(const char *accession, Int2 v)
TEntryList entries
XmlIndex * next
Definition: ftablock.h:161
Int4 tag
Definition: ftablock.h:153
CScope & GetScope()
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:454
bool GetGenomeInfo(CBioSource &bsp, const Char *bptr)
Definition: utilfeat.cpp:244
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1719
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1748
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1281
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1733
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1422
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1408
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1377
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1437
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1340
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1641
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1708
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1247
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1452
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1165
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1592
CRef< CSeq_loc > xgbparseint_ver(const char *raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1466
USING_SCOPE(objects)
bool XMLAscii(ParserPtr pp)
Definition: xm_ascii.cpp:1135
bool XMLGetInst(ParserPtr pp, DataBlkPtr dbp, unsigned char *dnaconv, CBioseq &bioseq)
Definition: xm_ascii.cpp:189
static bool XMLGetInstContig(XmlIndexPtr xip, DataBlkPtr dbp, CBioseq &bioseq, ParserPtr pp)
Definition: xm_ascii.cpp:126
static void XMLGetDescr(ParserPtr pp, DataBlkPtr entry, CBioseq &bioseq)
Definition: xm_ascii.cpp:791
static CRef< CGB_block > XMLGetGBBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, CBioSource *bio_src)
Definition: xm_ascii.cpp:240
static CRef< CMolInfo > XMLGetMolInfo(ParserPtr pp, DataBlkPtr entry, COrg_ref *org_ref)
Definition: xm_ascii.cpp:614
static void XMLCheckContigEverywhere(IndexblkPtr ibp, Parser::ESource source)
Definition: xm_ascii.cpp:92
static void XMLGetDivision(const char *entry, IndexblkPtr ibp)
Definition: xm_ascii.cpp:1119
static void XMLFakeBioSources(XmlIndexPtr xip, const char *entry, CBioseq &bioseq, Parser::ESource source)
Definition: xm_ascii.cpp:660
static void XMLGetDescrComment(char *offset)
Definition: xm_ascii.cpp:728
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)
Definition: xutils.cpp:91
Modified on Thu Nov 30 04:53:52 2023 by modify_doxy.py rev. 669887