NCBI C++ ToolKit
xm_ascii.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: xm_ascii.cpp 102964 2024-08-11 13:12:24Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: xm_ascii.cpp
27  *
28  * Author: Sergey Bazhin
29  *
30  * File Description:
31  * Parse INSDSEQ from blocks to asn.
32  * Build XML format entry block.
33  *
34  */
35 
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <objects/seq/Seq_inst.hpp>
42 #include <objects/seq/Seq_ext.hpp>
48 #include <objmgr/scope.hpp>
49 #include <objects/seq/MolInfo.hpp>
55 #include <objects/seq/Pubdesc.hpp>
56 
57 
58 #include "index.h"
59 
60 #include "ftanet.h"
63 
64 #include "ftaerr.hpp"
65 #include "indx_blk.h"
66 #include "asci_blk.h"
67 #include "utilref.h"
68 #include "utilfeat.h"
69 #include "loadfeat.h"
70 #include "add.h"
71 #include "gb_ascii.h"
72 #include "nucprot.h"
73 #include "fta_qscore.h"
74 #include "em_ascii.h"
75 #include "citation.h"
76 #include "fcleanup.h"
77 #include "utilfun.h"
78 #include "ref.h"
79 #include "xgbparint.h"
80 #include "xutils.h"
81 #include "fta_xml.h"
82 
83 #ifdef THIS_FILE
84 # undef THIS_FILE
85 #endif
86 #define THIS_FILE "xm_ascii.cpp"
87 
90 
91 /**********************************************************/
93 {
94  if (! ibp)
95  return;
96 
97  bool condiv = (NStr::CompareNocase(ibp->division, "CON") == 0);
98 
99  if (condiv && ibp->segnum != 0) {
100  ErrPostEx(SEV_ERROR, ERR_DIVISION_ConDivInSegset, "Use of the CON division is not allowed for members of segmented set : %s|%s. Entry dropped.", ibp->locusname, ibp->acnum);
101  ibp->drop = true;
102  return;
103  }
104 
105  if (! condiv && ibp->is_contig == false && ibp->origin == false) {
106  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingSequenceData, "Required sequence data is absent. Entry dropped.");
107  ibp->drop = true;
108  } else if (! condiv && ibp->is_contig && ibp->origin == false) {
109  ErrPostEx(SEV_WARNING, ERR_DIVISION_MappedtoCON, "Division [%s] mapped to CON based on the existence of <INSDSeq_contig> line.", ibp->division);
110  } else if (ibp->is_contig && ibp->origin) {
112  ErrPostStr(SEV_INFO, ERR_FORMAT_ContigWithSequenceData, "The <INSDSeq_contig> linetype and sequence data are both present. Ignoring sequence data.");
113  } else {
114  ErrPostStr(SEV_REJECT, ERR_FORMAT_ContigWithSequenceData, "The <INSDSeq_contig> linetype and sequence data may not both be present in a sequence record.");
115  ibp->drop = true;
116  }
117  } else if (condiv && ibp->is_contig == false && ibp->origin == false) {
118  ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingContigFeature, "No <INSDSeq_contig> data in XML format file. Entry dropped.");
119  ibp->drop = true;
120  } else if (condiv && ibp->is_contig == false && ibp->origin) {
121  ErrPostStr(SEV_WARNING, ERR_DIVISION_ConDivLacksContig, "Division is CON, but <INSDSeq_contig> data have not been found.");
122  }
123 }
124 
125 /**********************************************************/
126 static bool XMLGetInstContig(XmlIndexPtr xip, DataBlkPtr dbp, CBioseq& bioseq, ParserPtr pp)
127 {
128  char* p;
129  char* q;
130  char* r;
131  bool locmap;
132  bool allow_crossdb_featloc;
133  Int4 i;
134  int numerr;
135 
137  if (! p)
138  return false;
139 
140  for (q = p, r = p; *q != '\0'; q++)
141  if (*q != '\n' && *q != '\t' && *q != ' ')
142  *r++ = *q;
143  *r = '\0';
144 
145  for (q = p; *q != '\0'; q++)
146  if ((q[0] == ',' && q[1] == ',') || (q[0] == '(' && q[1] == ',') ||
147  (q[0] == ',' && q[1] == ')'))
148  break;
149  if (*q != '\0') {
150  ErrPostStr(SEV_REJECT, ERR_LOCATION_ContigHasNull, "The join() statement for this record's contig line contains one or more comma-delimited components which are null.");
151  MemFree(p);
152  return false;
153  }
154 
155  pp->buf.reset();
156 
157  CRef<CSeq_loc> loc = xgbparseint_ver(p, locmap, numerr, bioseq.GetId(), pp->accver);
158 
159  if (loc.Empty()) {
160  MemFree(p);
161  return true;
162  }
163 
164  allow_crossdb_featloc = pp->allow_crossdb_featloc;
165  pp->allow_crossdb_featloc = true;
166 
167  TSeqLocList locs;
168  locs.push_back(loc);
169  i = fta_fix_seq_loc_id(locs, pp, p, nullptr, true);
170  if (i > 999)
172 
173  pp->allow_crossdb_featloc = allow_crossdb_featloc;
174 
175  if (loc->IsMix()) {
176  XGappedSeqLocsToDeltaSeqs(loc->GetMix(), bioseq.SetInst().SetExt().SetDelta().Set());
177  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
178  } else
179  bioseq.SetInst().ResetExt();
180 
181  MemFree(p);
182 
183  return true;
184 }
185 
186 /**********************************************************/
187 bool XMLGetInst(ParserPtr pp, DataBlkPtr dbp, unsigned char* dnaconv, CBioseq& bioseq)
188 {
189  IndexblkPtr ibp;
190  XmlIndexPtr xip;
191  Int2 topology;
192  Int2 strand;
193  char* topstr;
194  char* strandstr;
195 
196  ibp = pp->entrylist[pp->curindx];
197  topstr = nullptr;
198  strandstr = nullptr;
199  for (xip = ibp->xip; xip; xip = xip->next) {
200  if (xip->tag == INSDSEQ_TOPOLOGY && ! topstr)
201  topstr = StringSave(XMLGetTagValue(dbp->mOffset, xip));
202  else if (xip->tag == INSDSEQ_STRANDEDNESS && ! strandstr)
203  strandstr = StringSave(XMLGetTagValue(dbp->mOffset, xip));
204  }
205  if (! topstr)
206  topstr = StringSave(" ");
207  if (! strandstr)
208  strandstr = StringSave(" ");
209 
210  CSeq_inst& inst = bioseq.SetInst();
212 
213  /* get linear, circular, tandem topology, blank is linear which = 1
214  */
215  topology = XMLCheckTPG(topstr);
216  if (topology > 1)
217  inst.SetTopology(static_cast<CSeq_inst::ETopology>(topology));
218 
219  strand = XMLCheckSTRAND(strandstr);
220  if (strand > 0)
221  inst.SetStrand(static_cast<CSeq_inst::EStrand>(strand));
222 
223  if (topstr)
224  MemFree(topstr);
225  if (strandstr)
226  MemFree(strandstr);
227 
228  if (! GetSeqData(pp, *dbp, bioseq, 0, dnaconv, ibp->is_prot ? eSeq_code_type_iupacaa : eSeq_code_type_iupacna))
229  return false;
230 
231  if (ibp->is_contig && ! XMLGetInstContig(ibp->xip, dbp, bioseq, pp))
232  return false;
233 
234  return true;
235 }
236 
237 /**********************************************************/
238 static CRef<CGB_block> XMLGetGBBlock(ParserPtr pp, const char* entry, CMolInfo& mol_info, CBioSource* bio_src)
239 {
240  CRef<CGB_block> gbb(new CGB_block),
241  ret;
242 
243  IndexblkPtr ibp;
244  char* bptr;
245  char* str;
246  char msg[4];
247  Int2 div;
248  bool if_cds;
249 
250  bool pat_ref = false;
251  bool est_kwd = false;
252  bool sts_kwd = false;
253  bool gss_kwd = false;
254  bool htc_kwd = false;
255  bool fli_kwd = false;
256  bool wgs_kwd = false;
257  bool tpa_kwd = false;
258  bool tsa_kwd = false;
259  bool tls_kwd = false;
260  bool env_kwd = false;
261  bool mga_kwd = false;
262 
263  bool cancelled;
264  bool drop;
265  char* tempdiv;
266  Int2 thtg;
267  char* p;
268  Int4 i;
269 
270  ibp = pp->entrylist[pp->curindx];
271 
272  ibp->wgssec[0] = '\0';
273 
275  if (str) {
276  p = StringRChr(str, '.');
277  if (p && p > str && p[1] == '\0' && *(p - 1) == '.')
278  *p = '\0';
279 
280  gbb->SetSource(str);
281  MemFree(str);
282  }
283 
284  if (! ibp->keywords.empty()) {
285  gbb->SetKeywords().swap(ibp->keywords);
286  ibp->keywords.clear();
287  } else
288  XMLGetKeywords(entry, ibp->xip, gbb->SetKeywords());
289 
290  if (ibp->is_mga && ! fta_check_mga_keywords(mol_info, gbb->GetKeywords())) {
291  return ret;
292  }
293 
294  if (ibp->is_tpa && ! fta_tpa_keywords_check(gbb->SetKeywords())) {
295  return ret;
296  }
297 
298  if (ibp->is_tsa && ! fta_tsa_keywords_check(gbb->SetKeywords(), pp->source)) {
299  return ret;
300  }
301 
302  if (ibp->is_tls && ! fta_tls_keywords_check(gbb->SetKeywords(), pp->source)) {
303  return ret;
304  }
305 
306  for (const string& key : gbb->GetKeywords()) {
307  fta_keywords_check(key.c_str(), &est_kwd, &sts_kwd, &gss_kwd, &htc_kwd, &fli_kwd, &wgs_kwd, &tpa_kwd, &env_kwd, &mga_kwd, &tsa_kwd, &tls_kwd);
308  }
309 
310  if (ibp->env_sample_qual == false && env_kwd) {
311  ErrPostStr(SEV_REJECT, ERR_KEYWORD_ENV_NoMatchingQualifier, "This record utilizes the ENV keyword, but there are no /environmental_sample qualifiers among its source features.");
312  return ret;
313  }
314 
315  bptr = StringSave(XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION));
316  if (bptr) {
317  if_cds = XMLCheckCDS(entry, ibp->xip);
318  div = CheckDIV(bptr);
319  if (div != -1) {
320  string div_str(bptr, bptr + 3);
321  gbb->SetDiv(div_str);
322 
323  if (div == 16) /* "ORG" replaced by "UNA" */
324  gbb->SetDiv("UNA");
325 
326  /* preserve the division code for later use
327  */
328  const char* p_div = gbb->GetDiv().c_str();
329  StringCpy(ibp->division, p_div);
330 
331  if (ibp->psip.NotEmpty())
332  pat_ref = true;
333 
334  if (ibp->is_tpa &&
335  (StringEqu(p_div, "EST") || StringEqu(p_div, "GSS") ||
336  StringEqu(p_div, "PAT") || StringEqu(p_div, "HTG"))) {
337  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTPADivcode, "Division code \"%s\" is not legal for TPA records. Entry dropped.", p_div);
338  return ret;
339  }
340 
341  if (ibp->is_tsa && ! StringEqu(p_div, "TSA")) {
342  ErrPostEx(SEV_REJECT, ERR_DIVISION_BadTSADivcode, "Division code \"%s\" is not legal for TSA records. Entry dropped.", p_div);
343  return ret;
344  }
345 
346  cancelled = IsCancelled(gbb->GetKeywords());
347 
348  if (StringEqu(p_div, "HTG")) {
349  if (! HasHtg(gbb->GetKeywords())) {
350  ErrPostStr(SEV_ERROR, ERR_DIVISION_MissingHTGKeywords, "Division is HTG, but entry lacks HTG-related keywords. Entry dropped.");
351  return ret;
352  }
353  }
354 
355  tempdiv = StringSave(gbb->GetDiv());
356 
357  if (fta_check_htg_kwds(gbb->SetKeywords(), pp->entrylist[pp->curindx], mol_info))
358  gbb->SetDiv("");
359 
360  XMLDefVsHTGKeywords(mol_info.GetTech(), entry, ibp->xip, cancelled);
361 
362  CheckHTGDivision(tempdiv, mol_info.GetTech());
363  if (tempdiv)
364  MemFree(tempdiv);
365 
366  i = 0;
367  if (est_kwd)
368  i++;
369  if (sts_kwd)
370  i++;
371  if (gss_kwd)
372  i++;
373  if (ibp->htg > 0)
374  i++;
375  if (htc_kwd)
376  i++;
377  if (fli_kwd)
378  i++;
379  if (wgs_kwd)
380  i++;
381  if (env_kwd)
382  i++;
383  if (mga_kwd) {
384  if (ibp->is_mga == false) {
385  ErrPostStr(SEV_REJECT, ERR_KEYWORD_ShouldNotBeCAGE, "This is apparently _not_ a CAGE record, but the special keywords are present. Entry dropped.");
386  return ret;
387  }
388  i++;
389  } else if (ibp->is_mga) {
390  ErrPostStr(SEV_REJECT, ERR_KEYWORD_NoGeneExpressionKeywords, "This is apparently a CAGE or 5'-SAGE record, but it lacks the required keywords. Entry dropped.");
391  return ret;
392  }
393  if (tpa_kwd) {
394  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL) {
395  ErrPostStr(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA\" and/or \"Third Party Annotation\" keywords are present. Entry dropped.");
396  return ret;
397  }
398  i++;
399  } else if (ibp->is_tpa) {
400  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA\" and/or \"Third Party Annotation\" keywords. Entry dropped.");
401  return ret;
402  }
403  if (tsa_kwd) {
404  if (ibp->is_tsa == false) {
405  ErrPostStr(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords are present. Entry dropped.");
406  return ret;
407  }
408  i++;
409  } else if (ibp->is_tsa) {
410  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA\" and/or \"Transcriptome Shotgun Assembly\" keywords. Entry dropped.");
411  return ret;
412  }
413  if (tls_kwd) {
414  if (ibp->is_tls == false) {
415  ErrPostStr(SEV_REJECT, ERR_KEYWORD_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS\" and/or \"Targeted Locus Study\" keywords are present. Entry dropped.");
416  return ret;
417  }
418  i++;
419  } else if (ibp->is_tls) {
420  ErrPostStr(SEV_REJECT, ERR_KEYWORD_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS\" and/or \"Targeted Locus Study\" keywords. Entry dropped.");
421  return ret;
422  }
423  if (i > 1) {
424  if (i == 2 && ibp->htg > 0 && env_kwd)
425  ErrPostStr(SEV_WARNING, ERR_KEYWORD_HTGPlusENV, "This HTG record also has the ENV keyword, which is an unusual combination. Confirmation that isolation and cloning steps actually occured might be appropriate.");
426  else if (i != 2 || env_kwd == false ||
427  (est_kwd == false && gss_kwd == false && wgs_kwd == false)) {
428  ErrPostStr(SEV_REJECT, ERR_KEYWORD_ConflictingKeywords, "This record contains more than one of the special keywords used to indicate that a sequence is an HTG, EST, GSS, STS, HTC, WGS, ENV, FLI_CDNA, TPA, CAGE, TSA or TLS sequence.");
429  return ret;
430  }
431  }
432 
433  if (wgs_kwd)
434  i--;
435  if (ibp->is_contig && i > 0 &&
436  wgs_kwd == false && tpa_kwd == false && env_kwd == false) {
437  ErrPostStr(SEV_REJECT, ERR_KEYWORD_IllegalForCON, "This CON record should not have HTG, EST, GSS, STS, HTC, FLI_CDNA, CAGE, TSA or TLS special keywords. Entry dropped.");
438  return ret;
439  }
440 
441  thtg = mol_info.GetTech();
442  if (thtg == CMolInfo::eTech_htgs_0 || thtg == CMolInfo::eTech_htgs_1 ||
443  thtg == CMolInfo::eTech_htgs_2 || thtg == CMolInfo::eTech_htgs_3) {
444  RemoveHtgPhase(gbb->SetKeywords());
445  }
446 
447  char* kw = StringSave(XMLConcatSubTags(entry, ibp->xip, INSDSEQ_KEYWORDS, ';'));
448  if (kw) {
449  if (! est_kwd && StringStr(kw, "EST")) {
450  ErrPostEx(SEV_WARNING, ERR_KEYWORD_ESTSubstring, "Keyword %s has substring EST, but no official EST keywords found", kw);
451  }
452  if (! sts_kwd && StringStr(kw, "STS")) {
453  ErrPostEx(SEV_WARNING, ERR_KEYWORD_STSSubstring, "Keyword %s has substring STS, but no official STS keywords found", kw);
454  }
455  MemFree(kw);
456  }
457 
458  if (! ibp->is_contig) {
459  drop = false;
460  CMolInfo::TTech tech = mol_info.GetTech();
461  string p_div = gbb->GetDiv();
462 
463  check_div(ibp->is_pat, pat_ref, est_kwd, sts_kwd, gss_kwd, if_cds, p_div, &tech, ibp->bases, pp->source, drop);
464 
465  if (tech != CMolInfo::eTech_unknown)
466  mol_info.SetTech(tech);
467  else
468  mol_info.ResetTech();
469 
470  if (! p_div.empty())
471  gbb->SetDiv(p_div);
472  else
473  gbb->SetDiv("");
474 
475  if (drop) {
476  MemFree(bptr);
477  return ret;
478  }
479  } else if (gbb->GetDiv() == "CON") {
480  gbb->SetDiv("");
481  }
482  } else {
483  MemCpy(msg, bptr, 3);
484  msg[3] = '\0';
485  ErrPostEx(SEV_REJECT, ERR_DIVISION_UnknownDivCode, "Unknown division code \"%s\" found in GenBank flatfile. Record rejected.", msg);
486  MemFree(bptr);
487  return ret;
488  }
489 
490  if (IsNewAccessFormat(ibp->acnum) == 0 && *ibp->acnum == 'T' &&
491  gbb->GetDiv() != "EST") {
492  ErrPostStr(SEV_INFO, ERR_DIVISION_MappedtoEST, "Leading T in accession number.");
493  mol_info.SetTech(CMolInfo::eTech_est);
494 
495  gbb->SetDiv("");
496  }
497 
498  MemFree(bptr);
499  }
500 
501  bool is_htc_div = gbb->GetDiv() == "HTC",
502  has_htc = HasHtc(gbb->GetKeywords());
503 
504  if (is_htc_div && ! has_htc) {
505  ErrPostStr(SEV_ERROR, ERR_DIVISION_MissingHTCKeyword, "This record is in the HTC division, but lacks the required HTC keyword.");
506  return ret;
507  }
508 
509  if (! is_htc_div && has_htc) {
510  ErrPostStr(SEV_ERROR, ERR_DIVISION_InvalidHTCKeyword, "This record has the special HTC keyword, but is not in HTC division. If this record has graduated out of HTC, then the keyword should be removed.");
511  return ret;
512  }
513 
514  if (is_htc_div) {
516  if (str) {
517  p = str;
518  if (*str == 'm' || *str == 'r')
519  p = str + 1;
520  else if (StringEquN(str, "pre-", 4))
521  p = str + 4;
522  else if (StringEquN(str, "transcribed ", 12))
523  p = str + 12;
524 
525  if (! StringEquN(p, "RNA", 3)) {
526  ErrPostStr(SEV_ERROR, ERR_DIVISION_HTCWrongMolType, "All HTC division records should have a moltype of pre-RNA, mRNA or RNA.");
527  MemFree(str);
528  return ret;
529  }
530  MemFree(str);
531  }
532  }
533 
534  if (fli_kwd)
536 
537  /* will be used in flat file database
538  */
539  if (! gbb->GetDiv().empty()) {
540  if (gbb->GetDiv() == "EST") {
541  ibp->EST = true;
542  mol_info.SetTech(CMolInfo::eTech_est);
543  gbb->SetDiv("");
544  } else if (gbb->GetDiv() == "STS") {
545  ibp->STS = true;
546  mol_info.SetTech(CMolInfo::eTech_sts);
547  gbb->SetDiv("");
548  } else if (gbb->GetDiv() == "GSS") {
549  ibp->GSS = true;
551  gbb->SetDiv("");
552  } else if (gbb->GetDiv() == "HTC") {
553  ibp->HTC = true;
554  mol_info.SetTech(CMolInfo::eTech_htc);
555  gbb->SetDiv("");
556  } else if (gbb->GetDiv() == "SYN" && bio_src && bio_src->IsSetOrigin() &&
557  bio_src->GetOrigin() == 5) /* synthetic */
558  {
559  gbb->SetDiv("");
560  }
561  } else if (mol_info.IsSetTech()) {
562  if (mol_info.GetTech() == CMolInfo::eTech_est)
563  ibp->EST = true;
564  if (mol_info.GetTech() == CMolInfo::eTech_sts)
565  ibp->STS = true;
566  if (mol_info.GetTech() == CMolInfo::eTech_survey)
567  ibp->GSS = true;
568  if (mol_info.GetTech() == CMolInfo::eTech_htc)
569  ibp->HTC = true;
570  }
571 
572  if (mol_info.IsSetTech())
573  fta_remove_keywords(mol_info.GetTech(), gbb->SetKeywords());
574 
575  if (ibp->is_tpa)
576  fta_remove_tpa_keywords(gbb->SetKeywords());
577 
578  if (ibp->is_tsa)
579  fta_remove_tsa_keywords(gbb->SetKeywords(), pp->source);
580 
581  if (ibp->is_tls)
582  fta_remove_tls_keywords(gbb->SetKeywords(), pp->source);
583 
584  if (bio_src && bio_src->IsSetSubtype()) {
585  for (const auto& subtype : bio_src->GetSubtype()) {
586  if (subtype->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
587  fta_remove_env_keywords(gbb->SetKeywords());
588  break;
589  }
590  }
591  }
592 
593  GetExtraAccession(ibp, pp->allow_uwsec, pp->source, gbb->SetExtra_accessions());
594 
595  if (gbb->IsSetDiv() &&
596  bio_src &&
597  bio_src->IsSetOrg() &&
598  bio_src->GetOrg().IsSetOrgname() &&
599  bio_src->GetOrg().GetOrgname().IsSetDiv() &&
600  bio_src->GetOrg().GetOrgname().GetDiv() == gbb->GetDiv()) {
601  gbb->ResetDiv();
602  }
603 
604  return gbb;
605 }
606 
607 /**********************************************************/
609 {
610  IndexblkPtr ibp;
611 
612  char* div;
613  char* molstr;
614 
615  ibp = pp->entrylist[pp->curindx];
616 
617  CRef<CMolInfo> mol_info(new CMolInfo);
618 
619  molstr = StringSave(XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_MOLTYPE));
620  div = StringSave(XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_DIVISION));
621 
622  if (StringEquN(div, "EST", 3))
623  mol_info->SetTech(CMolInfo::eTech_est);
624  else if (StringEquN(div, "STS", 3))
625  mol_info->SetTech(CMolInfo::eTech_sts);
626  else if (StringEquN(div, "GSS", 3))
627  mol_info->SetTech(CMolInfo::eTech_survey);
628  else if (StringEquN(div, "HTG", 3))
629  mol_info->SetTech(CMolInfo::eTech_htgs_1);
630  else if (ibp->is_wgs) {
631  if (ibp->is_tsa)
632  mol_info->SetTech(CMolInfo::eTech_tsa);
633  else if (ibp->is_tls)
634  mol_info->SetTech(CMolInfo::eTech_targeted);
635  else
636  mol_info->SetTech(CMolInfo::eTech_wgs);
637  } else if (ibp->is_tsa)
638  mol_info->SetTech(CMolInfo::eTech_tsa);
639  else if (ibp->is_tls)
640  mol_info->SetTech(CMolInfo::eTech_targeted);
641 
642  MemFree(div);
643  GetFlatBiomol(mol_info->SetBiomol(), mol_info->GetTech(), molstr, pp, *entry, org_ref);
644  if (mol_info->GetBiomol() == CMolInfo::eBiomol_unknown) // not set
645  mol_info->ResetBiomol();
646 
647  if (molstr)
648  MemFree(molstr);
649 
650  return mol_info;
651 }
652 
653 /**********************************************************/
654 static void XMLFakeBioSources(XmlIndexPtr xip, const char* entry, CBioseq& bioseq, Parser::ESource source)
655 {
656  char* organism = nullptr;
657  char* taxonomy = nullptr;
658 
659  char* p;
660  char* q;
661 
662  for (; xip; xip = xip->next) {
663  if (xip->tag == INSDSEQ_ORGANISM && ! organism)
664  organism = StringSave(XMLGetTagValue(entry, xip));
665  else if (xip->tag == INSDSEQ_TAXONOMY && ! taxonomy)
666  taxonomy = StringSave(XMLGetTagValue(entry, xip));
667  }
668 
669  if (! organism) {
670  ErrPostStr(SEV_WARNING, ERR_ORGANISM_NoOrganism, "No <INSDSeq_organism> data in XML format file.");
671  if (taxonomy)
672  MemFree(taxonomy);
673  return;
674  }
675 
676  CRef<CBioSource> bio_src(new CBioSource);
677 
678  p = organism;
679  if (GetGenomeInfo(*bio_src, p) && bio_src->GetGenome() != CBioSource::eGenome_plasmid) {
680  while (*p != ' ' && *p != '\0')
681  p++;
682  while (*p == ' ')
683  p++;
684  }
685 
686  COrg_ref& org_ref = bio_src->SetOrg();
687 
688  if (source == Parser::ESource::EMBL) {
689  q = StringChr(p, '(');
690  if (q && q > p) {
691  for (q--; *q == ' ' || *q == '\t'; q--)
692  if (q == p)
693  break;
694  if (*q != ' ' && *q != '\t')
695  q++;
696  if (q > p) {
697  *q = '\0';
698  org_ref.SetCommon(p);
699  }
700  }
701  }
702 
703  org_ref.SetTaxname(p);
704  MemFree(organism);
705 
706  if (org_ref.GetTaxname() == "Unknown.") {
707  string& taxname = org_ref.SetTaxname();
708  taxname = taxname.substr(0, taxname.size() - 1);
709  }
710 
711  if (taxonomy) {
712  org_ref.SetOrgname().SetLineage(taxonomy);
713  }
714 
715  CRef<CSeqdesc> descr(new CSeqdesc);
716  descr->SetSource(*bio_src);
717  bioseq.SetDescr().Set().push_back(descr);
718 }
719 
720 /**********************************************************/
721 static void XMLGetDescrComment(char* offset)
722 {
723  char* p;
724  char* q;
725 
726  for (p = offset; *p == '\n' || *p == ' ';)
727  p++;
728  if (p > offset)
729  fta_StringCpy(offset, p);
730 
731  for (p = offset, q = offset; *p != '\0';) {
732  if (*p != '\n') {
733  *q++ = *p++;
734  continue;
735  }
736 
737  *q++ = '~';
738  for (p++; *p == ' ';)
739  p++;
740  }
741  *q = '\0';
742 
743  for (p = offset;;) {
744  p = StringStr(p, "; ");
745  if (! p)
746  break;
747  for (p += 2, q = p; *q == ' ';)
748  q++;
749  if (q > p)
750  fta_StringCpy(p, q);
751  }
752 
753  for (p = offset; *p == ' ';)
754  p++;
755  if (p > offset)
756  fta_StringCpy(offset, p);
757  for (p = offset; *p != '\0';)
758  p++;
759 
760  if (p > offset) {
761  for (p--;; p--) {
762  if (*p == ' ' || *p == '\t' || *p == ';' || *p == ',' ||
763  *p == '.' || *p == '~') {
764  if (p > offset)
765  continue;
766  *p = '\0';
767  }
768  break;
769  }
770  if (*p != '\0') {
771  p++;
772  if (StringEquN(p, "...", 3))
773  p[3] = '\0';
774  else if (StringChr(p, '.')) {
775  *p = '.';
776  p[1] = '\0';
777  } else
778  *p = '\0';
779  }
780  }
781 }
782 
783 /**********************************************************/
784 static void XMLGetDescr(ParserPtr pp, DataBlkPtr entry, CBioseq& bioseq)
785 {
786  IndexblkPtr ibp;
787 
788  DataBlkPtr dbp;
789  DataBlkPtr dbpnext;
790 
791  char* crdate;
792  char* update;
793  char* offset;
794  char* str;
795  char* p;
796  char* q;
797  string gbdiv;
798 
799  ibp = pp->entrylist[pp->curindx];
800 
801  CBioSource* bio_src = nullptr;
802  COrg_ref* org_ref = nullptr;
803 
804  /* ORGANISM
805  */
806  for (auto& descr : bioseq.SetDescr().Set()) {
807  if (descr->IsSource()) {
808  bio_src = &(descr->SetSource());
809  if (bio_src->IsSetOrg())
810  org_ref = &bio_src->SetOrg();
811  break;
812  }
813  }
814 
815  /* MolInfo from LOCUS line
816  */
817  CRef<CMolInfo> mol_info = XMLGetMolInfo(pp, entry, org_ref);
818 
819  /* DEFINITION data ==> descr_title
820  */
822  string title;
823 
824  if (str) {
825  for (p = str; *p == ' ';)
826  p++;
827  if (p > str)
828  fta_StringCpy(str, p);
829  if (pp->xml_comp && pp->source != Parser::ESource::EMBL) {
830  p = StringRChr(str, '.');
831  if (! p || p[1] != '\0') {
832  string s = str;
833  s += '.';
834  MemFree(str);
835  str = StringSave(s);
836  p = nullptr;
837  }
838  }
839 
840  title = str;
841  MemFree(str);
842  str = nullptr;
843 
844  CRef<CSeqdesc> descr(new CSeqdesc);
845  descr->SetTitle(title);
846  bioseq.SetDescr().Set().push_back(descr);
847 
848  if (ibp->is_tpa == false && pp->source != Parser::ESource::EMBL &&
849  StringEquN(title.c_str(), "TPA:", 4)) {
850  ErrPostStr(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTPA, "This is apparently _not_ a TPA record, but the special \"TPA:\" prefix is present on its definition line. Entry dropped.");
851  ibp->drop = true;
852  return;
853  }
854 
855  if (ibp->is_tsa == false && StringEquN(title.c_str(), "TSA:", 4)) {
856  ErrPostStr(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTSA, "This is apparently _not_ a TSA record, but the special \"TSA:\" prefix is present on its definition line. Entry dropped.");
857  ibp->drop = true;
858  return;
859  }
860 
861  if (ibp->is_tls == false && StringEquN(title.c_str(), "TLS:", 4)) {
862  ErrPostStr(SEV_REJECT, ERR_DEFINITION_ShouldNotBeTLS, "This is apparently _not_ a TLS record, but the special \"TLS:\" prefix is present on its definition line. Entry dropped.");
863  ibp->drop = true;
864  return;
865  }
866  }
867 
868  if (ibp->is_tpa &&
869  (title.empty() || ! StringEquN(title.c_str(), "TPA:", 4))) {
870  ErrPostStr(SEV_REJECT, ERR_DEFINITION_MissingTPA, "This is apparently a TPA record, but it lacks the required \"TPA:\" prefix on its definition line. Entry dropped.");
871  ibp->drop = true;
872  return;
873  }
874 
875  if (ibp->is_tsa &&
876  (title.empty() || ! StringEquN(title.c_str(), "TSA:", 4))) {
877  ErrPostStr(SEV_REJECT, ERR_DEFINITION_MissingTSA, "This is apparently a TSA record, but it lacks the required \"TSA:\" prefix on its definition line. Entry dropped.");
878  ibp->drop = true;
879  return;
880  }
881 
882  if (ibp->is_tls &&
883  (title.empty() || ! StringEquN(title.c_str(), "TLS:", 4))) {
884  ErrPostStr(SEV_REJECT, ERR_DEFINITION_MissingTLS, "This is apparently a TLS record, but it lacks the required \"TLS:\" prefix on its definition line. Entry dropped.");
885  ibp->drop = true;
886  return;
887  }
888 
889  /* REFERENCE
890  */
891  /* pub should be before GBblock because we need patent ref
892  */
893  dbp = XMLBuildRefDataBlk(entry->mOffset, ibp->xip, ParFlat_REF_END);
894  for (; dbp; dbp = dbpnext) {
895  dbpnext = dbp->mpNext;
896 
897  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, 0);
898  if (pubdesc.NotEmpty()) {
899  CRef<CSeqdesc> descr(new CSeqdesc);
900  descr->SetPub(*pubdesc);
901  bioseq.SetDescr().Set().push_back(descr);
902  }
903 
904  dbp->SimpleDelete();
905  }
906 
908  for (; dbp; dbp = dbpnext) {
909  dbpnext = dbp->mpNext;
910 
911  CRef<CPubdesc> pubdesc = DescrRefs(pp, dbp, 0);
912  if (pubdesc.NotEmpty()) {
913  CRef<CSeqdesc> descr(new CSeqdesc);
914  descr->SetPub(*pubdesc);
915  bioseq.SetDescr().Set().push_back(descr);
916  }
917 
918  dbp->SimpleDelete();
919  }
920 
921  TStringList dr_ena,
922  dr_biosample;
923 
924  CRef<CEMBL_block> embl;
925  CRef<CGB_block> gbb;
926 
927  if (pp->source == Parser::ESource::EMBL)
928  embl = XMLGetEMBLBlock(pp, entry->mOffset, *mol_info, gbdiv, bio_src, dr_ena, dr_biosample);
929  else
930  gbb = XMLGetGBBlock(pp, entry->mOffset, *mol_info, bio_src);
931 
932  CRef<CUser_object> dbuop;
933  if (! dr_ena.empty() || ! dr_biosample.empty())
934  fta_build_ena_user_object(bioseq.SetDescr().Set(), dr_ena, dr_biosample, dbuop);
935 
936  if (mol_info->IsSetBiomol() || mol_info->IsSetTech()) {
937  CRef<CSeqdesc> descr(new CSeqdesc);
938  descr->SetMolinfo(*mol_info);
939  bioseq.SetDescr().Set().push_back(descr);
940  }
941 
942  if (pp->source == Parser::ESource::EMBL) {
943  if (embl.Empty()) {
944  ibp->drop = true;
945  return;
946  }
947  } else if (gbb.Empty()) {
948  ibp->drop = true;
949  return;
950  }
951 
952  if (pp->source == Parser::ESource::EMBL) {
953  if (StringEquNI(ibp->division, "CON", 3))
954  fta_add_hist(pp, bioseq, embl->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, true, ibp->acnum);
955  else
956  fta_add_hist(pp, bioseq, embl->SetExtra_acc(), Parser::ESource::EMBL, CSeq_id::e_Embl, false, ibp->acnum);
957 
958  if (embl->GetExtra_acc().empty())
959  embl->ResetExtra_acc();
960  } else {
961  if (StringEquNI(ibp->division, "CON", 3))
962  fta_add_hist(pp, bioseq, gbb->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, true, ibp->acnum);
963  else
964  fta_add_hist(pp, bioseq, gbb->SetExtra_accessions(), Parser::ESource::DDBJ, CSeq_id::e_Ddbj, false, ibp->acnum);
965  }
966 
967  if (pp->source == Parser::ESource::EMBL) {
968  if (! gbdiv.empty()) {
969  gbb.Reset(new CGB_block);
970  gbb->SetDiv(gbdiv);
971  gbdiv.clear();
972  }
973 
974  CRef<CSeqdesc> descr(new CSeqdesc);
975  descr->SetEmbl(*embl);
976  bioseq.SetDescr().Set().push_back(descr);
977  }
978 
980  if (! offset && ibp->is_tpa && ibp->is_wgs == false) {
981  if (ibp->inferential || ibp->experimental) {
982  if (! fta_dblink_has_sra(dbuop)) {
983  ErrPostEx(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA:%s record lacks both AH/PRIMARY linetype and Sequence Read Archive links. Entry dropped.", (ibp->inferential == false) ? "experimental" : "inferential");
984  ibp->drop = true;
985  return;
986  }
987  } else if (ibp->specialist_db == false) {
988  ErrPostStr(SEV_REJECT, ERR_TPA_TpaSpansMissing, "TPA record lacks required AH/PRIMARY linetype. Entry dropped.");
989  ibp->drop = true;
990  return;
991  }
992  }
993 
994  if (offset) {
995  if (! fta_parse_tpa_tsa_block(bioseq, offset, ibp->acnum, ibp->vernum, 10, 0, ibp->is_tpa)) {
996  ibp->drop = true;
997  MemFree(offset);
998  return;
999  }
1000  MemFree(offset);
1001  }
1002 
1003  if (gbb.NotEmpty()) {
1004  if (pp->taxserver == 1 && gbb->IsSetDiv())
1005  fta_fix_orgref_div(bioseq.SetAnnot(), org_ref, *gbb);
1006 
1007  CRef<CSeqdesc> descr(new CSeqdesc);
1008  descr->SetGenbank(*gbb);
1009  bioseq.SetDescr().Set().push_back(descr);
1010  }
1011 
1012  /* COMMENT data
1013  */
1015  if (offset) {
1016  bool bad = false;
1017  TUserObjVector user_objs;
1018 
1019  fta_parse_structured_comment(offset, bad, user_objs);
1020 
1021  if (bad) {
1022  ibp->drop = true;
1023  MemFree(offset);
1024  return;
1025  }
1026 
1027  for (auto& user_obj : user_objs) {
1028  CRef<CSeqdesc> descr(new CSeqdesc);
1029  descr->SetUser(*user_obj);
1030  bioseq.SetDescr().Set().push_back(descr);
1031  }
1032 
1034  if (pp->xml_comp) {
1035  for (q = offset, p = q; *p != '\0';) {
1036  if (*p == ';' && (p[1] == ' ' || p[1] == '~'))
1037  *p = ' ';
1038  if (*p == '~' || *p == ' ') {
1039  *q++ = ' ';
1040  for (p++; *p == ' ' || *p == '~';)
1041  p++;
1042  } else
1043  *q++ = *p++;
1044  }
1045  *q = '\0';
1046  }
1047 
1048  if (offset[0] != 0) {
1049  CRef<CSeqdesc> descr(new CSeqdesc);
1050  descr->SetComment(offset);
1051  bioseq.SetDescr().Set().push_back(descr);
1052  }
1053  MemFree(offset);
1054  }
1055 
1056  /* DATE
1057  */
1058  if (pp->no_date) /* -N in command line means no date */
1059  return;
1060 
1061  CRef<CDate_std> std_upd_date,
1062  std_cre_date;
1063 
1064  if (pp->date) /* -L in command line means replace
1065  date */
1066  {
1067  CTime cur_time(CTime::eCurrent);
1068 
1069  std_upd_date.Reset(new CDate_std);
1070  std_upd_date->SetToTime(cur_time);
1071 
1072  std_cre_date.Reset(new CDate_std);
1073  std_cre_date->SetToTime(cur_time);
1074 
1075  update = nullptr;
1076  crdate = nullptr;
1077  } else {
1078  update = StringSave(XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_UPDATE_DATE));
1079  if (update)
1080  std_upd_date = GetUpdateDate(update, pp->source);
1081 
1082  crdate = StringSave(XMLFindTagValue(entry->mOffset, ibp->xip, INSDSEQ_CREATE_DATE));
1083  if (crdate)
1084  std_cre_date = GetUpdateDate(crdate, pp->source);
1085  }
1086 
1087  if (std_upd_date.NotEmpty()) {
1088  CRef<CSeqdesc> descr(new CSeqdesc);
1089  descr->SetUpdate_date().SetStd(*std_upd_date);
1090  bioseq.SetDescr().Set().push_back(descr);
1091 
1092  if (std_cre_date.NotEmpty() && std_cre_date->Compare(*std_upd_date) == CDate::eCompare_after) {
1093  ErrPostEx(SEV_ERROR, ERR_DATE_IllegalDate, "Update-date \"%s\" precedes create-date \"%s\".", update, crdate);
1094  }
1095  }
1096 
1097  if (std_cre_date.NotEmpty()) {
1098  if (pp->xml_comp == false || pp->source == Parser::ESource::EMBL) {
1099  CRef<CSeqdesc> descr(new CSeqdesc);
1100  descr->SetCreate_date().SetStd(*std_cre_date);
1101  bioseq.SetDescr().Set().push_back(descr);
1102  }
1103  }
1104 
1105  if (update)
1106  MemFree(update);
1107  if (crdate)
1108  MemFree(crdate);
1109 }
1110 
1111 /**********************************************************/
1112 static void XMLGetDivision(const char* entry, IndexblkPtr ibp)
1113 {
1114  char* div;
1115 
1116  if (! ibp || ! entry)
1117  return;
1118 
1119  div = StringSave(XMLFindTagValue(entry, ibp->xip, INSDSEQ_DIVISION));
1120  if (! div)
1121  return;
1122  div[3] = '\0';
1123  StringCpy(ibp->division, div);
1124  MemFree(div);
1125 }
1126 
1127 /**********************************************************/
1129 {
 // Main conversion loop: walks every indexed entry in pp->entrylist, builds
 // a Bioseq / Seq-entry for it, runs the validation and clean-up passes and
 // appends the accepted result to pp->entries.
 //
 // Returns false as soon as the text of an entry cannot be loaded
 // (XMLLoadEntry returns null); otherwise returns true after posting the
 // final "COMPLETED" summary, even when individual entries were skipped.
 //
 // NOTE(review): the signature line of this function is not visible next
 // to this body; `pp` is used throughout as the parser context - confirm
 // the exact name and parameter list against the declaration.
1130  Int4 i;
1131  Int4 imax;
1132  Int4 j;
 // Index (into pp->entrylist) of the first member of the segmented set
 // currently being assembled; -1 until one is seen.
1133  Int4 segindx;
 // Counters reported in the final summary message.
1134  Int4 total = 0;
1135  Int4 total_long = 0;
1136  Int4 total_dropped = 0;
1137  char* div;
1138  char* entry;
1139  EntryBlkPtr ebp;
1140 
 // Seq-entries collected for the current entry or segmented set; moved
 // into pp->entries on success and cleared otherwise.
1141  TEntryList seq_entries;
1142 
1143  CSeq_loc locs;
1144 
 // Set when any sequence of the current entry/set exceeds pp->limit.
1145  bool seq_long = false;
1146  IndexblkPtr ibp;
1147  IndexblkPtr tibp;
1148  DataBlkPtr dbp;
1149 
1150  /* set up sequence alphabets
1151  */
1152  auto dnaconv = GetDNAConv();
1153  auto protconv = GetProteinConv();
1154 
1155  segindx = -1;
1156 
1157  for (imax = pp->indx, i = 0; i < imax; i++) {
 // Record which index entry is being processed.
1158  pp->curindx = i;
1159  ibp = pp->entrylist[i];
1160 
 // NOTE(review): presumably installs this entry's identifiers as the
 // context for subsequent error messages - confirm in add.cpp.
1161  err_install(ibp, pp->accver);
1162 
 // First member of a segmented set: remember where the set starts.
1163  if (ibp->segnum == 1)
1164  segindx = i;
1165 
 // Entries already flagged as dropped are skipped right away, but only
 // when they are not segmented (segnum == 0); members of a segmented
 // set keep going so the whole set can be rejected together below.
1166  if (ibp->drop && ibp->segnum == 0) {
1167  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1168  total_dropped++;
1169  continue;
1170  }
1171 
 // Load the text of the entry; failure aborts the whole run.
1172  entry = XMLLoadEntry(pp, false);
1173  if (! entry) {
1175  return false;
1176  }
1177 
1178  XMLGetDivision(entry, ibp);
1179 
 // A "TSA" division marks the record as TSA; warn when the index block
 // says TSA is not allowed for it.
1180  if (StringEqu(ibp->division, "TSA")) {
1181  if (ibp->tsa_allowed == false)
1182  ErrPostEx(SEV_WARNING, ERR_TSA_UnexpectedPrimaryAccession, "The record with accession \"%s\" is not expected to have a TSA division code.", ibp->acnum);
1183  ibp->is_tsa = true;
1184  }
1185 
 // This check may set ibp->drop, hence the second skip test.
1186  XMLCheckContigEverywhere(ibp, pp->source);
1187  if (ibp->drop && ibp->segnum == 0) {
1188  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1189  MemFree(entry);
1190  total_dropped++;
1191  continue;
1192  }
1193 
 // Build the working structures: an entry block holding the Seq-entry
 // and a data block that points at the entry block and the entry text.
1194  ebp = new EntryBlk();
1195 
1196  CRef<CBioseq> bioseq = CreateEntryBioseq(pp);
1197  ebp->seq_entry.Reset(new CSeq_entry);
1198  ebp->seq_entry->SetSeq(*bioseq);
1199  GetScope().AddBioseq(*bioseq);
1200 
 // NOTE(review): every exit path below does `delete dbp` but never
 // deletes `ebp` directly; presumably DataBlk releases mpData -
 // confirm in ftablock.h.
1201  dbp = new DataBlk();
1202  dbp->mpData = ebp;
1203  dbp->mOffset = entry;
1204  dbp->len = StringLen(entry);
1205 
 // Sequence data: the protein conversion table is used for protein
 // entries, the DNA table otherwise.
1206  if (! XMLGetInst(pp, dbp, ibp->is_prot ? protconv.get() : dnaconv.get(), *bioseq)) {
1207  ibp->drop = true;
1208  ErrPostStr(SEV_REJECT, ERR_SEQUENCE_BadData, "Bad sequence data. Entry dropped.");
1209  if (ibp->segnum == 0) {
1210  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1211  delete dbp;
1212  MemFree(entry);
1213  total_dropped++;
1214  continue;
1215  }
1216  }
1217 
1218  XMLFakeBioSources(ibp->xip, dbp->mOffset, *bioseq, pp->source);
1219  LoadFeat(pp, *dbp, *bioseq);
1220 
 // Skip a dropped, non-segmented entry that ended up with no annotations.
1221  if (! bioseq->IsSetAnnot() && ibp->drop && ibp->segnum == 0) {
1222  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1223  delete dbp;
1224  MemFree(entry);
1225  total_dropped++;
1226  continue;
1227  }
1228 
 // Descriptors; the skip test that follows handles a drop flag left set
 // after this step.
1229  XMLGetDescr(pp, dbp, *bioseq);
1230 
1231  if (ibp->drop && ibp->segnum == 0) {
1232  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1233  delete dbp;
1234  MemFree(entry);
1235  total_dropped++;
1236  continue;
1237  }
1238 
1239  fta_set_molinfo_completeness(*bioseq, ibp);
1240 
1241  if (ibp->is_tsa)
1242  fta_tsa_tls_comment_dblink_check(*bioseq, true);
1243 
1244  if (ibp->is_tls)
1245  fta_tsa_tls_comment_dblink_check(*bioseq, false);
1246 
 // Nucleotide sequences only: convert to a delta representation.
 // Raw sequences use the recorded gaps when there are any, otherwise
 // HTG codes 1/2/4 and DDBJ patent records go through SeqToDelta;
 // non-raw sequences with gaps go through AssemblyGapsToDelta.
1247  if (bioseq->GetInst().IsNa()) {
1248  if (bioseq->GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
1249  if (ibp->gaps)
1250  GapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1251  else if (ibp->htg == 4 || ibp->htg == 1 || ibp->htg == 2 ||
1252  (ibp->is_pat && pp->source == Parser::ESource::DDBJ))
1253  SeqToDelta(*bioseq, ibp->htg);
1254  } else if (ibp->gaps)
1255  AssemblyGapsToDelta(*bioseq, ibp->gaps, &ibp->drop);
1256  }
1257 
 // A missing date drops the entry unless running in debug, no-date or
 // xml_comp mode, or the source is USPTO.
1258  if (no_date(pp->format, bioseq->GetDescr().Get()) &&
1259  pp->debug == false && pp->no_date == false &&
1260  pp->xml_comp == false && pp->source != Parser::ESource::USPTO) {
1261  ibp->drop = true;
1262  ErrPostStr(SEV_ERROR, ERR_DATE_IllegalDate, "Illegal create date. Entry dropped.");
1263  if (ibp->segnum == 0) {
1264  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1265  delete dbp;
1266  MemFree(entry);
1267  total_dropped++;
1268  continue;
1269  }
1270  }
1271 
 // Quality scores: when none are attached yet and pp->accver is set,
 // fetch them from the first available source - the plain callback,
 // the parser-aware callback, or the quality-score file.
1272  if (dbp->mpQscore.empty() && pp->accver) {
1273  if (pp->ff_get_qscore)
1274  dbp->mpQscore = (*pp->ff_get_qscore)(ibp->acnum, ibp->vernum);
1275  else if (pp->ff_get_qscore_pp)
1276  dbp->mpQscore = (*pp->ff_get_qscore_pp)(ibp->acnum, ibp->vernum, pp);
1277  else if (pp->qsfd && ibp->qslength > 0)
1278  dbp->mpQscore = GetQSFromFile(pp->qsfd, ibp);
1279  }
1280 
 // A quality-score parse failure drops the entry unless pp->ign_bad_qs
 // is set, in which case it is only reported.
1281  if (! QscoreToSeqAnnot(dbp->mpQscore, *bioseq, ibp->acnum, ibp->vernum, false, true)) {
1282  if (pp->ign_bad_qs == false) {
1283  ibp->drop = true;
1284  ErrPostStr(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore. Entry dropped.");
1285  if (ibp->segnum == 0) {
1286  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1287  delete dbp;
1288  MemFree(entry);
1289  total_dropped++;
1290  continue;
1291  }
1292  } else {
1293  ErrPostStr(SEV_ERROR, ERR_QSCORE_FailedToParse, "Error while parsing QScore.");
1294  }
1295  }
1296 
1297  dbp->mpQscore.clear();
1298 
 // Move a pending patent id from the index block onto the Bioseq.
1299  if (ibp->psip.NotEmpty()) {
1300  CRef<CSeq_id> id(new CSeq_id);
1301  id->SetPatent(*ibp->psip);
1302  bioseq->SetId().push_back(id);
1303  ibp->psip.Reset();
1304  }
1305 
 // NOTE(review): the comment below appears to describe the patent-id
 // block above; the code that follows handles entries without references.
1306  /* add PatentSeqId if patent is found in reference
1307  */
 // Missing references are tolerated (reported only) for FlyBase,
 // RefSeq "NW_" accessions and WGS entries; any other entry is dropped.
1308  if (no_reference(*bioseq) && ! pp->debug) {
1309  if (pp->source == Parser::ESource::Flybase) {
1310  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for entry from FlyBase. Continue anyway.");
1311  } else if (pp->source == Parser::ESource::Refseq &&
1312  StringEquN(ibp->acnum, "NW_", 3)) {
1313  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for RefSeq's NW_ entry. Continue anyway.");
1314  } else if (ibp->is_wgs) {
1315  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references for WGS entry. Continue anyway.");
1316  } else {
1317  ibp->drop = true;
1318  ErrPostStr(SEV_ERROR, ERR_REFERENCE_No_references, "No references. Entry dropped.");
1319  if (ibp->segnum == 0) {
1320  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1321  delete dbp;
1322  MemFree(entry);
1323  total_dropped++;
1324  continue;
1325  }
1326  }
1327  }
1328 
 // segnum == segtotal: this is either a non-segmented entry or the last
 // member of a segmented set, so the collected Seq-entries are finished
 // here. Earlier members of a set take the else branch at the bottom.
1329  if (ibp->segnum == ibp->segtotal) {
1330  seq_entries.push_back(ebp->seq_entry);
1331  ebp->seq_entry.Reset();
1332 
1333  if (ibp->segnum < 2) {
1334  if (ibp->segnum != 0) {
1335  ErrPostStr(SEV_WARNING, ERR_SEGMENT_OnlyOneMember, "Segmented set contains only one member.");
1336  }
1337  segindx = i;
1338  } else {
1339  GetSeqExt(pp, locs);
1340  // LCOV_EXCL_START
1341  // Excluded per Mark's request on 12/14/2016
1342  BuildBioSegHeader(pp, seq_entries, locs);
1343  // LCOV_EXCL_STOP
1344  }
1345 
1346  /* reject the whole set if any one entry was rejected
1347  */
1348  if (ibp->segnum != 0) {
1349  div = pp->entrylist[segindx]->division;
 // Scan the members; the loop breaks at the first dropped one, so
 // j <= i afterwards means the set has to be rejected.
1350  for (j = segindx; j <= i; j++) {
1351  tibp = pp->entrylist[j];
1352  err_install(tibp, pp->accver);
1353  if (! StringEqu(div, tibp->division)) {
1354  ErrPostEx(SEV_WARNING, ERR_DIVISION_Mismatch, "Division different in segmented set: %s: %s", div, tibp->division);
1355  }
1356  if (tibp->drop) {
1357  ErrPostStr(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set");
1358  break;
1359  }
1360  }
1361  if (j <= i) {
 // Report every member of the rejected set as skipped.
1362  for (j = segindx; j <= i; j++) {
1363  tibp = pp->entrylist[j];
1364  err_install(tibp, pp->accver);
1365  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1366  total_dropped++;
1367  }
1368 
1369  seq_entries.clear();
1370 
1371  delete dbp;
1372  MemFree(entry);
1373  GetScope().ResetHistory();
1374  continue;
1375  }
1376  }
1377 
 // Gene processing; USPTO input goes through ProcNucProt instead.
1378  if (pp->source == Parser::ESource::USPTO) {
1379  GeneRefFeats gene_refs;
1380  gene_refs.valid = false;
1381  ProcNucProt(pp, seq_entries, gene_refs);
1382  } else
1383  DealWithGenes(seq_entries, pp);
1384 
 // An empty list after the step above means the entry (or the whole
 // segmented set) was rejected.
1385  if (seq_entries.empty()) {
1386  if (ibp->segnum != 0) {
1387  ErrPostStr(SEV_WARNING, ERR_SEGMENT_Rejected, "Reject the whole segmented set.");
1388  for (j = segindx; j <= i; j++) {
1389  tibp = pp->entrylist[j];
1390  err_install(tibp, pp->accver);
1391  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", tibp->locusname, tibp->acnum);
1392  total_dropped++;
1393  }
1394  } else {
1395  ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped, "Entry skipped: \"%s|%s\".", ibp->locusname, ibp->acnum);
1396  total_dropped++;
1397  }
1398  delete dbp;
1399  MemFree(entry);
1400  GetScope().ResetHistory();
1401  continue;
1402  }
1403 
1404  /* remove out all the features if their seqloc has
1405  * "join" or "order" among other segments, to the annot
1406  * which in class = parts
1407  */
1408  if (ibp->segnum != 0)
1409  // LCOV_EXCL_START
1410  // Excluded per Mark's request on 12/14/2016
1411  CheckFeatSeqLoc(seq_entries);
1412  // LCOV_EXCL_STOP
1413 
1414  fta_find_pub_explore(pp, seq_entries);
1415 
1416  /* change qual "citation' on features to SeqFeat.cit
1417  * find citation in the list by serial_number.
1418  * If serial number not found remove /citation
1419  */
1420  ProcessCitations(seq_entries);
1421 
1422  /* check for long sequences in each segment
1423  */
 // pp->limit == 0 disables the length check. Over-long sequences with
 // HTG codes 1/2/4 only get a warning; any other over-long sequence
 // sets seq_long, which keeps the result out of pp->entries below.
1424  if (pp->limit != 0) {
1425  if (ibp->segnum != 0) {
1426  for (j = segindx; j <= i; j++) {
1427  tibp = pp->entrylist[j];
1428  err_install(tibp, pp->accver);
1429  if (tibp->bases <= (size_t)pp->limit)
1430  continue;
1431 
1432  if (tibp->htg == 1 || tibp->htg == 2 || tibp->htg == 4) {
1433  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", tibp->locusname, tibp->acnum, pp->limit);
1434  } else {
1435  seq_long = true;
1436  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", tibp->locusname, tibp->acnum, pp->limit);
1437  }
1438  }
1439  } else if (ibp->bases > (size_t)pp->limit) {
1440  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4) {
1441  ErrPostEx(SEV_WARNING, ERR_ENTRY_LongHTGSSequence, "HTGS Phase 0/1/2 sequence %s|%s exceeds length limit %ld: entry has been processed regardless of this problem", ibp->locusname, ibp->acnum, pp->limit);
1442  } else {
1443  seq_long = true;
1444  ErrPostEx(SEV_REJECT, ERR_ENTRY_LongSequence, "Sequence %s|%s is longer than limit %ld", ibp->locusname, ibp->acnum, pp->limit);
1445  }
1446  }
1447  }
1448 
 // Optional clean-up pass, only when conversion is requested; in QA
 // mode the user object added by the clean-up is removed from the
 // first Seq-entry.
1449  if (pp->convert) {
1450  if (pp->cleanup <= 1) {
1451  FinalCleanup(seq_entries);
1452 
1453  if (pp->qamode && ! seq_entries.empty())
1454  fta_remove_cleanup_user_object(*(*seq_entries.begin()));
1455  }
1456 
1457  MaybeCutGbblockSource(seq_entries);
1458  }
1459 
1460  EntryCheckDivCode(seq_entries, pp);
1461 
1462  if (pp->xml_comp)
1463  fta_set_strandedness(seq_entries);
1464 
1465  if (fta_EntryCheckGBBlock(seq_entries)) {
1466  ErrPostStr(SEV_WARNING, ERR_ENTRY_GBBlock_not_Empty, "Attention: GBBlock is not empty");
1467  }
1468 
1469  /* check for identical features
1470  */
1471  if (pp->qamode) {
1472  fta_sort_descr(seq_entries);
1473  fta_sort_seqfeat_cit(seq_entries);
1474  }
1475 
1476  if (pp->citat) {
1477  StripSerialNumbers(seq_entries);
1478  }
1479 
1480  PackEntries(seq_entries);
1481  CheckDupDates(seq_entries);
1482 
1483  if (ibp->segnum != 0)
1484  for (j = segindx; j <= i; j++)
1485  err_install(pp->entrylist[j], pp->accver);
1486 
 // Over-long results are only counted; everything else is moved into
 // pp->entries. A segmented set counts once per member.
1487  if (seq_long) {
1488  seq_long = false;
1489  if (ibp->segnum != 0)
1490  total_long += (i - segindx + 1);
1491  else
1492  total_long++;
1493  } else {
1494  pp->entries.splice(pp->entries.end(), seq_entries);
1495 
1496  if (ibp->segnum != 0)
1497  total += (i - segindx + 1);
1498  else
1499  total++;
1500  }
1501 
 // NOTE(review): the "parsed successfully" message is also posted for
 // over-long entries that were not added to pp->entries above - confirm
 // this is intended.
1502  if (ibp->segnum != 0) {
1503  for (j = segindx; j <= i; j++) {
1504  tibp = pp->entrylist[j];
1505  err_install(tibp, pp->accver);
1506  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", tibp->locusname, tibp->acnum);
1507  }
1508  } else {
1509  ErrPostEx(SEV_INFO, ERR_ENTRY_Parsed, "OK - entry parsed successfully: \"%s|%s\".", ibp->locusname, ibp->acnum);
1510  }
1511 
1512  seq_entries.clear();
1513  } else {
 // Not the last member of a segmented set yet: keep its Seq-entry in
 // seq_entries until the set is complete.
1514  GetSeqExt(pp, locs);
1515 
1516  seq_entries.push_back(ebp->seq_entry);
1517  ebp->seq_entry.Reset();
1518  }
1519 
 // Per-entry clean-up on the normal path.
1520  delete dbp;
1521  MemFree(entry);
1522  GetScope().ResetHistory();
1523 
1524  } /* for, ascii block entries */
1525 
1527 
 // Final summary: accepted entries, over-long ones, skipped ones.
1528  ErrPostEx(SEV_INFO, ERR_ENTRY_ParsingComplete, "COMPLETED : SUCCEEDED = %d (including: LONG ones = %d); SKIPPED = %d.", total, total_long, total_dropped);
1529 
1530  return true;
1531 }
1532 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:220
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:500
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:907
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2759
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:787
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:334
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1112
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2662
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2784
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2714
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2681
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2825
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:382
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:297
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
Definition: add.cpp:190
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2546
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, const char *location, const char *name, bool iscon)
Definition: add.cpp:2293
void StripSerialNumbers(TEntryList &seq_entries)
Definition: asci_blk.cpp:3402
void fta_fix_orgref_div(const CBioseq::TAnnot &annots, COrg_ref *org_ref, CGB_block &gbb)
Definition: asci_blk.cpp:3266
void fta_sort_seqfeat_cit(TEntryList &seq_entries)
Definition: asci_blk.cpp:3238
void PackEntries(TEntryList &seq_entries)
Definition: asci_blk.cpp:3502
void fta_set_strandedness(TEntryList &seq_entries)
Definition: asci_blk.cpp:3337
void CheckHTGDivision(const char *div, CMolInfo::TTech tech)
Definition: asci_blk.cpp:2942
unique_ptr< unsigned char[]> GetDNAConv(void)
Definition: asci_blk.cpp:1786
bool XMLCheckCDS(const char *entry, XmlIndexPtr xip)
Definition: asci_blk.cpp:3306
unique_ptr< unsigned char[]> GetProteinConv(void)
Definition: asci_blk.cpp:1814
void EntryCheckDivCode(TEntryList &seq_entries, ParserPtr pp)
Definition: asci_blk.cpp:2806
void GetSeqExt(ParserPtr pp, CSeq_loc &seq_loc)
Definition: asci_blk.cpp:2469
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1674
bool fta_EntryCheckGBBlock(TEntryList &seq_entries)
Definition: asci_blk.cpp:3109
void fta_sort_descr(TEntryList &seq_entries)
Definition: asci_blk.cpp:3181
void XMLDefVsHTGKeywords(CMolInfo::TTech tech, const char *entry, XmlIndexPtr xip, bool cancelled)
Definition: asci_blk.cpp:2885
void BuildBioSegHeader(ParserPtr pp, TEntryList &entries, const CSeq_loc &seqloc)
Definition: asci_blk.cpp:2494
void GetExtraAccession(IndexblkPtr ibp, bool allow_uwsec, Parser::ESource source, TAccessionList &accessions)
Definition: asci_blk.cpp:1317
bool check_div(bool pat_acc, bool pat_ref, bool est_kwd, bool sts_kwd, bool gss_kwd, bool if_cds, string &div, CMolInfo::TTech *tech, size_t bases, Parser::ESource source, bool &drop)
Definition: asci_blk.cpp:2565
CRef< CBioseq > CreateEntryBioseq(ParserPtr pp)
Definition: asci_blk.cpp:1074
list< string > TStringList
Definition: cgictx.cpp:719
void ProcessCitations(TEntryList &seq_entries)
Definition: citation.cpp:307
CDate::ECompare Compare(const CDate_std &date) const
Indicate how *this relates to another date.
Definition: Date_std.cpp:91
void SetToTime(const CTime &time, CDate::EPrecision prec=CDate::ePrecision_second)
Definition: Date_std.cpp:59
@ eCompare_after
*this comes second.
Definition: Date.hpp:76
Definition: Seq_entry.hpp:56
static bool IsNa(EMol mol)
Definition: Seq_inst.hpp:90
CTime –.
Definition: ncbitime.hpp:296
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
string mpQscore
Definition: ftablock.h:331
void SimpleDelete()
Definition: ftablock.h:320
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
void fta_build_ena_user_object(CSeq_descr::Tdata &descrs, TStringList &dr_ena, TStringList &dr_biosample, CRef< CUser_object > &dbuop)
Definition: em_ascii.cpp:1505
CRef< CEMBL_block > XMLGetEMBLBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, string &gbdiv, CBioSource *bio_src, TStringList &dr_ena, TStringList &dr_biosample)
Definition: em_ascii.cpp:2409
void FinalCleanup(TEntryList &seq_entries)
Definition: fcleanup.cpp:377
#define ERR_SEQUENCE_BadData
Definition: flat2err.h:150
#define ERR_TPA_TpaSpansMissing
Definition: flat2err.h:593
#define ERR_ENTRY_LongSequence
Definition: flat2err.h:82
#define ERR_FORMAT_MissingContigFeature
Definition: flat2err.h:43
#define ERR_KEYWORD_ShouldNotBeTPA
Definition: flat2err.h:208
#define ERR_DIVISION_BadTSADivcode
Definition: flat2err.h:261
#define ERR_FORMAT_MissingSequenceData
Definition: flat2err.h:41
#define ERR_DIVISION_InvalidHTCKeyword
Definition: flat2err.h:254
#define ERR_KEYWORD_IllegalForCON
Definition: flat2err.h:210
#define ERR_DIVISION_MissingHTGKeywords
Definition: flat2err.h:249
#define ERR_QSCORE_FailedToParse
Definition: flat2err.h:577
#define ERR_ENTRY_LongHTGSSequence
Definition: flat2err.h:86
#define ERR_KEYWORD_MissingTSA
Definition: flat2err.h:216
#define ERR_DIVISION_BadTPADivcode
Definition: flat2err.h:257
#define ERR_REFERENCE_No_references
Definition: flat2err.h:289
#define ERR_KEYWORD_ShouldNotBeTLS
Definition: flat2err.h:218
#define ERR_ENTRY_GBBlock_not_Empty
Definition: flat2err.h:85
#define ERR_KEYWORD_HTGPlusENV
Definition: flat2err.h:217
#define ERR_DEFINITION_MissingTPA
Definition: flat2err.h:269
#define ERR_ENTRY_Skipped
Definition: flat2err.h:80
#define ERR_DEFINITION_MissingTLS
Definition: flat2err.h:273
#define ERR_KEYWORD_ESTSubstring
Definition: flat2err.h:204
#define ERR_KEYWORD_ConflictingKeywords
Definition: flat2err.h:207
#define ERR_DIVISION_ConDivLacksContig
Definition: flat2err.h:252
#define ERR_LOCATION_ContigHasNull
Definition: flat2err.h:397
#define ERR_SEGMENT_OnlyOneMember
Definition: flat2err.h:165
#define ERR_KEYWORD_ENV_NoMatchingQualifier
Definition: flat2err.h:214
#define ERR_KEYWORD_ShouldNotBeTSA
Definition: flat2err.h:215
#define ERR_KEYWORD_STSSubstring
Definition: flat2err.h:205
#define ERR_DIVISION_UnknownDivCode
Definition: flat2err.h:222
#define ERR_KEYWORD_MissingTLS
Definition: flat2err.h:219
#define ERR_DEFINITION_ShouldNotBeTSA
Definition: flat2err.h:270
#define ERR_SEGMENT_Rejected
Definition: flat2err.h:166
#define ERR_DIVISION_MissingHTCKeyword
Definition: flat2err.h:253
#define ERR_DIVISION_MappedtoCON
Definition: flat2err.h:248
#define ERR_DIVISION_MappedtoEST
Definition: flat2err.h:223
#define ERR_FORMAT_ContigWithSequenceData
Definition: flat2err.h:42
#define ERR_KEYWORD_NoGeneExpressionKeywords
Definition: flat2err.h:213
#define ERR_DEFINITION_MissingTSA
Definition: flat2err.h:271
#define ERR_DEFINITION_ShouldNotBeTPA
Definition: flat2err.h:268
#define ERR_KEYWORD_MissingTPA
Definition: flat2err.h:209
#define ERR_DIVISION_ConDivInSegset
Definition: flat2err.h:251
#define ERR_ENTRY_ParsingComplete
Definition: flat2err.h:79
#define ERR_DIVISION_Mismatch
Definition: flat2err.h:226
#define ERR_ORGANISM_NoOrganism
Definition: flat2err.h:184
#define ERR_DATE_IllegalDate
Definition: flat2err.h:102
#define ERR_ENTRY_Parsed
Definition: flat2err.h:83
#define ERR_DIVISION_HTCWrongMolType
Definition: flat2err.h:255
#define ERR_KEYWORD_ShouldNotBeCAGE
Definition: flat2err.h:211
#define ERR_DEFINITION_ShouldNotBeTLS
Definition: flat2err.h:272
#define ERR_TSA_UnexpectedPrimaryAccession
Definition: flat2err.h:609
list< CRef< objects::CSeq_entry > > TEntryList
bool QscoreToSeqAnnot(const string &qscore, CBioseq &bioseq, char *acc, Int2 ver, bool check_minmax, bool allow_na)
#define INSDSEQ_TOPOLOGY
Definition: fta_xml.h:46
#define INSDSEQ_MOLTYPE
Definition: fta_xml.h:45
unique_ptr< string > XMLGetTagValue(const char *entry, const XmlIndex *xip)
Definition: xm_index.cpp:202
#define INSDSEQ_DEFINITION
Definition: fta_xml.h:52
DataBlkPtr XMLBuildRefDataBlk(char *entry, const XmlIndex *xip, int type)
Definition: xm_index.cpp:1487
#define INSDSEQ_STRANDEDNESS
Definition: fta_xml.h:44
#define INSDSEQ_COMMENT
Definition: fta_xml.h:64
#define INSDSEQ_ORGANISM
Definition: fta_xml.h:61
void XMLGetKeywords(const char *entry, const XmlIndex *xip, TKeywordList &keywords)
Definition: xm_index.cpp:1520
char * XMLLoadEntry(ParserPtr pp, bool err)
Definition: xm_index.cpp:965
#define INSDSEQ_KEYWORDS
Definition: fta_xml.h:58
#define INSDSEQ_TAXONOMY
Definition: fta_xml.h:62
#define INSDSEQ_CREATE_DATE
Definition: fta_xml.h:49
#define INSDSEQ_DIVISION
Definition: fta_xml.h:47
#define INSDSEQ_UPDATE_DATE
Definition: fta_xml.h:48
#define INSDSEQ_SOURCE
Definition: fta_xml.h:60
unique_ptr< string > XMLConcatSubTags(const char *entry, const XmlIndex *xip, Int4 tag, Char sep)
Definition: xm_index.cpp:1546
#define INSDSEQ_CONTIG
Definition: fta_xml.h:70
#define INSDSEQ_PRIMARY
Definition: fta_xml.h:65
unique_ptr< string > XMLFindTagValue(const char *entry, const XmlIndex *xip, Int4 tag)
Definition: xm_index.cpp:214
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:62
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringCpy(char *d, const char *s)
Definition: ftacpp.hpp:89
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
void MemCpy(void *p, const void *q, size_t sz)
Definition: ftacpp.hpp:50
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:93
void FtaDeletePrefix(int prefix)
Definition: ftaerr.cpp:346
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
void fta_find_pub_explore(ParserPtr pp, TEntryList &seq_entries)
Definition: ftanet.cpp:753
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
void CheckFeatSeqLoc(TEntryList &seq_entries)
Definition: gb_ascii.cpp:2375
void DealWithGenes(TEntryList &seq_entries, ParserPtr pp)
Definition: genref.cpp:2957
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
const TSubtype & GetSubtype(void) const
Get the Subtype member data.
Definition: BioSource_.hpp:539
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
TOrigin GetOrigin(void) const
Get the Origin member data.
Definition: BioSource_.hpp:472
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
Definition: BioSource_.hpp:527
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetOrigin(void) const
Check if a value has been assigned to Origin data member.
Definition: BioSource_.hpp:447
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
@ eSubtype_environmental_sample
Definition: SubSource_.hpp:111
TStd & SetStd(void)
Select the variant.
Definition: Date_.cpp:115
const TDiv & GetDiv(void) const
Get the Div member data.
Definition: OrgName_.hpp:1005
void SetCommon(const TCommon &value)
Assign a value to Common data member.
Definition: Org_ref_.hpp:428
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetDiv(void) const
GenBank division code Check if a value has been assigned to Div data member.
Definition: OrgName_.hpp:993
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSeq_code_type_iupacaa
IUPAC 1 letter amino acid code.
@ eSeq_code_type_iupacna
IUPAC 1 letter nuc acid code.
bool IsMix(void) const
Check if variant Mix is selected.
Definition: Seq_loc_.hpp:552
const TMix & GetMix(void) const
Get the variant data.
Definition: Seq_loc_.cpp:282
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
TPub & SetPub(void)
Select the variant.
Definition: Seqdesc_.cpp:362
bool IsSetAnnot(void) const
Check if a value has been assigned to Annot data member.
Definition: Bioseq_.hpp:354
TGenbank & SetGenbank(void)
Select the variant.
Definition: Seqdesc_.cpp:340
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void ResetTech(void)
Reset Tech data member.
Definition: MolInfo_.hpp:484
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
void SetTopology(TTopology value)
Assign a value to Topology data member.
Definition: Seq_inst_.hpp:739
ETopology
topology of molecule
Definition: Seq_inst_.hpp:121
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
TEmbl & SetEmbl(void)
Select the variant.
Definition: Seqdesc_.cpp:456
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
EStrand
strandedness in living organism
Definition: Seq_inst_.hpp:133
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_inst_.hpp:786
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
TCreate_date & SetCreate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:478
TUpdate_date & SetUpdate_date(void)
Select the variant.
Definition: Seqdesc_.cpp:500
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_htc
high throughput cDNA
Definition: MolInfo_.hpp:142
@ eTech_targeted
targeted locus sets/studies
Definition: MolInfo_.hpp:147
@ eTech_sts
Sequence Tagged Site.
Definition: MolInfo_.hpp:126
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_survey
one-pass genomic sequence
Definition: MolInfo_.hpp:127
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
@ eTech_fli_cdna
full length insert cDNA
Definition: MolInfo_.hpp:140
@ eTech_est
Expressed Sequence Tag.
Definition: MolInfo_.hpp:125
@ ParFlat_REF_NO_TARGET
Definition: index.h:63
@ ParFlat_REF_END
Definition: index.h:60
CRef< CDate_std > GetUpdateDate(const char *ptr, Parser::ESource source)
Definition: indx_blk.cpp:610
Int2 XMLCheckSTRAND(const char *str)
Definition: indx_blk.cpp:484
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:992
Int2 CheckDIV(const char *str)
Definition: indx_blk.cpp:531
Int2 XMLCheckTPG(const char *str)
Definition: indx_blk.cpp:490
int i
void GetFlatBiomol(CMolInfo::TBiomol &biomol, CMolInfo::TTech tech, char *molstr, ParserPtr pp, const DataBlk &entry, const COrg_ref *org_ref)
Definition: loadfeat.cpp:5032
void LoadFeat(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq)
Definition: loadfeat.cpp:4724
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
std::list< SeqLoc > TSeqLocList
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void CheckDupDates(TEntryList &seq_entries)
Definition: nucprot.cpp:2661
void ProcNucProt(ParserPtr pp, TEntryList &seq_entries, GeneRefFeats &gene_refs)
Definition: nucprot.cpp:2507
CRef< CPubdesc > DescrRefs(ParserPtr pp, DataBlkPtr dbp, Uint2 col_data)
Definition: ref.cpp:2426
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
CRef< objects::CSeq_entry > seq_entry
Definition: ftablock.h:343
bool valid
Definition: nucprot.h:64
Char acnum[200]
Definition: ftablock.h:166
CRef< objects::CPatent_seq_id > psip
Definition: ftablock.h:190
Char division[4]
Definition: ftablock.h:171
bool is_mga
Definition: ftablock.h:199
bool tsa_allowed
Definition: ftablock.h:211
Int2 htg
Definition: ftablock.h:196
bool is_tls
Definition: ftablock.h:208
Int2 vernum
Definition: ftablock.h:167
bool is_tpa
Definition: ftablock.h:206
TKeywordList keywords
Definition: ftablock.h:240
bool is_prot
Definition: ftablock.h:222
bool is_wgs
Definition: ftablock.h:205
bool origin
Definition: ftablock.h:201
bool is_contig
Definition: ftablock.h:197
bool STS
Definition: ftablock.h:193
bool is_pat
Definition: ftablock.h:202
bool HTC
Definition: ftablock.h:195
bool drop
Definition: ftablock.h:182
bool experimental
Definition: ftablock.h:247
size_t bases
Definition: ftablock.h:172
bool inferential
Definition: ftablock.h:245
Uint2 segtotal
Definition: ftablock.h:175
bool is_tsa
Definition: ftablock.h:207
bool EST
Definition: ftablock.h:192
GapFeatsPtr gaps
Definition: ftablock.h:214
string wgssec
Definition: ftablock.h:236
bool specialist_db
Definition: ftablock.h:243
Uint2 segnum
Definition: ftablock.h:173
Char locusname[200]
Definition: ftablock.h:170
bool env_sample_qual
Definition: ftablock.h:219
XmlIndexPtr xip
Definition: ftablock.h:217
size_t qslength
Definition: ftablock.h:230
bool GSS
Definition: ftablock.h:194
char *(* ff_get_qscore_pp)(const char *accession, Int2 v, Parser *pp)
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
optional< string > buf
char *(* ff_get_qscore)(const char *accession, Int2 v)
TEntryList entries
XmlIndex * next
Definition: ftablock.h:158
Int4 tag
Definition: ftablock.h:150
CScope & GetScope()
bool GetGenomeInfo(CBioSource &bsp, string_view bptr)
Definition: utilfeat.cpp:225
void MaybeCutGbblockSource(TEntryList &seq_entries)
Definition: utilfeat.cpp:435
bool HasHtg(const TKeywordList &keywords)
Definition: utilfun.cpp:1574
bool HasHtc(const TKeywordList &keywords)
Definition: utilfun.cpp:1603
bool fta_tls_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1136
void RemoveHtgPhase(TKeywordList &keywords)
Definition: utilfun.cpp:1588
void fta_remove_tsa_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1277
void fta_remove_tpa_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1263
void fta_remove_keywords(CMolInfo::TTech tech, TKeywordList &kwds)
Definition: utilfun.cpp:1232
void fta_remove_tls_keywords(TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1292
void fta_keywords_check(const char *str, bool *estk, bool *stsk, bool *gssk, bool *htck, bool *flik, bool *wgsk, bool *tpak, bool *envk, bool *mgak, bool *tsak, bool *tlsk)
Definition: utilfun.cpp:1195
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1496
bool IsCancelled(const TKeywordList &keywords)
Definition: utilfun.cpp:1563
bool fta_tsa_keywords_check(const TKeywordList &kwds, Parser::ESource source)
Definition: utilfun.cpp:1102
void fta_remove_env_keywords(TKeywordList &kwds)
Definition: utilfun.cpp:1307
bool fta_tpa_keywords_check(const TKeywordList &kwds)
Definition: utilfun.cpp:1020
bool fta_check_mga_keywords(CMolInfo &mol_info, const TKeywordList &kwds)
Definition: utilfun.cpp:1447
CRef< CSeq_loc > xgbparseint_ver(string_view raw_intervals, bool &keep_rawPt, int &numErrors, const TSeqIdList &seq_ids, bool accver)
Definition: xgbparint.cpp:1465
USING_SCOPE(objects)
bool XMLAscii(ParserPtr pp)
Definition: xm_ascii.cpp:1128
bool XMLGetInst(ParserPtr pp, DataBlkPtr dbp, unsigned char *dnaconv, CBioseq &bioseq)
Definition: xm_ascii.cpp:187
static bool XMLGetInstContig(XmlIndexPtr xip, DataBlkPtr dbp, CBioseq &bioseq, ParserPtr pp)
Definition: xm_ascii.cpp:126
static void XMLGetDescr(ParserPtr pp, DataBlkPtr entry, CBioseq &bioseq)
Definition: xm_ascii.cpp:784
static CRef< CGB_block > XMLGetGBBlock(ParserPtr pp, const char *entry, CMolInfo &mol_info, CBioSource *bio_src)
Definition: xm_ascii.cpp:238
static CRef< CMolInfo > XMLGetMolInfo(ParserPtr pp, DataBlkPtr entry, COrg_ref *org_ref)
Definition: xm_ascii.cpp:608
static void XMLCheckContigEverywhere(IndexblkPtr ibp, Parser::ESource source)
Definition: xm_ascii.cpp:92
static void XMLGetDivision(const char *entry, IndexblkPtr ibp)
Definition: xm_ascii.cpp:1112
static void XMLFakeBioSources(XmlIndexPtr xip, const char *entry, CBioseq &bioseq, Parser::ESource source)
Definition: xm_ascii.cpp:654
static void XMLGetDescrComment(char *offset)
Definition: xm_ascii.cpp:721
void XGappedSeqLocsToDeltaSeqs(const TSeqLocList &locs, TDeltaList &deltas)
Definition: xutils.cpp:91
Modified on Wed Sep 04 15:01:07 2024 by modify_doxy.py rev. 669887