NCBI C++ ToolKit
add.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: add.cpp 102982 2024-08-15 12:44:06Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: add.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Additional parser functions.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 #include <objects/seq/Seq_gap.hpp>
44 #include <objects/seq/MolInfo.hpp>
45 #include <objects/seq/Seq_inst.hpp>
46 #include <objects/seq/Seq_ext.hpp>
47 #include <objects/seq/Seq_hist.hpp>
63 
64 #include "index.h"
65 #include "genbank.h" /* for ParFlat_FEATURES */
66 #include "embl.h" /* for ParFlat_FH */
67 
69 #include "ftanet.h"
70 
71 #include "ftaerr.hpp"
72 #include "indx_blk.h"
73 #include "asci_blk.h"
74 #include "utilfun.h"
75 #include "add.h"
76 
77 #ifdef THIS_FILE
78 # undef THIS_FILE
79 #endif
80 #define THIS_FILE "add.cpp"
81 
82 #define HTG_GAP 100
83 #define SHORT_GAP 20
84 
87 
/* Bookkeeping record for the Seq-ids met in a Seq-loc.  Every pointer member
 * starts out null and every Int4 member starts at zero (presumably one
 * counter per source database plus a grand total - confirm against the code
 * that fills them in).
 * NOTE(review): this listing skips source lines 93 and 96, so any members
 * declared there are not shown here. */
88 struct SeqLocIds {
89  CSeq_loc* badslp = nullptr;
90  const Char* wgsacc = nullptr;
91  const Char* wgscont = nullptr;
92  const Char* wgsscaf = nullptr;
94  Int4 embl = 0;
95  Int4 pir = 0;
97  Int4 other = 0;
98  Int4 ddbj = 0;
99  Int4 prf = 0;
100  Int4 tpg = 0;
101  Int4 tpe = 0;
102  Int4 tpd = 0;
103  Int4 total = 0;
104 };
106 
/* One node of a singly linked list of TPA/TSA spans.
 * from1..to1 and from2..to2 are the two spans compared by
 * fta_check_tpa_tsa_coverage() (its messages call them the span of this
 * record and the span of the contributing primary sequence or trace record).
 * "accession" is a heap string: it is filled with StringSave() by the parser
 * and released with MemFree() when the list is freed.
 * NOTE(review): this listing skips source lines 111, 114 and 115, so members
 * declared there (the parser uses "version" and "sicho") are not shown. */
107 struct FTATpaBlock {
108  Int4 from1 = 0;
109  Int4 to1 = 0;
110  char* accession = nullptr;
112  Int4 from2 = 0;
113  Int4 to2 = 0;
116  FTATpaBlock* next = nullptr;
117 };
119 
120 struct FTATpaSpan {
121  Int4 from = 0;
122  Int4 to = 0;
123  FTATpaSpan* next = nullptr;
124 };
126 
127 /**********************************************************/
/* Frees a whole linked list of blocks starting at "ftbp": for every node the
 * accession string (if any) is released with MemFree() and the node itself
 * with delete.
 * NOTE(review): this listing skips source lines 128 and 130 - presumably the
 * function header and the declaration of "next". */
129 {
131
132  for (; ftbp; ftbp = next) {
     // remember the successor before the current node is destroyed
133  next = ftbp->next;
134  if (ftbp->accession)
135  MemFree(ftbp->accession);
136  delete ftbp;
137  }
138 }
139 
/**********************************************************
 *
 *   string tata_save(t):
 *
 *      Returns a cleaned-up copy of "t":
 *        - leading whitespace and commas are removed;
 *        - every newline becomes a single space and the
 *          whitespace that follows it is dropped;
 *        - trailing spaces, tabs, semicolons, commas and
 *          double quotes are removed.
 *      An empty (or fully stripped) input yields an empty
 *   string.
 *
 **********************************************************/
std::string tata_save(std::string_view t)
{
    // <cctype> classifiers are only defined for unsigned char values (and
    // EOF); passing a negative plain char is undefined behaviour.
    const auto is_space = [](char c) {
        return isspace(static_cast<unsigned char>(c)) != 0;
    };

    // strip from beginning
    size_t pos = 0;
    while (pos < t.size() && (is_space(t[pos]) || t[pos] == ','))
        ++pos;

    // copy the rest in one pass, joining lines: a newline is written as one
    // space and the whitespace at the start of the next line is skipped
    std::string str;
    str.reserve(t.size() - pos);
    while (pos < t.size()) {
        const char c = t[pos++];
        if (c != '\n') {
            str.push_back(c);
            continue;
        }
        str.push_back(' ');
        while (pos < t.size() && is_space(t[pos]))
            ++pos;
    }

    // strip from end
    while (! str.empty()) {
        const char c = str.back();
        if (c != ' ' && c != ';' && c != ',' && c != '\"' && c != '\t')
            break;
        str.pop_back();
    }

    return str;
}
188 
189 /**********************************************************/
/* Scans the descriptors in "descrs" for a create-date and an update-date and
 * reports whether dates are missing.
 * NOTE(review): this listing skips source lines 190 and 205 - presumably the
 * function header and the condition that guards the first return below.  As
 * shown, the second return is unreachable. */
191 {
192  bool no_create = true;
193  bool no_update = true;
194
195  for (const auto& desc : descrs) {
196  if (desc->IsCreate_date())
197  no_create = false;
198  else if (desc->IsUpdate_date())
199  no_update = false;
200
     // both kinds of date have been seen: nothing more to learn
201  if (no_create == false && no_update == false)
202  break;
203  }
204
206  return (no_update);
207
208  return (no_create || no_update);
209 }
210 
211 /**********************************************************
212  *
213  * bool no_reference(bsp):
214  *
215  * Search for at least one reference in bioseq->desr
216  * or in bioseq->annot.
217  * If no reference return TRUE.
218  *
219  **********************************************************/
220 bool no_reference(const CBioseq& bioseq)
221 {
222  for (const auto& desc : bioseq.GetDescr().Get()) {
223  if (desc->IsPub())
224  return false;
225  }
226 
227  for (const auto& annot : bioseq.GetAnnot()) {
228  if (! annot->IsFtable())
229  continue;
230 
231  for (const auto& feat : annot->GetData().GetFtable()) {
232  if (feat->IsSetData() && feat->GetData().IsPub())
233  return false;
234  }
235 
236  for (const auto& feat : annot->GetData().GetFtable()) {
237  if (! feat->IsSetData() || ! feat->GetData().IsImp())
238  continue;
239 
240  const CImp_feat& imp = feat->GetData().GetImp();
241  if (imp.GetKey() == "Site-ref") {
242  ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference, "The entry has only 'sites' references");
243  return false;
244  }
245  }
246  }
247 
248  return true;
249 }
250 
251 /**********************************************************
252  *
253  * bool check_cds(entry, format):
254  *
255  * Returns TRUE if CDS is in the entry.
256  *
257  **********************************************************/
/* Body of check_cds(): searches the feature section of "entry" for a CDS
 * feature key and returns true if one is found.  Only the EMBL and GenBank
 * formats are handled; any other format returns false.
 * NOTE(review): this listing skips source lines 258 and 269 - presumably the
 * function header and the assignment of "type" for GenBank. */
259 {
260  DataBlkPtr temp;
261  DataBlkPtr dbp;
262  const char* str;
263  Int2 type;
264
     // choose the block type and the text marker to look for
265  if (format == Parser::EFormat::EMBL) {
266  type = ParFlat_FH;
267  str = "\nFT CDS ";
268  } else if (format == Parser::EFormat::GenBank) {
270  str = "\n CDS ";
271  } else
272  return false;
273
274  for (temp = TrackNodeType(entry, type); temp; temp = temp->mpNext) {
275  if (temp->mType != type)
276  continue;
277
     // total length of all sub-blocks hanging off this node
278  size_t len = 0;
279  for (dbp = static_cast<DataBlk*>(temp->mpData); dbp; dbp = dbp->mpNext)
280  len += dbp->len;
281  if (len == 0)
282  continue;
283
     // search from the start of the first sub-block across the summed length
     // (assumes the sub-blocks are contiguous in memory - TODO confirm)
284  dbp = static_cast<DataBlk*>(temp->mpData);
285  char* p = SrchTheStr(dbp->mOffset, dbp->mOffset + len, str);
286
287  if (p)
288  break;
289  }
290
     // temp is non-null only if the loop was left through the break above
291  if (! temp)
292  return false;
293  return true;
294 }
295 
296 /**********************************************************/
/* Installs the accession prefix used for error messages about the entry
 * described by "ibp": the accession number, followed by ".version" when
 * "accver" is set and the version is positive.  If that text is empty the
 * locus name is used instead.
 * NOTE(review): this listing skips source line 301, so one statement of the
 * original body is not shown. */
297 void err_install(const Indexblk* ibp, bool accver)
298 {
299  string temp;
300
302  temp = ibp->acnum;
303  if (accver && ibp->vernum > 0) {
304  temp += '.';
305  temp += to_string(ibp->vernum);
306  }
     // no accession available: fall back to the locus name
307  if (temp.empty())
308  temp = ibp->locusname;
309  FtaInstallPrefix(PREFIX_ACCESSION, temp.c_str());
310 }
311 
312 /**********************************************************/
/* Turns the Seq-data of "seq_lit" into a Seq-gap described by the gap feature
 * "gfp": sets the gap type and, if present, the linkage evidence.  Does
 * nothing when "gfp" is null.
 * NOTE(review): this listing skips source lines 324 and 328, so the
 * statements around/inside the gap_type test are not shown (the inner "if"
 * appears empty here). */
313 static void CreateSeqGap(CSeq_literal& seq_lit, GapFeatsPtr gfp)
314 {
315  if (! gfp)
316  return;
317
318  CSeq_gap& sgap = seq_lit.SetSeq_data().SetGap();
319  sgap.SetType(gfp->asn_gap_type);
320
     // swap() hands the evidence list over to the gap; afterwards gfp holds
     // whatever list the gap had before
321  if (! gfp->asn_linkage_evidence.empty())
322  sgap.SetLinkage_evidence().swap(gfp->asn_linkage_evidence);
323
325  if (! gfp->gap_type.empty()) {
326  const string& gapType(gfp->gap_type);
327  if (gapType == "unknown" || gapType == "within scaffold" || gapType == "repeat within scaffold") {
329  }
330  }
331 }
332 
333 /**********************************************************/
334 void AssemblyGapsToDelta(CBioseq& bioseq, GapFeatsPtr gfp, bool* drop)
335 {
336  if (! bioseq.GetInst().IsSetExt() || ! bioseq.GetInst().GetExt().IsDelta() ||
337  ! gfp)
338  return;
339 
340  CDelta_ext::Tdata& deltas = bioseq.SetInst().SetExt().SetDelta();
341  CDelta_ext::Tdata::iterator delta = deltas.begin();
342  for (; delta != deltas.end(); ++delta) {
343  if (! gfp)
344  break;
345 
346  if (! (*delta)->IsLiteral()) /* not Seq-lit */
347  continue;
348 
349  CSeq_literal& literal = (*delta)->SetLiteral();
350  if (literal.GetLength() != static_cast<Uint4>(gfp->to - gfp->from + 1)) {
351  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch, "The lengths of the CONTIG/CO line gaps disagrees with the lengths of assembly_gap features. First assembly_gap with a mismatch is at \"%d..%d\".", gfp->from, gfp->to);
352  *drop = true;
353  break;
354  }
355 
356  CreateSeqGap(literal, gfp);
357 
358  gfp = gfp->next;
359  }
360 
361  if (*drop || (delta == deltas.end() && ! gfp))
362  return;
363 
364  if (delta == deltas.end() && gfp) {
365  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch, "The number of the assembly_gap features exceeds the number of CONTIG/CO line gaps. First extra assembly_gap is at \"%d..%d\".", gfp->from, gfp->to);
366  *drop = true;
367  } else if (delta != deltas.end() && ! gfp) {
368  for (; delta != deltas.end(); ++delta) {
369  if ((*delta)->IsLiteral()) /* Seq-lit */
370  break;
371  }
372 
373  if (delta == deltas.end())
374  return;
375 
376  ErrPostStr(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch, "The number of the CONTIG/CO line gaps exceeds the number of assembly_gap features.");
377  *drop = true;
378  }
379 }
380 
381 /**********************************************************/
/* Converts the raw IUPACna sequence of "bioseq" into a delta representation
 * built from the gap features of the "gfp" list: the sequence between gaps
 * becomes data-carrying literals and every gap a literal with length only.
 * The first loop validates the gaps against the sequence and posts
 * diagnostics; a gap that covers anything but N's sets *drop and stops the
 * conversion.  Gap coordinates are used as 1-based, inclusive positions
 * (index from-1 .. to-1 of the sequence string).
 * Nothing is done when "gfp" is null, the bioseq has no sequence data, or the
 * sequence is empty or differs in size from the bioseq length.
 * NOTE(review): this listing skips source line 451, where "delta" is
 * presumably declared. */
382 void GapsToDelta(CBioseq& bioseq, GapFeatsPtr gfp, bool* drop)
383 {
384  GapFeatsPtr tgfp;
385
386  const Char* p;
387  Int4 prevto;
388  Int4 nextfrom;
389  Int4 i;
390
391  if (! gfp || ! bioseq.GetInst().IsSetSeq_data())
392  return;
393
394  const string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
395
396  if (sequence.empty() || sequence.size() != bioseq.GetLength())
397  return;
398
     /* Pass 1: validation.  prevto is the end of the previous gap (0 before
      * the first one). */
399  for (prevto = 0, tgfp = gfp; tgfp; tgfp = tgfp->next) {
400  if (tgfp->next) {
     // is everything between this gap and the next one N?
401  p = sequence.c_str() + tgfp->to;
402  for (i = tgfp->to + 1; i < tgfp->next->from; p++, i++)
403  if (*p != 'N')
404  break;
405  if (i == tgfp->next->from && tgfp->next->from > tgfp->to + 1) {
406  ErrPostEx(SEV_ERROR, ERR_FEATURE_AllNsBetweenGaps, "A run of all-N sequence exists between the gap features located at \"%d..%d\" and \"%d..%d\".", tgfp->from, tgfp->to, tgfp->next->from, tgfp->next->to);
     // remember it, so the "abuts" warnings below are not posted as well
407  tgfp->rightNs = true;
408  tgfp->next->leftNs = true;
409  }
410  nextfrom = tgfp->next->from;
411  } else
412  nextfrom = bioseq.GetLength() + 1;
413
     // ten N's immediately to the left of the gap?
414  if (tgfp->leftNs == false && tgfp->from - prevto > 10) {
415  for (p = sequence.c_str() + tgfp->from - 11, i = 0; i < 10; p++, i++)
416  if (*p != 'N')
417  break;
418  if (i == 10) {
419  ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap, "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.", tgfp->from, tgfp->to);
420  }
421  }
422
     // ten N's immediately to the right of the gap?
423  if (tgfp->rightNs == false && nextfrom - tgfp->to > 10) {
424  for (p = sequence.c_str() + tgfp->to, i = 0; i < 10; p++, i++)
425  if (*p != 'N')
426  break;
427  if (i == 10) {
428  ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap, "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.", tgfp->from, tgfp->to);
429  }
430  }
431
     // the gap itself must consist of N's only
432  for (i = tgfp->from - 1, p = sequence.c_str() + i; i < tgfp->to; p++, i++)
433  if (*p != 'N')
434  break;
435  if (i < tgfp->to) {
436  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapSequence, "The sequence data associated with the gap feature at \"%d..%d\" contains basepairs other than N.", tgfp->from, tgfp->to);
437  *drop = true;
438  }
439
440  prevto = tgfp->to;
441  }
442
443  if (*drop)
444  return;
445
446  CDelta_ext::Tdata deltas;
447
     /* Pass 2: build the delta components.  The loop is left through the
      * break at its end, once the last gap has been handled. */
448  for (prevto = 0, tgfp = gfp;; tgfp = tgfp->next) {
449  Int4 len = 0;
450
     // sequence between the previous gap (or the start) and this gap
452  if (tgfp->from - prevto - 1 > 0) {
453  len = tgfp->from - prevto - 1;
454  delta->SetLiteral().SetLength(len);
455  delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
456
457  deltas.push_back(delta);
458
459  delta.Reset(new CDelta_seq);
460  }
461
     // the gap: a literal with a length but no sequence data
462  len = tgfp->to - tgfp->from + 1;
463  delta->SetLiteral().SetLength(len);
     // NOTE(review): -100 looks like a marker for an unknown length - confirm
464  if (tgfp->estimated_length == -100) {
465  delta->SetLiteral().SetFuzz().SetLim();
466  } else if (tgfp->estimated_length != len) {
467  delta->SetLiteral().SetFuzz().SetRange().SetMin(tgfp->estimated_length);
468  delta->SetLiteral().SetFuzz().SetRange().SetMax(len);
469  }
470
471  if (tgfp->assembly_gap)
472  CreateSeqGap(delta->SetLiteral(), tgfp);
473
474  deltas.push_back(delta);
475
476  prevto = tgfp->to;
477
478  if (! tgfp->next) {
     // sequence after the last gap, if any
479  if (bioseq.GetLength() - prevto > 0) {
480  delta.Reset(new CDelta_seq);
481
482  len = bioseq.GetLength() - prevto;
483  delta->SetLiteral().SetLength(len);
484  delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
485
486  deltas.push_back(delta);
487  }
488  break;
489  }
490  }
491
     // install the delta and discard the raw sequence data
492  if (! deltas.empty()) {
493  bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
494  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
495  bioseq.SetInst().ResetSeq_data();
496  }
497 }
498 
499 /**********************************************************/
/* Splits the raw IUPACna sequence of "bioseq" into a delta representation at
 * every run of HTG_GAP (100) or more N's: sequence stretches become
 * data-carrying literals, the runs of N's literals with length only.  The
 * bioseq is converted only if more than one component results.
 * Afterwards warnings are posted depending on "tech" and on whether shorter
 * runs (at least SHORT_GAP, below HTG_GAP) were seen.
 * Nothing is done when the bioseq has no sequence data, or the sequence is
 * empty or differs in size from the bioseq length.
 * NOTE(review): this listing skips source lines 538 and 567, where "delta"
 * is presumably declared. */
500 void SeqToDelta(CBioseq& bioseq, Int2 tech)
501 {
502  char* p;
503  char* q;
504  char* r;
505
506  Int4 i;
507  Int4 j;
     // 0: no run of N's worth mentioning, 1: short run(s) seen, 2: real gap seen
508  Int4 gotcha;
509
510  if (! bioseq.GetInst().IsSetSeq_data())
511  return;
512
513  const string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
514  if (sequence.empty() || sequence.size() != bioseq.GetLength())
515  return;
516
     // work on a writable, NUL-terminated copy of the sequence
517  vector<Char> buf(sequence.begin(), sequence.end());
518  buf.push_back(0);
519  p = &buf[0];
520  gotcha = 0;
521
522  CDelta_ext::Tdata deltas;
523
     // q: start of the stretch not yet emitted, p: scan position
524  for (q = p; *p != '\0';) {
525  if (*p != 'N') {
526  p++;
527  continue;
528  }
529
     // r: first N of the run, i: length of the run
530  for (r = p, p++, i = 1; *p == 'N'; i++)
531  p++;
532  if (i < HTG_GAP) {
533  if (i >= SHORT_GAP && gotcha == 0)
534  gotcha = 1;
535  continue;
536  }
537
539  gotcha = 2;
540
     // emit the sequence that precedes the run, if there is any
541  if (r != q) {
542  *r = '\0';
543  j = (Int4)(r - q);
544
545  delta->SetLiteral().SetLength(j);
546  delta->SetLiteral().SetSeq_data().SetIupacna().Set(string(q, r));
547
548  deltas.push_back(delta);
549
550  delta.Reset(new CDelta_seq);
551
552  *r = 'N';
553  }
554
     // emit the gap; a run of exactly 100 N's also gets a Lim fuzz
555  delta->SetLiteral().SetLength(i);
556  if (i == 100) {
557  delta->SetLiteral().SetFuzz().SetLim();
558  }
559
560  deltas.push_back(delta);
561  q = p;
562  }
563
     // emit whatever sequence follows the last gap
564  if (p > q) {
565  j = (Int4)(p - q);
566
568  delta->SetLiteral().SetLength(j);
569  delta->SetLiteral().SetSeq_data().SetIupacna().Set(string(q, p));
570
571  deltas.push_back(delta);
572  }
573
     // a single component means no gap was found: keep the raw sequence
574  if (deltas.size() > 1) {
575  bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
576  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
577  bioseq.SetInst().ResetSeq_data();
578  }
579
580  if (bioseq.GetInst().GetRepr() != CSeq_inst::eRepr_delta && tech == 1) {
581  ErrPostStr(SEV_WARNING, ERR_SEQUENCE_HTGWithoutGaps, "This Phase 1 HTG sequence has no runs of 100 "
582  "or more N's to indicate gaps between component contigs. "
583  "This could be an error, or perhaps sequencing is finished "
584  "and this record should not be Phase 1.");
585  }
586
587  if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_delta) {
588  if (tech == 4) /* Phase 0 */
589  ErrPostStr(SEV_WARNING, ERR_SEQUENCE_HTGPhaseZeroHasGap, "A Phase 0 HTG record usually consists of several reads "
590  "for one contig, and hence gaps are not expected. But "
591  "this record does have one (ore more) gaps, hence it "
592  "may require review.");
     // gotcha is 2 once a run of 100+ N's was found in this call, so this
     // can only fire when the bioseq was already a delta before the call
593  if (gotcha == 1)
594  ErrPostStr(SEV_WARNING, ERR_SEQUENCE_HTGPossibleShortGap, "This sequence has one or more runs "
595  "of at least 20 N's. They could indicate gaps, "
596  "but have not been treated that way because "
597  "they are below the minimum of 100 N's.");
598  }
599 }
600 
601 /**********************************************************/
/* Decides from the list of extra (secondary) accessions whether history
 * should be built: fta_add_hist() goes on only when this returns true.
 *   - empty list: false;
 *   - any size other than two: true;
 *   - two entries that both contain '-': true;
 *   - otherwise the entry without '-' is taken as "master" and the other as
 *     "range"; unless fta_if_wgs_acc() yields 0 for master and 1 for range
 *     (values commented as project and contig WGS accessions in
 *     fta_add_hist), the result is true;
 *   - else the result of the final comparison below.
 * The function works on private copies of the two strings and patches NUL
 * characters into them temporarily. */
602 static bool fta_ranges_to_hist(const CGB_block::TExtra_accessions& extra_accs)
603 {
604  string ppacc1;
605  string ppacc2;
606  char* master;
607  char* range;
608  char* acc1;
609  char* acc2;
610  char* p;
611  char* q;
612  Char ch1;
613  Char ch2;
614
615  if (extra_accs.empty())
616  return false;
617
618  if (extra_accs.size() != 2)
619  return true;
620
621  CGB_block::TExtra_accessions::const_iterator it = extra_accs.begin();
622
623  ppacc1 = *it;
624  ++it;
625  ppacc2 = *it;
626  acc1 = ppacc1.data();
627  acc2 = ppacc2.data();
628
     // NOTE(review): std::string::data() never returns null, so neither of
     // the next two tests can fire
629  if (! acc1 && ! acc2)
630  return false;
631  if (! acc1 || ! acc2)
632  return true;
633
634  p = StringChr(acc1, '-');
635  q = StringChr(acc2, '-');
636
637  if (p && q)
638  return true;
639
     // cut the range at its '-' so only its first accession is classified
640  if (! p) {
641  master = acc1;
642  range = acc2;
643  if (q)
644  *q = '\0';
645  } else {
646  master = acc2;
647  range = acc1;
648  if (p) // ?
649  *p = '\0';
650  }
651
652  if (fta_if_wgs_acc(master) != 0 || fta_if_wgs_acc(range) != 1) {
653  if (p)
654  *p = '-';
655  if (q)
656  *q = '-';
657  return true;
658  }
659
     // put the '-' back
660  if (p)
661  *p = '-';
662  if (q)
663  *q = '-';
664
     // truncate master after the non-digit prefix plus two more characters
665  for (p = master; *p != '\0' && (*p < '0' || *p > '9');)
666  p++;
667  if (*p != '\0')
668  p++;
669  if (*p != '\0')
670  p++;
671  ch1 = *p;
672  *p = '\0';
673
     // same truncation for range
674  for (q = range; *q != '\0' && (*q < '0' || *q > '9');)
675  q++;
676  if (*q != '\0')
677  q++;
678  if (*q != '\0')
679  q++;
680  ch2 = *q;
681  *q = '\0';
682
     // NOTE(review): this compares the two char* pointers, not the truncated
     // strings.  master and range point into different std::string buffers,
     // so the result is always false and the truncation above has no effect
     // on it.  A comparison of the string contents was presumably intended -
     // confirm which outcome should mean "build history" before changing it.
683  bool ret = (master == range);
684  *p = ch1;
685  *q = ch2;
686
687  return ret;
688 }
689 
690 
/* Returns true if the bioseq behind the handle "bsh" has a delta extension
 * with at least one component that is a location (Seq-loc) rather than a
 * literal; false otherwise, including for an invalid handle.
 * NOTE(review): this listing skips source lines 691 and 695 - presumably the
 * function header and one more condition on the representation. */
692 {
693  if (bsh &&
694  bsh.IsSetInst_Repr() &&
696  bsh.IsSetInst_Ext()) {
697  const auto& ext = bsh.GetInst_Ext();
698  if (ext.IsDelta() &&
699  ext.GetDelta().IsSet()) {
700  const auto& delta = ext.GetDelta().Get();
701  return any_of(begin(delta),
702  end(delta),
703  [](CRef<CDelta_seq> pDeltaSeq) { return (pDeltaSeq && pDeltaSeq->IsLoc()); });
704  }
705  }
706  return false;
707 }
708 
/* Classifies a Seq-id by its type: local, general and GI ids are not
 * accessions (false); every type not listed in the switch is treated as one
 * (true).
 * NOTE(review): this listing skips source line 716, presumably one more case
 * label that returns false. */
709 static bool s_IsAccession(const CSeq_id& id)
710 {
711  const auto idType = id.Which();
712  switch (idType) {
713  case CSeq_id::e_Local:
714  case CSeq_id::e_General:
715  case CSeq_id::e_Gi:
717  return false;
718  default:
719  return true;
720  }
721 }
722 
723 
724 bool g_DoesNotReferencePrimary(const CDelta_ext& delta_ext, const CSeq_id& primary, CScope& scope)
725 {
726  const auto primaryType = primary.Which();
727  string primaryString = primary.GetSeqIdString();
728  const bool primaryIsAccession = s_IsAccession(primary);
729  const bool primaryIsGi = primaryIsAccession ? false : (primaryType == CSeq_id::e_Gi);
730 
731  unique_ptr<string> pPrimaryAccessionString;
732 
733  for (const auto& pDeltaSeq : delta_ext.Get()) {
734  if (pDeltaSeq && pDeltaSeq->IsLoc()) {
735  auto pId = pDeltaSeq->GetLoc().GetId();
736  const auto& deltaIdType = pId->Which();
737  if (deltaIdType == primaryType) {
738  if (pId->GetSeqIdString() == primaryString) {
739  return false;
740  }
741  } else {
742  if (primaryIsAccession && deltaIdType == CSeq_id::e_Gi) {
743  auto deltaHandle = CSeq_id_Handle::GetHandle(pId->GetGi());
744  auto deltaAccessionHandle = scope.GetAccVer(deltaHandle);
745  if (! deltaAccessionHandle) {
746  return false;
747  }
748 
749  if (deltaAccessionHandle.GetSeqId()->GetSeqIdString() ==
750  primaryString) {
751  return false;
752  }
753  } else if (primaryIsGi && s_IsAccession(*pId)) {
754  if (! pPrimaryAccessionString) {
755  auto primaryGiHandle = CSeq_id_Handle::GetHandle(primary.GetGi());
756  auto primaryAccessionHandle = scope.GetAccVer(primaryGiHandle);
757  if (! primaryAccessionHandle) {
758  return false;
759  }
760  pPrimaryAccessionString =
761  make_unique<string>(primaryAccessionHandle.GetSeqId()->GetSeqIdString());
762  }
763 
764  if (*pPrimaryAccessionString == pId->GetSeqIdString()) {
765  return false;
766  }
767  }
768  }
769  }
770  }
771  return true;
772 }
773 
774 
775 static int sGetPrefixLength(const CTempString& accession)
776 {
777  auto it = find_if(begin(accession),
778  end(accession),
779  [](char c) { return ! (isalpha(c) || c == '_'); });
780 
781  _ASSERT(it != accession.end());
782  return int(distance(accession.begin(), it));
783 }
784 
785 
786 /**********************************************************/
/* Fills the "replaces" history of "bioseq" from the secondary accessions in
 * "extra_accs".
 * Work is done only if the parser has accver and histacc set, its source
 * equals "source", entrez_fetch is non-zero and fta_ranges_to_hist() accepts
 * the list.  The accessions are unwrapped into single accessions, filtered
 * by their WGS kind relative to the primary accession "acc", fetched in bulk
 * through the scope and filtered once more by whether they are CON/scaffold
 * records.  The survivors are appended to the history.
 * NOTE(review): this listing skips source line 799, where "hist" is
 * presumably declared. */
787 void fta_add_hist(ParserPtr pp, CBioseq& bioseq, CGB_block::TExtra_accessions& extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char* acc)
788 {
789  Int4 pri_acc;
790  Int4 sec_acc;
791
792  if (pp->accver == false || pp->histacc == false ||
793  pp->source != source || pp->entrez_fetch == 0)
794  return;
795
796  if (! fta_ranges_to_hist(extra_accs))
797  return;
798
800  UnwrapAccessionRange(extra_accs, hist);
801  if (hist.empty())
802  return;
803
804  // IndexblkPtr ibp = pp->entrylist[pp->curindx];
805
806  pri_acc = fta_if_wgs_acc(acc);
807
808  CTempString primaryAccession(acc);
     // computed lazily, the first time a contig is compared with a contig
809  SIZE_TYPE prefixLength = 0;
810
811
812  // bulk load sequences
     // the three vectors are filled in parallel: entry i of each describes
     // the same candidate
813  vector<string> candidatesAccs;
814  vector<CRef<CSeq_id>> candidatesIds;
815  vector<CSeq_id_Handle> candidatesIdhs;
816
817  list<CRef<CSeq_id>> replaces;
818
819  for (const auto& accessionString : hist) {
820  if (accessionString.empty())
821  continue;
822
823  const auto idChoice = GetNucAccOwner(accessionString);
824  if (idChoice == CSeq_id::e_not_set) {
825  continue;
826  }
827  sec_acc = fta_if_wgs_acc(accessionString);
828  if (sec_acc == 0) { // Project WGS accession
829  continue;
830  }
831
832  if (sec_acc == 1) // Contig WGS accession
833  {
834  if (pri_acc == 0 || pri_acc == 2) { // A project WGS accession or
835  continue; // a scaffold WGS accession
836  }
837
838  if (pri_acc == 1) { // Contig WGS accession
839  if (prefixLength <= 0) {
840  prefixLength = sGetPrefixLength(primaryAccession);
841  }
842
     // skip a contig whose prefix differs from the primary's, or that has
     // no digit right after the prefix, unless allow_uwsec is set
843  if ((accessionString.length() <= prefixLength ||
844  ! NStr::EqualNocase(accessionString, 0, prefixLength, primaryAccession.substr(0, prefixLength)) ||
845  ! isdigit(accessionString[prefixLength])) &&
846  ! pp->allow_uwsec) {
847  continue;
848  }
849  }
850  }
851
852  CRef<CSeq_id> id(new CSeq_id(idChoice, accessionString));
853  candidatesAccs.push_back(accessionString);
854  candidatesIds.push_back(id);
855  candidatesIdhs.push_back(CSeq_id_Handle::GetHandle(*id));
856  }
857
     // one handle per candidate, indexed like the candidate vectors
858  vector<CBioseq_Handle> secondaryBshs = GetScope().GetBioseqHandles(candidatesIdhs);
859  for ( size_t i = 0; i < candidatesIdhs.size(); ++i ) {
860  auto& accessionString = candidatesAccs[i];
861  auto id = candidatesIds[i];
862  auto idChoice = id->Which();
863  auto secondaryBsh = secondaryBshs[i];
864  bool IsConOrScaffold = false;
865  try {
866  IsConOrScaffold = s_IsConOrScaffold(secondaryBsh);
867  } catch (...) {
     // NOTE(review): the message says "Entry dropped", but the code here
     // only skips this secondary accession
868  ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary, "Failed to determine division code for secondary accession \"%s\". Entry dropped.", accessionString.c_str());
869  continue;
870  }
871
872  if (! IsConOrScaffold && pricon && idChoice == acctype) {
873  continue;
874  }
875
     // a CON/scaffold secondary of a non-CON primary is kept only if it is
     // not built from the primary itself
876  if (IsConOrScaffold && ! pricon) {
877  CRef<CSeq_id> pPrimary(new CSeq_id(primaryAccession));
878  if (g_DoesNotReferencePrimary(secondaryBsh.GetInst_Ext().GetDelta(),
879  *pPrimary,
880  GetScope())) {
881  replaces.push_back(id);
882  }
883  continue;
884  }
885
886  replaces.push_back(id);
887  }
888
889
     // append the surviving ids to the history without copying them
890  if (! replaces.empty()) {
891  auto& hist_replaces_ids = bioseq.SetInst().SetHist().SetReplaces().SetIds();
892  hist_replaces_ids.splice(hist_replaces_ids.end(), replaces);
893  }
894 }
895 
896 /**********************************************************/
897 bool fta_strings_same(const char* s1, const char* s2)
898 {
899  if (! s1 && ! s2)
900  return true;
901  if (! s1 || ! s2 || ! StringEqu(s1, s2))
902  return false;
903  return true;
904 }
905 
906 /**********************************************************/
/* Walks the keyword list "kwds" looking for the HTG keywords (HTGS_PHASE0 to
 * HTGS_PHASE3 and HTG) and records the phase in ibp->htg: 4, 1, 2 and 3 for
 * phases 0 to 3, and 5 for a bare "HTG", which becomes 3 at the end.
 * Only the first phase keyword counts: a later one is removed from the list,
 * and an error is posted if it names a different phase.  A phase keyword met
 * after a bare "HTG" overrides the 5.
 * Returns true if any HTG keyword was seen.
 * NOTE(review): this listing skips source line 907 (presumably the function
 * header) and lines 921, 931, 941, 951 and 957 (one statement in each branch
 * that sets ibp->htg). */
908 {
909  bool deldiv = false;
910
911  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
912  bool delnode = false;
913  bool errpost = false;
914  if (*key == "HTGS_PHASE0") {
     // a phase has been fixed already: drop this keyword
915  if (ibp->htg != 0 && ibp->htg != 5) {
916  delnode = true;
917  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 3)
918  errpost = true;
919  } else {
920  ibp->htg = 4;
922  }
923  deldiv = true;
924  } else if (*key == "HTGS_PHASE1") {
925  if (ibp->htg != 0 && ibp->htg != 5) {
926  delnode = true;
927  if (ibp->htg == 2 || ibp->htg == 3 || ibp->htg == 4)
928  errpost = true;
929  } else {
930  ibp->htg = 1;
932  }
933  deldiv = true;
934  } else if (*key == "HTGS_PHASE2") {
935  if (ibp->htg != 0 && ibp->htg != 5) {
936  delnode = true;
937  if (ibp->htg == 1 || ibp->htg == 3 || ibp->htg == 4)
938  errpost = true;
939  } else {
940  ibp->htg = 2;
942  }
943  deldiv = true;
944  } else if (*key == "HTGS_PHASE3") {
945  if (ibp->htg != 0 && ibp->htg != 5) {
946  delnode = true;
947  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)
948  errpost = true;
949  } else {
950  ibp->htg = 3;
952  }
953  deldiv = true;
954  } else if (*key == "HTG") {
955  if (ibp->htg == 0) {
956  ibp->htg = 5;
958  }
959  deldiv = true;
960  }
961
962  if (errpost) {
963  ErrPostStr(SEV_ERROR, ERR_KEYWORD_MultipleHTGPhases, "This entry has multiple HTG-related keywords, for differing HTG phases. Ignoring all but the first.");
964  }
965
     // erase() returns the iterator of the following keyword
966  if (delnode)
967  key = kwds.erase(key);
968  else
969  ++key;
970  }
     // only a bare "HTG" keyword was seen
971  if (ibp->htg == 5)
972  ibp->htg = 3;
973
974  return deldiv;
975 }
976 
977 /**********************************************************/
978 static void fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp, Int4 length, bool tpa)
979 {
980  FTATpaBlockPtr tftbp;
981  FTATpaSpanPtr ftsp;
982  FTATpaSpanPtr tftsp;
983  Int4 i1;
984  Int4 i2;
985  Int4 j;
986 
987  if (! ftbp || length < 1)
988  return;
989 
990  ftsp = new FTATpaSpan;
991  ftsp->from = ftbp->from1;
992  ftsp->to = ftbp->to1;
993  ftsp->next = nullptr;
994  tftsp = ftsp;
995  for (tftbp = ftbp; tftbp; tftbp = tftbp->next) {
996  i1 = tftbp->to1 - tftbp->from1;
997  i2 = tftbp->to2 - tftbp->from2;
998  j = (i2 > i1) ? (i2 - i1) : (i1 - i2);
999  i1++;
1000 
1001  if (i1 < 3000 && j * 10 > i1) {
1002  if (tpa)
1003  ErrPostEx(SEV_ERROR, ERR_TPA_SpanLengthDiff, "Span \"%d..%d\" of this TPA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1004  else
1005  ErrPostEx(SEV_ERROR, ERR_TSA_SpanLengthDiff, "Span \"%d..%d\" of this TSA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1006  }
1007 
1008  if (i1 >= 3000 && j > 300) {
1009  if (tpa)
1010  ErrPostEx(SEV_ERROR, ERR_TPA_SpanDiffOver300bp, "Span \"%d..%d\" of this TPA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1011  else
1012  ErrPostEx(SEV_ERROR, ERR_TSA_SpanDiffOver300bp, "Span \"%d..%d\" of this TSA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1013  }
1014 
1015  if (tftbp->from1 <= tftsp->to + 1) {
1016  if (tftbp->to1 > tftsp->to)
1017  tftsp->to = tftbp->to1;
1018  continue;
1019  }
1020 
1021  tftsp->next = new FTATpaSpan;
1022  tftsp = tftsp->next;
1023  tftsp->from = tftbp->from1;
1024  tftsp->to = tftbp->to1;
1025  }
1026 
1027  if (ftsp->from - 1 > 50) {
1028  if (tpa)
1029  ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage, "This TPA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->from - 1);
1030  else
1031  ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage, "This TSA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->from - 1);
1032  }
1033 
1034  for (; ftsp; ftsp = tftsp) {
1035  tftsp = ftsp->next;
1036  if (tftsp && tftsp->from - ftsp->to - 1 > 50) {
1037  if (tpa)
1038  ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage, "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, tftsp->from - 1);
1039  else
1040  ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage, "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, tftsp->from - 1);
1041  } else if (! tftsp && length - ftsp->to > 50) {
1042  if (tpa)
1043  ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage, "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, length);
1044  else
1045  ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage, "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, length);
1046  }
1047 
1048  delete ftsp;
1049  }
1050 }
1051 
1052 /**********************************************************/
1053 bool fta_number_is_huge(const Char* s)
1054 {
1055  size_t i = StringLen(s);
1056  if (i > 10)
1057  return true;
1058  else if (i < 10)
1059  return false;
1060 
1061  if (*s > '2')
1062  return true;
1063  else if (*s < '2')
1064  return false;
1065 
1066  if (*++s > '1')
1067  return true;
1068  else if (*s < '1')
1069  return false;
1070 
1071  if (*++s > '4')
1072  return true;
1073  else if (*s < '4')
1074  return false;
1075 
1076  if (*++s > '7')
1077  return true;
1078  else if (*s < '7')
1079  return false;
1080 
1081  if (*++s > '4')
1082  return true;
1083  else if (*s < '4')
1084  return false;
1085 
1086  if (*++s > '8')
1087  return true;
1088  else if (*s < '8')
1089  return false;
1090 
1091  if (*++s > '3')
1092  return true;
1093  else if (*s < '3')
1094  return false;
1095 
1096  if (*++s > '6')
1097  return true;
1098  else if (*s < '6')
1099  return false;
1100 
1101  if (*++s > '4')
1102  return true;
1103  else if (*s < '4')
1104  return false;
1105 
1106  if (*++s > '7')
1107  return true;
1108  return false;
1109 }
1110 
1111 /**********************************************************/
/* Parses the text of a PRIMARY block of a TPA or TSA record and rebuilds
 * the Seq-hist assembly of `bioseq` from it.
 *
 * Each data line is expected to look like
 *     from1-to1  accession[.version]  from2-to2|not_available  [c]
 * where from1-to1 is the span stored as from1/to1, from2-to2 the span stored
 * as from2/to2, and a trailing 'c' selects the minus strand.
 *
 * Parameters:
 *   bioseq   - sequence whose Inst.Hist.Assembly is cleared and refilled.
 *   offset   - start of the block text. It is modified: when col_data == 0
 *              every '~' is permanently replaced by '\n'; otherwise
 *              offset[len] is temporarily set to NUL and then restored.
 *   acnum    - accession of this record; used for the first Seq-id of every
 *              alignment and to pick its Seq-id choice via GetNucAccOwner().
 *   vernum   - version attached to that Seq-id when > 0.
 *   len      - length of the block text at `offset`.
 *   col_data - number of leading columns skipped on every line; 0 is treated
 *              as the XML format case.
 *   tpa      - true for TPA, false for TSA: selects the error codes and
 *              whether e_Tpg accessions are accepted.
 *
 * Returns false when offset/acnum is null, len < 2, the text contains no
 * '\n', or a line fails validation (a SEV_REJECT error is posted in the
 * last case); returns true once the assembly has been stored.
 *
 * NOTE(review): listing line 1344 (inside the alignment loop) is missing
 * from this extract.
 */
1112 bool fta_parse_tpa_tsa_block(CBioseq& bioseq, char* offset, char* acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
1113 {
1114  FTATpaBlockPtr ftbp;
1115  FTATpaBlockPtr tftbp;
1116  FTATpaBlockPtr ft;
1117 
1118  string buf;
1119  char* p;
1120  char* q;
1121  char* r;
1122  char* t;
1123  const char* bad_accession;
1124  bool bad_line;
1125  bool bad_interval;
1126  Char ch;
1127  Int4 from1;
1128  Int4 to1;
1129  Int4 len1;
1130  Int4 len2;
1131 
1132  CSeq_id::E_Choice choice;
1133 
1134  if (! offset || ! acnum || len < 2)
1135  return false;
1136 
1137  choice = GetNucAccOwner(acnum);
1138 
      /* Copy everything after the first line of the block into `buf`; the
       * first line itself (up to the first '\n') is not parsed. */
1139  if (col_data == 0) /* HACK: XML format */
1140  {
1141  for (p = offset; *p != '\0'; p++)
1142  if (*p == '~')
1143  *p = '\n';
1144  p = StringChr(offset, '\n');
1145  if (! p)
1146  return false;
1147  buf.assign(p + 1);
1148  buf.append("\n");
1149  } else {
1150  ch = offset[len];
1151  offset[len] = '\0';
1152  p = StringChr(offset, '\n');
1153  if (! p) {
1154  offset[len] = ch;
1155  return false;
1156  }
1157  buf.assign(p + 1);
1158  offset[len] = ch;
1159  }
1160 
      /* Dummy head node: parsed spans are linked after it and it is
       * deleted once parsing has succeeded. */
1161  ftbp = new FTATpaBlock;
1162 
1163  bad_line = false;
1164  bad_interval = false;
1165  bad_accession = nullptr;
1166  p = buf.data();
      /* One iteration per '\n'-terminated line of buf; the line is cut
       * into NUL-terminated tokens in place. Text after the last '\n'
       * is never looked at. */
1167  for (q = StringChr(p, '\n'); q; p = q + 1, q = StringChr(p, '\n')) {
1168  *q = '\0';
      /* A line shorter than col_data stops parsing without setting any
       * of the error flags. */
1169  if ((Int2)StringLen(p) < col_data)
1170  break;
1171  for (p += col_data; *p == ' ';)
1172  p++;
      /* First span: digits, '-', digits. */
1173  for (r = p; *p >= '0' && *p <= '9';)
1174  p++;
1175  if (*p != '-') {
1176  bad_interval = true;
1177  break;
1178  }
1179 
1180  *p++ = '\0';
1181  from1 = atoi(r);
1182 
1183  for (r = p; *p >= '0' && *p <= '9';)
1184  p++;
1185  if (*p != ' ' && *p != '\n' && *p != '\0') {
1186  bad_interval = true;
1187  break;
1188  }
1189  if (*p != '\0')
1190  *p++ = '\0';
1191  to1 = atoi(r);
1192 
1193  if (from1 >= to1) {
1194  bad_interval = true;
1195  break;
1196  }
1197 
      /* Insert the new node so the list stays ordered by from1, then
       * by to1. */
1198  for (ft = ftbp; ft->next; ft = ft->next)
1199  if ((ft->next->from1 > from1) ||
1200  (ft->next->from1 == from1 && ft->next->to1 > to1))
1201  break;
1202  tftbp = new FTATpaBlock;
1203  tftbp->next = ft->next;
1204  ft->next = tftbp;
1205 
1206  tftbp->from1 = from1;
1207  tftbp->to1 = to1;
1208 
      /* Accession token, optionally followed by ".<digits>" which is
       * split off into `version`. */
1209  while (*p == ' ')
1210  p++;
1211  for (r = p; *p != '\0' && *p != ' ' && *p != '\n';)
1212  p++;
1213  if (*p != '\0')
1214  *p++ = '\0';
1215  tftbp->accession = StringSave(r);
1216  r = StringChr(tftbp->accession, '.');
1217  if (r) {
1218  *r++ = '\0';
1219  for (t = r; *t >= '0' && *t <= '9';)
1220  t++;
1221  if (*t != '\0') {
      /* Non-numeric version: put the '.' back so the error message
       * shows the token as it was supplied. */
1222  *--r = '.';
1223  bad_accession = tftbp->accession;
1224  break;
1225  }
1226  tftbp->version = atoi(r);
1227  }
1228 
      /* Tokens starting with "ti" (matched with StringEquNI) must be
       * "ti" followed only by digits, not all of them zero; any other
       * token must be owned by GenBank/EMBL/DDBJ, or by TPG when `tpa`
       * is set. */
1229  if (StringEquNI(tftbp->accession, "ti", 2)) {
1230  for (r = tftbp->accession + 2; *r == '0';)
1231  r++;
1232  if (*r == '\0') {
1233  bad_accession = tftbp->accession;
1234  break;
1235  }
1236  while (*r >= '0' && *r <= '9')
1237  r++;
1238  if (*r != '\0') {
1239  bad_accession = tftbp->accession;
1240  break;
1241  }
1242  } else {
1243  tftbp->sicho = GetNucAccOwner(tftbp->accession);
1244  if ((tftbp->sicho != CSeq_id::e_Genbank && tftbp->sicho != CSeq_id::e_Embl &&
1245  tftbp->sicho != CSeq_id::e_Ddbj &&
1246  (tftbp->sicho != CSeq_id::e_Tpg || tpa == false))) {
1247  bad_accession = tftbp->accession;
1248  break;
1249  }
1250  }
1251 
1252  while (*p == ' ')
1253  p++;
1254 
      /* Second span: either the literal "not_available" (stored as
       * 1-1) or digits, '-', digits. */
1255  if (StringEquNI(p, "not_available", 13)) {
1256  p += 13;
1257  tftbp->from2 = 1;
1258  tftbp->to2 = 1;
1259  } else {
1260  for (r = p; *p >= '0' && *p <= '9';)
1261  p++;
1262  if (*p != '-') {
1263  bad_interval = true;
1264  break;
1265  }
1266  *p++ = '\0';
1267  tftbp->from2 = atoi(r);
1268 
1269  for (r = p; *p >= '0' && *p <= '9';)
1270  p++;
1271  if (*p != ' ' && *p != '\n' && *p != '\0') {
1272  bad_interval = true;
1273  break;
1274  }
1275  if (*p != '\0')
1276  *p++ = '\0';
1277  tftbp->to2 = atoi(r);
1278 
1279  if (tftbp->from2 >= tftbp->to2) {
1280  bad_interval = true;
1281  break;
1282  }
1283  }
1284 
      /* Optional 'c' selects the minus strand; anything left on the
       * line afterwards makes the whole block invalid. */
1285  while (*p == ' ')
1286  p++;
1287  if (*p == 'c') {
1288  tftbp->strand = eNa_strand_minus;
1289  for (p++; *p == ' ';)
1290  p++;
1291  } else
1292  tftbp->strand = eNa_strand_plus;
1293  if (*p != '\0') {
1294  bad_line = true;
1295  break;
1296  }
1297  }
1298 
      /* bad_accession points into a node's StringSave'd accession, not
       * into buf, so it is still usable after buf.clear(). Only one
       * error is posted: interval first, then accession, then the
       * generic block error. */
1299  buf.clear();
1300  if (bad_line || bad_interval || bad_accession) {
1301  if (bad_interval) {
1302  if (tpa)
1303  ErrPostStr(SEV_REJECT, ERR_TPA_InvalidPrimarySpan, "Intervals from primary records on which a TPA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1304  else
1305  ErrPostStr(SEV_REJECT, ERR_TSA_InvalidPrimarySpan, "Intervals from primary records on which a TSA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1306  } else if (bad_accession) {
1307  if (tpa)
1308  ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimarySeqId, "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.", bad_accession);
1309  else
1310  ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimarySeqId, "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.", bad_accession);
1311  } else {
1312  if (tpa)
1313  ErrPostStr(SEV_REJECT, ERR_TPA_InvalidPrimaryBlock, "Supplied PRIMARY block for TPA record is incorrect. Cannot parse. Entry dropped.");
1314  else
1315  ErrPostStr(SEV_REJECT, ERR_TSA_InvalidPrimaryBlock, "Supplied PRIMARY block for TSA record is incorrect. Cannot parse. Entry dropped.");
1316  }
1317 
1318  if (ftbp)
1319  fta_tpa_block_free(ftbp);
1320  return false;
1321  }
1322 
      /* Drop the dummy head; ftbp now points at the first real span and
       * tftbp is the cursor used by the alignment loop below. */
1323  tftbp = ftbp->next;
1324  ftbp->next = nullptr;
1325  delete ftbp;
1326  ftbp = tftbp;
1327 
1328  fta_check_tpa_tsa_coverage(ftbp, bioseq.GetLength(), tpa);
1329 
1330  CSeq_hist::TAssembly& assembly = bioseq.SetInst().SetHist().SetAssembly();
1331  if (! assembly.empty())
1332  assembly.clear();
1333 
      /* A single "disc" Seq-align holds one dense-seg alignment per
       * parsed span. */
1334  CRef<CSeq_align> root_align(new CSeq_align);
1335 
1336  root_align->SetType(CSeq_align::eType_not_set);
1337  CSeq_align_set& align_set = root_align->SetSegs().SetDisc();
1338 
1339  for (; tftbp; tftbp = tftbp->next) {
1340  len1 = tftbp->to1 - tftbp->from1 + 1;
1341  len2 = tftbp->to2 - tftbp->from2 + 1;
1342 
1343  CRef<CSeq_align> align(new CSeq_align);
1345  align->SetDim(2);
1346 
1347  CSeq_align::C_Segs::TDenseg& seg = align->SetSegs().SetDenseg();
1348 
      /* Equal span lengths give one segment. Otherwise a second segment
       * carries the excess of the longer span, with start -1 on the
       * shorter row (presumably the Dense-seg gap marker - confirm
       * against the Dense-seg specification). Starts are the parsed
       * values minus one. */
1349  seg.SetDim(2);
1350  seg.SetNumseg((len1 == len2) ? 1 : 2);
1351 
1352  seg.SetStarts().push_back(tftbp->from1 - 1);
1353  seg.SetStarts().push_back(tftbp->from2 - 1);
1354 
1355  if (len1 != len2) {
1356  if (len1 < len2) {
1357  seg.SetStarts().push_back(-1);
1358  seg.SetStarts().push_back(tftbp->from2 - 1 + len1);
1359  } else {
1360  seg.SetStarts().push_back(tftbp->from1 - 1 + len2);
1361  seg.SetStarts().push_back(-1);
1362  }
1363  }
1364 
1365  if (len1 == len2)
1366  seg.SetLens().push_back(len1);
1367  else if (len1 < len2) {
1368  seg.SetLens().push_back(len1);
1369  seg.SetLens().push_back(len2 - len1);
1370  } else {
1371  seg.SetLens().push_back(len2);
1372  seg.SetLens().push_back(len1 - len2);
1373  }
1374 
      /* Row 1 is always on the plus strand; row 2 uses the strand read
       * from the line. */
1375  seg.SetStrands().push_back(eNa_strand_plus);
1376  seg.SetStrands().push_back(tftbp->strand);
1377 
1378  if (len1 != len2) {
1379  seg.SetStrands().push_back(eNa_strand_plus);
1380  seg.SetStrands().push_back(tftbp->strand);
1381  }
1382 
      /* First id: this record (acnum / vernum). */
1383  CRef<CTextseq_id> text_id(new CTextseq_id);
1384  text_id->SetAccession(acnum);
1385 
1386  if (vernum > 0)
1387  text_id->SetVersion(vernum);
1388 
1389  CRef<CSeq_id> id(new CSeq_id),
1390  aux_id;
1391  SetTextId(choice, *id, *text_id);
1392  seg.SetIds().push_back(id);
1393 
      /* Second id: a general "ti" Dbtag for "ti" tokens (integer tag
       * unless fta_number_is_huge() says otherwise, after skipping
       * leading zeros), else a text Seq-id of the choice found while
       * parsing. */
1394  if (StringEquNI(tftbp->accession, "ti", 2)) {
1395  CRef<CSeq_id> gen_id(new CSeq_id);
1396  CDbtag& tag = gen_id->SetGeneral();
1397 
1398  for (r = tftbp->accession + 2; *r == '0';)
1399  r++;
1400  if (fta_number_is_huge(r) == false)
1401  tag.SetTag().SetId(atoi(r));
1402  else
1403  tag.SetTag().SetStr(r);
1404 
1405  tag.SetDb("ti");
1406  seg.SetIds().push_back(gen_id);
1407  } else {
1408  CRef<CTextseq_id> otext_id(new CTextseq_id);
1409  otext_id->SetAccession(tftbp->accession);
1410 
1411  if (tftbp->version > 0)
1412  otext_id->SetVersion(tftbp->version);
1413 
1414  aux_id.Reset(new CSeq_id);
1415  SetTextId(tftbp->sicho, *aux_id, *otext_id);
1416  }
1417 
1418  if (aux_id.NotEmpty())
1419  seg.SetIds().push_back(aux_id);
1420 
1421  align_set.Set().push_back(align);
1422  }
1423 
1424  assembly.push_back(root_align);
1425 
1426  if (ftbp)
1427  fta_tpa_block_free(ftbp);
1428  return true;
1429 }
1430 
1431 /**********************************************************/
1432 char* StringRStr(char* where, const char* what)
1433 {
1434  if (! where || ! what || *where == '\0' || *what == '\0')
1435  return nullptr;
1436 
1437  size_t i = StringLen(what);
1438  char* res = nullptr;
1439  for (char* p = where; *p != '\0'; p++)
1440  if (StringEquN(p, what, i))
1441  res = p;
1442 
1443  return (res);
1444 }
1445 
1446 /**********************************************************/
/* NOTE(review): the declaration line of this function (listing line 1447)
 * is missing from this extract, so its name, return type and parameter
 * types are not visible here; the body uses `seq_id` and `len`.
 *
 * Builds a Seq-loc of interval type that spans positions 0 .. len-1 on
 * `seq_id`. Returns an empty CRef when len < 1.
 */
1448 {
1449  CRef<CSeq_loc> ret;
1450 
1451  if (len < 1)
1452  return ret;
1453 
1454  ret.Reset(new CSeq_loc);
1455  CSeq_interval& interval = ret->SetInt();
1456 
      /* The interval is [0, len-1]. */
1457  interval.SetFrom(0);
1458  interval.SetTo(static_cast<TSeqPos>(len) - 1);
1459  interval.SetId(seq_id);
1460 
1461  return ret;
1462 }
1463 
1464 /**********************************************************/
1465 static void fta_validate_assembly(char* name)
1466 {
1467  bool bad_format = false;
1468 
1469  char* p = name;
1470  if (! p || *p == '\0' || StringLen(p) < 7)
1471  bad_format = true;
1472  else if (p[0] != 'G' || p[1] != 'C' || (p[2] != 'F' && p[2] != 'A') ||
1473  p[3] != '_' || p[4] < '0' || p[4] > '9')
1474  bad_format = true;
1475  else {
1476  for (p += 5; *p != '\0'; p++)
1477  if (*p < '0' || *p > '9')
1478  break;
1479  if (*p != '.' || p[1] < '0' || p[1] > '9')
1480  bad_format = true;
1481  else {
1482  for (p++; *p != '\0'; p++)
1483  if (*p < '0' || *p > '9')
1484  break;
1485  if (*p != '\0')
1486  bad_format = true;
1487  }
1488  }
1489 
1490  if (bad_format)
1491  ErrPostEx(SEV_WARNING, ERR_DBLINK_InvalidIdentifier, "\"%s\" is not a validly formatted identifier for the Assembly resource.", name);
1492 }
1493 
1494 /**********************************************************/
/* NOTE(review): the declaration line of this function (listing line 1495)
 * is missing from this extract; the body uses `name` and `source`, and
 * other code in this file calls fta_validate_bioproject(q, source), which
 * is presumably this function - confirm.
 *
 * Validates a BioProject accession in `name`:
 *   - it must be "PRJ", then one of 'E', 'N', 'D', then an upper-case
 *     letter, then one or more digits up to the end of the string;
 *     otherwise a SEV_REJECT error is posted and false is returned;
 *   - if the format is fine but the fourth/fifth characters do not agree
 *     with `source` (NCBI expects 'N'; DDBJ expects 'D' or "NA"; EMBL
 *     expects 'E' or "NA"), only a SEV_WARNING is posted.
 * Returns true whenever the format check passes.
 */
1496 {
1497  char* p;
1498  bool bad_format = false;
1499 
1500  if (StringLen(name) < 6)
1501  bad_format = true;
1502  else if (name[0] != 'P' || name[1] != 'R' || name[2] != 'J' ||
1503  (name[3] != 'E' && name[3] != 'N' && name[3] != 'D') ||
1504  name[4] < 'A' || name[4] > 'Z' || name[5] < '0' || name[5] > '9')
1505  bad_format = true;
1506  else {
      /* Everything after the first digit must be digits as well. */
1507  for (p = name + 6; *p != '\0'; p++)
1508  if (*p < '0' || *p > '9')
1509  break;
1510  if (*p != '\0')
1511  bad_format = true;
1512  }
1513 
1514  if (bad_format) {
1515  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "BioProject accession number is not validly formatted: \"%s\". Entry dropped.", name);
1516  return false;
1517  }
1518 
      /* Prefix/source mismatch is reported but does not fail validation. */
1519  if ((source == Parser::ESource::NCBI && name[3] != 'N') ||
1520  (source == Parser::ESource::DDBJ && name[3] != 'D' &&
1521  (name[3] != 'N' || name[4] != 'A')) ||
1522  (source == Parser::ESource::EMBL && name[3] != 'E' &&
1523  (name[3] != 'N' || name[4] != 'A')))
1524  ErrPostEx(SEV_WARNING, ERR_FORMAT_WrongBioProjectPrefix, "BioProject accession number does not agree with this record's database of origin: \"%s\".", name);
1525 
1526  return true;
1527 }
1528 
1529 /**********************************************************/
/* NOTE(review): the declaration line of this function (listing line 1530)
 * is missing from this extract; the body uses `str`, `source` and
 * `newstyle`, and other code in this file calls
 * fta_tokenize_project(str + len, source, newstyle), which is presumably
 * this function - confirm.
 *
 * Splits the value of a PROJECT/PR line into a ValNode list of tokens.
 * `str` is modified in place: ';', ',' and tabs become spaces.
 * When `newstyle` is false every token must consist of digits only;
 * otherwise every token is checked with fta_validate_bioproject().
 * Returns the list of tokens, or nullptr when the input is empty, when no
 * token was collected, or when a token is invalid (the tokens collected
 * before it are then released with ValNodeFreeData()).
 */
1531 {
1532  ValNodePtr vnp;
1533  ValNodePtr tvnp;
1534  char* p;
1535  char* q;
1536  char* r;
1537  bool bad;
1538  Char ch;
1539 
1540  if (! str || *str == '\0') {
1541  ErrPostStr(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "Empty PROJECT/PR line type supplied. Entry dropped.");
1542  return nullptr;
1543  }
1544 
1545  for (p = str; *p != '\0'; p++)
1546  if (*p == ';' || *p == ',' || *p == '\t')
1547  *p = ' ';
1548 
      /* Nothing but separators counts as empty too. */
1549  for (p = str; *p == ' ';)
1550  p++;
1551  if (*p == '\0') {
1552  ErrPostStr(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "Empty PROJECT/PR line type supplied. Entry dropped.");
1553  return nullptr;
1554  }
1555 
      /* Dummy head node; tokens are appended after it and it is deleted
       * before returning. */
1556  vnp = ValNodeNew(nullptr);
1557  tvnp = vnp;
1558 
1559  for (bad = false, p = str; *p != '\0';) {
1560  while (*p == ' ')
1561  p++;
1562 
1563  if (*p == '\0')
1564  break;
1565 
1566  for (q = p; *p != ' ' && *p != '\0';)
1567  p++;
1568 
      /* Temporarily NUL-terminate the token [q, p); the saved character
       * is restored on every path below. */
1569  ch = *p;
1570  *p = '\0';
1571  if (! newstyle) {
1572  for (r = q; *r >= '0' && *r <= '9';)
1573  r++;
1574  if (*r != '\0') {
1575  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "BioProject accession number is not validly formatted: \"%s\". Entry dropped.", q);
1576  bad = true;
1577  }
1578  } else if (fta_validate_bioproject(q, source) == false)
1579  bad = true;
1580 
1581  if (bad) {
1582  *p = ch;
1583  break;
1584  }
1585 
1586  tvnp = ValNodeNew(tvnp, q);
1587  *p = ch;
1588  }
1589 
1590  tvnp = vnp->next;
1591  delete vnp;
1592 
1593  if (! tvnp)
1594  return nullptr;
1595 
1596  if (! bad)
1597  return (tvnp);
1598 
1599  ValNodeFreeData(tvnp);
1600  return nullptr;
1601 }
1602 
1603 /**********************************************************/
/* NOTE(review): the declaration line of this function (listing line 1604)
 * and listing lines 1620, 1625 and 1637 of its body are missing from this
 * extract. The body uses `descrs`, `offset`, `format`, `source` and `drop`.
 *
 * Turns a PROJECT line into a user-object descriptor appended to `descrs`:
 *   - "new style" values produce (or extend) a "DBLink" user object with a
 *     "BioProject" string-list field;
 *   - old style numeric values produce a "GenomeProjectsDB" user object
 *     with a ProjectID/ParentID integer field pair per value.
 * On a missing tag or an invalid value `*drop` is set to true and nothing
 * is added.
 */
1605 {
1606  ValNodePtr vnp;
1607  ValNodePtr tvnp;
1608 
1609  const Char* name;
1610 
1611  char* str;
1612  char* p;
1613  Char ch;
1614  Int4 i;
1615 
1616  if (! offset)
1617  return;
1618 
      /* The two branches below pick the tag name and the terminator
       * character; the opening condition (listing line 1620) and the
       * assignment of `i` in the else branch (listing line 1625) are not
       * visible in this extract. */
1619  bool newstyle = false;
1621  i = ParFlat_COL_DATA;
1622  name = "GenomeProject:";
1623  ch = '\n';
1624  } else {
1626  name = "Project:";
1627  ch = ';';
1628  }
1629 
      /* Work on a private copy of the line, cut at the first terminator. */
1630  size_t len = StringLen(name);
1631  str = StringSave(offset + i);
1632  p = StringChr(str, ch);
1633  if (p)
1634  *p = '\0';
1635 
      /* The tag is absent. The inner condition that leads to the reject
       * below (listing line 1637) is not visible in this extract; when it
       * does not hold, the whole string is treated as a new-style value
       * (len = 0). */
1636  if (! StringEquN(str, name, len)) {
1638  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "PROJECT line is missing \"GenomeProject:\" tag. Entry dropped.", str);
1639  MemFree(str);
1640  *drop = true;
1641  return;
1642  }
1643  newstyle = true;
1644  len = 0;
1645  } else if (format == Parser::EFormat::EMBL && str[len] == 'P')
1646  newstyle = true;
1647 
1648  vnp = fta_tokenize_project(str + len, source, newstyle);
1649  if (! vnp) {
1650  *drop = true;
1651  MemFree(str);
1652  return;
1653  }
1654 
1655  CUser_object* user_obj_ptr;
1656  bool got = false;
1657 
      /* Look for an existing "DBLink" user object among the descriptors;
       * when found, user_obj_ptr is left pointing at it. */
1658  for (auto& descr : descrs) {
1659  if (! descr->IsUser() || ! descr->GetUser().IsSetData())
1660  continue;
1661 
1662  user_obj_ptr = &(descr->SetUser());
1663 
1664  CObject_id* obj_id = nullptr;
1665  if (user_obj_ptr->IsSetType())
1666  obj_id = &(user_obj_ptr->SetType());
1667 
1668  if (obj_id && obj_id->IsStr() && obj_id->GetStr() == "DBLink") {
1669  got = true;
1670  break;
1671  }
1672  }
1673 
1674  CRef<CUser_object> user_obj;
1675  if (newstyle) {
      /* Count the tokens: the count is stored as the field's Num. */
1676  for (i = 0, tvnp = vnp; tvnp; tvnp = tvnp->next)
1677  i++;
1678 
1679  if (! got) {
1680  user_obj.Reset(new CUser_object);
1681  user_obj_ptr = user_obj.GetNCPointer();
1682 
1683  CObject_id& id = user_obj_ptr->SetType();
1684  id.SetStr("DBLink");
1685  }
1686 
1687  CRef<CUser_field> user_field(new CUser_field);
1688  user_field->SetLabel().SetStr("BioProject");
1689  user_field->SetNum(i);
1690 
1691  for (tvnp = vnp; tvnp; tvnp = tvnp->next)
1692  user_field->SetData().SetStrs().push_back(tvnp->data);
1693 
1694  user_obj_ptr->SetData().push_back(user_field);
1695  } else {
      /* Old style always creates a fresh object, so force the
       * "add a new descriptor" path below. */
1696  got = false;
1697 
1698  user_obj.Reset(new CUser_object);
1699  user_obj_ptr = user_obj.GetNCPointer();
1700 
1701  CObject_id& id = user_obj_ptr->SetType();
1702  id.SetStr("GenomeProjectsDB");
1703 
1704  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
1705 
1706  CRef<CUser_field> user_field(new CUser_field);
1707  user_field->SetLabel().SetStr("ProjectID");
1708  user_field->SetData().SetInt(atoi(tvnp->data));
1709  user_obj_ptr->SetData().push_back(user_field);
1710 
1711 
1712  user_field.Reset(new CUser_field);
1713  user_field->SetLabel().SetStr("ParentID");
1714  user_field->SetData().SetInt(0);
1715  user_obj_ptr->SetData().push_back(user_field);
1716  }
1717  }
1718 
      /* Only a newly created object needs its own descriptor; an existing
       * "DBLink" object was modified in place. */
1719  if (! got) {
1720  CRef<CSeqdesc> descr(new CSeqdesc);
1721  descr->SetUser(*user_obj_ptr);
1722  descrs.push_back(descr);
1723  }
1724 
1725  MemFree(str);
1726  ValNodeFree(vnp);
1727 }
1728 
1729 /**********************************************************/
1730 bool fta_if_valid_sra(const Char* id, bool dblink)
1731 {
1732  const Char* p = id;
1733 
1734  if (p && StringLen(p) > 3 &&
1735  (p[0] == 'E' || p[0] == 'S' || p[0] == 'D') && p[1] == 'R' &&
1736  (p[2] == 'A' || p[2] == 'P' || p[2] == 'R' || p[2] == 'S' ||
1737  p[2] == 'X' || p[2] == 'Z')) {
1738  for (p += 3; *p >= '0' && *p <= '9';)
1739  p++;
1740  if (*p == '\0')
1741  return true;
1742  }
1743 
1744  if (dblink)
1745  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Incorrectly formatted DBLINK Sequence Read Archive value: \"%s\". Entry dropped.", id);
1746 
1747  return false;
1748 }
1749 
1750 /**********************************************************/
1751 bool fta_if_valid_biosample(const Char* id, bool dblink)
1752 {
1753  const Char* p = id;
1754 
1755  if (p && StringLen(p) > 5 && p[0] == 'S' && p[1] == 'A' &&
1756  p[2] == 'M' && (p[3] == 'N' || p[3] == 'E' || p[3] == 'D')) {
1757  if (p[4] == 'A' || p[4] == 'G')
1758  p += 5;
1759  else
1760  p += 4;
1761  while (*p >= '0' && *p <= '9')
1762  p++;
1763  if (*p == '\0')
1764  return true;
1765  }
1766 
1767  if (dblink)
1768  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Incorrectly formatted DBLINK BioSample value: \"%s\". Entry dropped.", id);
1769 
1770  return false;
1771 }
1772 
1773 /**********************************************************/
/* NOTE(review): the declaration line of this function (listing line 1774)
 * and listing line 1919 of its body are missing from this extract; the
 * body uses `str` and `source`, and other code in this file calls
 * fta_tokenize_dblink(str1, source), which is presumably this function -
 * confirm.
 *
 * Splits the text of a DBLINK block into a flat ValNode list in which each
 * tag (stored with its trailing ':', e.g. "BioProject:") is followed by
 * its values. `str` is modified in place (';' and tabs become spaces).
 *
 * A tag is only recognised at the start of a line, must be one of the six
 * names checked below, must not repeat, and must have at least one value.
 * Values are separated by ',' or '\n'; a duplicate value under the same tag
 * is dropped with a warning. BioProject, BioSample and Sequence Read
 * Archive values are validated by their dedicated checkers.
 *
 * Returns the list, or nullptr when the input is empty, nothing was
 * collected, or any SEV_REJECT condition was hit (the list is then
 * released with ValNodeFreeData()).
 */
1775 {
1776  ValNodePtr vnp;
1777  ValNodePtr tvnp;
1778  ValNodePtr uvnp;
1779  ValNodePtr tagvnp;
1780 
1781  bool got_nl;
1782  bool bad;
1783  bool sra;
1784  bool assembly;
1785  bool biosample;
1786  bool bioproject;
1787 
1788  char* p;
1789  char* q;
1790  char* r = nullptr;
1791  char* t;
1792  char* u;
1793  Char ch;
1794 
1795  if (! str || *str == '\0') {
1796  ErrPostStr(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Empty DBLINK line type supplied. Entry dropped.");
1797  return nullptr;
1798  }
1799 
1800  for (p = str; *p != '\0'; p++)
1801  if (*p == ';' || *p == '\t')
1802  *p = ' ';
1803 
      /* vnp is a dummy head node; tvnp is the current tail; tagvnp is the
       * node of the tag whose values are being read. got_nl starts true
       * so the very first token may be a tag. */
1804  vnp = ValNodeNew(nullptr);
1805  tvnp = vnp;
1806  bad = false;
1807  got_nl = true;
1808  sra = false;
1809  assembly = false;
1810  biosample = false;
1811  bioproject = false;
1812  tagvnp = nullptr;
1813 
1814  for (p = str; *p != '\0'; got_nl = false) {
      /* Skip separators, remembering whether a line break was crossed. */
1815  while (*p == ' ' || *p == '\n' || *p == ':' || *p == ',') {
1816  if (*p == '\n')
1817  got_nl = true;
1818  p++;
1819  }
1820 
1821  if (got_nl) {
1822  t = StringChr(p, ':');
1823  if (t) {
1824  r = StringChr(p, '\n');
1825  u = StringChr(p, ',');
1826 
      /* Treat the text up to ':' as a tag only if the ':' comes before
       * the next ',' and the next '\n'. */
1827  if ((! u || u > t) && (! r || r > t)) {
      /* Temporarily terminate right after the ':' so p is "<tag>:". */
1828  ch = *++t;
1829  *t = '\0';
1830 
1831  if (! StringEqu(p, "Project:") &&
1832  ! StringEqu(p, "Assembly:") &&
1833  ! StringEqu(p, "BioSample:") &&
1834  ! StringEqu(p, "BioProject:") &&
1835  ! StringEqu(p, "Sequence Read Archive:") &&
1836  ! StringEqu(p, "Trace Assembly Archive:")) {
1837  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Invalid DBLINK tag encountered: \"%s\". Entry dropped.", p);
1838  bad = true;
1839  break;
1840  }
1841 
      /* Remember which validator applies to the values of this tag. */
1842  bioproject = StringEqu(p, "BioProject:");
1843  sra = StringEqu(p, "Sequence Read Archive:");
1844  biosample = StringEqu(p, "BioSample:");
1845  assembly = StringEqu(p, "Assembly:");
1846 
      /* The previous list entry is still a tag: it got no values. */
1847  if (tvnp->data && StringChr(tvnp->data, ':')) {
1848  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Found DBLINK tag with no value: \"%s\". Entry dropped.", tvnp->data);
1849  bad = true;
1850  break;
1851  }
1852 
1853  for (uvnp = vnp->next; uvnp; uvnp = uvnp->next)
1854  if (StringEqu(uvnp->data, p)) {
1855  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Multiple DBLINK tags found: \"%s\". Entry dropped.", p);
1856  bad = true;
1857  break;
1858  }
1859  if (bad)
1860  break;
1861 
1862  tvnp = ValNodeNew(tvnp, p);
1863  tagvnp = tvnp;
1864  *t = ch;
1865  p = t;
1866  continue;
1867  }
1868  }
1869  }
1870 
      /* Value token: runs up to the next ',', '\n', ':' or end of text. */
1871  q = p;
1872  while (*p != ',' && *p != '\n' && *p != ':' && *p != '\0')
1873  p++;
1874  if (*p == ':') {
      /* A ':' inside a value: report the offending line, which is
       * located by walking `r` back to the preceding '\n'.
       * NOTE(review): `r` is only assigned by StringChr(p, '\n') in the
       * tag-detection code above and may still be nullptr here (e.g.
       * when no '\n' followed the last tag) - confirm callers always
       * supply newline-terminated text. */
1875  while (*p != '\0' && *p != '\n')
1876  p++;
1877  ch = *p;
1878  *p = '\0';
1879  while (*r != '\n' && r > str)
1880  r--;
1881  while (*r == ' ' || *r == '\n')
1882  r++;
1883  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Too many delimiters/fields for DBLINK line: \"%s\". Entry dropped.", r);
1884  *p = ch;
1885  bad = true;
1886  break;
1887  }
1888 
1889  if (q == p)
1890  continue;
1891 
1892  ch = *p;
1893  *p = '\0';
1894 
      /* Skip a value already listed under the current tag. */
1895  if (tagvnp && tagvnp->data) {
1896  for (uvnp = tagvnp->next; uvnp; uvnp = uvnp->next) {
1897  if (! uvnp->data || ! StringEqu(uvnp->data, q))
1898  continue;
1899 
1900  ErrPostEx(SEV_WARNING, ERR_DBLINK_DuplicateIdentifierRemoved, "Duplicate identifier \"%s\" from \"%s\" link removed.", q, tagvnp->data);
1901  break;
1902  }
1903 
1904  if (uvnp) {
1905  *p = ch;
1906  continue;
1907  }
1908  }
1909 
      /* A failed validation sets `bad` but does not leave the loop. */
1910  if ((bioproject &&
1911  fta_validate_bioproject(q, source) == false) ||
1912  (biosample && fta_if_valid_biosample(q, true) == false) ||
1913  (sra && fta_if_valid_sra(q, true) == false)) {
1914  *p = ch;
1915  bad = true;
1916  }
1917 
      /* NOTE(review): the statement controlled by this `if` (listing
       * line 1919) is missing from this extract; as shown here the
       * ValNodeNew() call below would wrongly appear to depend on
       * `assembly`. */
1918  if (assembly)
1920 
1921  tvnp = ValNodeNew(tvnp, q);
1922  *p = ch;
1923  }
1924 
      /* The last tag of the block must have values too. */
1925  if (! bad && tvnp->data && StringChr(tvnp->data, ':')) {
1926  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Found DBLINK tag with no value: \"%s\". Entry dropped.", tvnp->data);
1927  bad = true;
1928  }
1929 
1930  tvnp = vnp->next;
1931  delete vnp;
1932 
1933  if (! tvnp)
1934  return nullptr;
1935 
1936  if (! bad)
1937  return (tvnp);
1938 
1939  ValNodeFreeData(tvnp);
1940  return nullptr;
1941 }
1942 
1943 /**********************************************************/
/* NOTE(review): the declaration line of this function (listing line 1944)
 * is missing from this extract; the body uses `descrs`, `offset`, `len`,
 * `source`, `drop` and `dbuop`.
 *
 * Converts a DBLINK block into user-object descriptors appended to
 * `descrs`:
 *   1. values of a leading "Project:" tag become a "GenomeProjectsDB"
 *      object with a ProjectID/ParentID integer field pair per value;
 *   2. every other tag becomes a string-list field (label = tag without the
 *      trailing ':') of a single "DBLink" object, which is also stored in
 *      `dbuop`.
 * If tokenizing fails `*drop` is set to true and nothing is added.
 */
1945 {
1946  ValNodePtr vnp;
1947  ValNodePtr tvnp;
1948  ValNodePtr uvnp;
1949 
1950  const char* str;
1951  Int4 i;
1952 
1953  if (! offset)
1954  return;
1955 
      /* Tokenize a private copy limited to the block's length.
       * NOTE(review): the write at index len - ParFlat_COL_DATA assumes the
       * copied string is at least that long - confirm with callers. */
1956  char* str1 = StringSave(offset + ParFlat_COL_DATA);
1957  str1[len - ParFlat_COL_DATA] = '\0';
1958  vnp = fta_tokenize_dblink(str1, source);
1959  MemFree(str1);
1960 
1961  if (! vnp) {
1962  *drop = true;
1963  return;
1964  }
1965 
1966  CRef<CUser_object> user_obj;
1967  CRef<CUser_field> user_field;
1968 
      /* Pass 1: only the "Project:" tag. Entries containing ':' are tags;
       * the loop stops at the first tag met after "Project:" has been
       * seen. */
1969  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
1970  if (StringChr(tvnp->data, ':')) {
1971  if (user_obj.NotEmpty())
1972  break;
1973 
1974  if (StringEqu(tvnp->data, "Project:")) {
1975  user_obj.Reset(new CUser_object);
1976  CObject_id& id = user_obj->SetType();
1977 
1978  id.SetStr("GenomeProjectsDB");
1979  }
1980  continue;
1981  }
1982 
1983  if (user_obj.Empty())
1984  continue;
1985 
1986  str = tvnp->data;
1987  if (! str || *str == '\0')
1988  continue;
1989 
      /* Accept digits only; a value that starts with '0' is not scanned
       * and therefore always ends up reported as invalid. */
1990  if (*str != '0')
1991  while (*str >= '0' && *str <= '9')
1992  str++;
1993  if (*str != '\0') {
1994  ErrPostEx(SEV_ERROR, ERR_FORMAT_IncorrectDBLINK, "Skipping invalid \"Project:\" value on the DBLINK line: \"%s\".", tvnp->data);
1995  continue;
1996  }
1997 
1998  user_field.Reset(new CUser_field);
1999 
2000  user_field->SetLabel().SetStr("ProjectID");
2001  user_field->SetData().SetInt(atoi(tvnp->data));
2002  user_obj->SetData().push_back(user_field);
2003 
2004  user_field.Reset(new CUser_field);
2005  user_field->SetLabel().SetStr("ParentID");
2006  user_field->SetData().SetInt(0);
2007 
2008  user_obj->SetData().push_back(user_field);
2009  }
2010 
      /* Drop the object again if no valid project id was found. */
2011  if (user_obj.NotEmpty() && ! user_obj->IsSetData()) {
2012  user_obj.Reset();
2013  }
2014 
2015  if (user_obj.NotEmpty()) {
2016  CRef<CSeqdesc> descr(new CSeqdesc);
2017  descr->SetUser(*user_obj);
2018  descrs.push_back(descr);
2019  }
2020 
2021  user_obj.Reset();
2022  user_field.Reset();
2023 
      /* Pass 2: every tag except "Project:" becomes one field of a single
       * "DBLink" object; `inpr` marks that the values currently being
       * walked belong to "Project:" and must be skipped. */
2024  bool inpr = false;
2025  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2026  if (StringChr(tvnp->data, ':')) {
2027  if (StringEqu(tvnp->data, "Project:")) {
2028  inpr = true;
2029  continue;
2030  }
2031 
2032  inpr = false;
2033 
2034  if (user_obj.Empty()) {
2035  user_obj.Reset(new CUser_object);
2036  user_obj->SetType().SetStr("DBLink");
2037  }
2038 
      /* Count the values that follow this tag, up to the next tag. */
2039  for (i = 0, uvnp = tvnp->next; uvnp; uvnp = uvnp->next, i++)
2040  if (StringChr(uvnp->data, ':'))
2041  break;
2042 
2043  user_field.Reset(new CUser_field);
2044 
      /* The label is the tag text without its trailing ':'. */
2045  string lstr(tvnp->data);
2046  lstr = lstr.substr(0, lstr.size() - 1);
2047  user_field->SetLabel().SetStr(lstr);
2048  user_field->SetNum(i);
2049  user_field->SetData().SetStrs();
2050 
2051  user_obj->SetData().push_back(user_field);
2052  } else if (! inpr && user_obj.NotEmpty()) {
2053  user_field->SetData().SetStrs().push_back(tvnp->data);
2054  }
2055  }
2056 
2057  ValNodeFreeData(vnp);
2058 
2059  if (user_obj.NotEmpty()) {
2060  CRef<CSeqdesc> descr(new CSeqdesc);
2061  descr->SetUser(*user_obj);
2062  descrs.push_back(descr);
2063 
2064  dbuop = user_obj;
2065  }
2066 }
2067 
2068 /**********************************************************/
/* NOTE(review): the declaration line of this function (listing line 2069)
 * and listing line 2087 (which introduces `loc`) are missing from this
 * extract; the body uses `bioseq`.
 *
 * Decides whether a delta sequence is assembled from WGS records only.
 * Returns CMolInfo::eTech_wgs when `bioseq` has delta representation with a
 * delta extension, at least one examined Seq-id was accepted, and no
 * examined Seq-id was rejected. An id is accepted when it is a
 * GenBank/EMBL/Other/DDBJ/TPG/TPE/TPD text id with a non-empty accession
 * for which fta_if_wgs_acc() returns 1. In every other case
 * CMolInfo::eTech_unknown is returned.
 */
2070 {
2071  if (bioseq.GetInst().GetRepr() != CSeq_inst::eRepr_delta || ! bioseq.GetInst().IsSetExt() || ! bioseq.GetInst().GetExt().IsDelta())
2072  return CMolInfo::eTech_unknown;
2073 
      /* good: at least one id passed; finished: no id failed. */
2074  bool good = false;
2075  bool finished = true;
2076 
2077  for (const auto& delta : bioseq.GetInst().GetExt().GetDelta().Get()) {
      /* Delta elements that are not locations are ignored. */
2078  if (! delta->IsLoc())
2079  continue;
2080 
2081  const CSeq_loc& locs = delta->GetLoc();
2082  CSeq_loc_CI ci(locs);
2083 
2084  for (; ci; ++ci) {
2085  const CSeq_id* id = nullptr;
2086 
      /* Only these location kinds are examined; others are skipped. */
2088  if (loc->IsEmpty() || loc->IsWhole() || loc->IsInt() || loc->IsPnt() || loc->IsPacked_pnt())
2089  id = &ci.GetSeq_id();
2090  else
2091  continue;
2092 
2093  if (! id)
2094  break;
2095 
2096  if (! id->IsGenbank() && ! id->IsEmbl() &&
2097  ! id->IsOther() && ! id->IsDdbj() &&
2098  ! id->IsTpg() && ! id->IsTpe() && ! id->IsTpd())
2099  break;
2100 
2101  const CTextseq_id* text_id = id->GetTextseq_Id();
2102  if (! text_id || ! text_id->IsSetAccession() ||
2103  text_id->GetAccession().empty() ||
2104  fta_if_wgs_acc(text_id->GetAccession()) != 1)
2105  break;
2106  good = true;
2107  }
2108 
      /* The iterator is still valid only if the inner loop was left with
       * `break`, i.e. an id was rejected. */
2109  if (ci) {
2110  finished = false;
2111  break;
2112  }
2113  }
2114 
2115  if (good && finished)
2116  return CMolInfo::eTech_wgs;
2117 
2118  return CMolInfo::eTech_unknown;
2119 }
2120 
2121 /**********************************************************/
/* Checks and, where needed, rewrites one Seq-id found inside a location,
 * and records what was seen in `slip`.
 *
 * NOTE(review): listing lines 2177, 2178, 2202, 2207, 2217, 2222, 2227,
 * 2237, 2242 and 2247 of this function are missing from this extract, so
 * several conditions below are shown without their first line.
 *
 * Parameters:
 *   loc      - the location that owns `id`; remembered in slip->badslp when
 *              the id's database is not acceptable for `source`.
 *   id       - the Seq-id to check; may be rewritten in place.
 *   ibp      - current entry; ibp->drop is set on SEV_REJECT conditions.
 *              Nothing is done when it is null.
 *   location - location text, used in error messages only.
 *   name     - feature name for error messages; nullptr selects the
 *              "CONTIG/CO line" wording.
 *   slip     - accumulator: per-database flags, total count, and the
 *              WGS contig / scaffold accessions seen (filled when iscon).
 *   iscon    - true when the location comes from a CONTIG/CO line.
 *   source   - source of the record being parsed.
 */
2122 static void fta_fix_seq_id(CSeq_loc& loc, CSeq_id& id, IndexblkPtr ibp, const char* location, const char* name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
2123 {
2124  Int4 i;
2125 
2126  if (! ibp)
2127  return;
2128 
      /* Local ids are left untouched. */
2129  if (id.IsLocal()) {
2130  return;
2131  }
2132 
      /* On a CONTIG/CO line, general ids of the "SeqLit" / "UnkSeqLit"
       * databases are left untouched as well. */
2133  if (! name && id.IsGeneral()) {
2134  const CDbtag& tag = id.GetGeneral();
2135  if (tag.GetDb() == "SeqLit" || tag.GetDb() == "UnkSeqLit")
2136  return;
2137  }
2138 
2139  if (! id.IsGenbank() && ! id.IsEmbl() && ! id.IsPir() &&
2140  ! id.IsSwissprot() && ! id.IsOther() && ! id.IsDdbj() && ! id.IsPrf() &&
2141  ! id.IsTpg() && ! id.IsTpe() && ! id.IsTpd()) {
2142 
2143  if (! name)
2144  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty or unsupported Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.", location);
2145  else
2146  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty or unsupported Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.", name, location);
2147  ibp->drop = true;
2148  return;
2149  }
2150 
2151  const CTextseq_id* text_id = id.GetTextseq_Id();
2152  if (! text_id || ! text_id->IsSetAccession()) {
2153  if (! name)
2154  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.", location);
2155  else
2156  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.", name, location);
2157  ibp->drop = true;
2158  return;
2159  }
2160 
      /* NOTE(review): `accession` points into the string owned by the
       * id's Textseq-id. It is stored in `slip` and used below after `id`
       * may have been switched to another choice - confirm the pointer
       * stays valid on those paths. */
2161  const Char* accession = text_id->GetAccession().c_str();
2162  if (iscon) {
      /* Remember the first accession of format 3 (kept in wgscont) and
       * of format 7 (kept in wgsscaf); a later accession of the same
       * format whose first four characters differ is kept in wgsacc. */
2163  i = IsNewAccessFormat(accession);
2164  if (i == 3) {
2165  if (! slip->wgscont)
2166  slip->wgscont = accession;
2167  else if (! slip->wgsacc && ! StringEquN(slip->wgscont, accession, 4))
2168  slip->wgsacc = accession;
2169  } else if (i == 7) {
2170  if (! slip->wgsscaf)
2171  slip->wgsscaf = accession;
2172  else if (! slip->wgsacc && ! StringEquN(slip->wgsscaf, accession, 4))
2173  slip->wgsacc = accession;
2174  }
2175  }
2176 
      /* The condition that opens this chain and the definition of `type`
       * (listing lines 2177-2178) are not visible in this extract. When
       * `type` differs from the id's current choice the id is rebuilt
       * with the same Textseq-id under the new choice. */
2179  if (type != id.Which()) {
2180  CRef<CTextseq_id> new_text_id(new CTextseq_id);
2181  new_text_id->Assign(*text_id);
2182  SetTextId(type, id, *new_text_id);
2183  }
2184  } else if (source == Parser::ESource::Flybase) {
2185  id.SetGeneral().SetDb("FlyBase");
2186  id.SetGeneral().SetTag().SetStr(accession);
2187  } else if (source == Parser::ESource::USPTO) {
2188  CRef<CPatent_seq_id> pat_id = MakeUsptoPatSeqId(accession);
2189  id.SetPatent(*pat_id);
2190  } else {
2191  if (! name)
2192  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Invalid accession found in CONTIG/CO line at location: \"%s\". Entry skipped.", location);
2193  else
2194  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Invalid accession found in feature \"%s\" at location \"%s\". Entry skipped.", name, location);
2195  ibp->drop = true;
2196  return;
2197  }
2198 
2199  slip->total++;
2200 
      /* Flag the database of the (possibly rewritten) id. In each branch
       * the first location whose database is not acceptable for `source`
       * is remembered in slip->badslp; the first line of most of these
       * source checks is missing from this extract. */
2201  if (id.IsGenbank()) {
2203  source != Parser::ESource::LANL && ! slip->badslp)
2204  slip->badslp = &loc;
2205  slip->genbank = 1;
2206  } else if (id.IsEmbl()) {
2208  ! slip->badslp)
2209  slip->badslp = &loc;
2210  slip->embl = 1;
2211  } else if (id.IsPir()) {
2212  if (source != Parser::ESource::All &&
2213  ! slip->badslp)
2214  slip->badslp = &loc;
2215  slip->pir = 1;
2216  } else if (id.IsSwissprot()) {
2218  ! slip->badslp)
2219  slip->badslp = &loc;
2220  slip->swissprot = 1;
2221  } else if (id.IsOther()) {
2223  ! slip->badslp)
2224  slip->badslp = &loc;
2225  slip->other = 1;
2226  } else if (id.IsDdbj()) {
2228  ! slip->badslp)
2229  slip->badslp = &loc;
2230  slip->ddbj = 1;
2231  } else if (id.IsPrf()) {
2232  if (source != Parser::ESource::All &&
2233  ! slip->badslp)
2234  slip->badslp = &loc;
2235  slip->prf = 1;
2236  } else if (id.IsTpg()) {
2238  source != Parser::ESource::LANL && ! slip->badslp)
2239  slip->badslp = &loc;
2240  slip->tpg = 1;
2241  } else if (id.IsTpe()) {
2243  ! slip->badslp)
2244  slip->badslp = &loc;
2245  slip->tpe = 1;
2246  } else if (id.IsTpd()) {
2248  ! slip->badslp)
2249  slip->badslp = &loc;
2250  slip->tpd = 1;
2251  }
2252 }
2253 
2254 /**********************************************************/
2255 static void fta_do_fix_seq_loc_id(TSeqLocList& locs, IndexblkPtr ibp, const char* location, const char* name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
2256 {
2257  for (auto& loc : locs) {
2258  if (loc->IsEmpty()) {
2259  fta_fix_seq_id(*loc, loc->SetEmpty(), ibp, location, name, slip, iscon, source);
2260  } else if (loc->IsWhole()) {
2261  fta_fix_seq_id(*loc, loc->SetWhole(), ibp, location, name, slip, iscon, source);
2262  } else if (loc->IsInt()) {
2263  fta_fix_seq_id(*loc, loc->SetInt().SetId(), ibp, location, name, slip, iscon, source);
2264  } else if (loc->IsPnt()) {
2265  fta_fix_seq_id(*loc, loc->SetPnt().SetId(), ibp, location, name, slip, iscon, source);
2266  if (iscon && ! loc->GetPnt().IsSetFuzz()) {
2267  int point = loc->GetPnt().GetPoint();
2268  CRef<CSeq_interval> interval(new CSeq_interval);
2269  interval->SetFrom(point);
2270  interval->SetTo(point);
2271 
2272  if (loc->GetPnt().IsSetStrand())
2273  interval->SetStrand(loc->GetPnt().GetStrand());
2274 
2275  interval->SetId(loc->SetPnt().SetId());
2276  loc->SetInt(*interval);
2277  }
2278  } else if (loc->IsPacked_int()) {
2279  for (auto& interval : loc->SetPacked_int().Set()) {
2280  fta_fix_seq_id(*loc, interval->SetId(), ibp, location, name, slip, iscon, source);
2281  }
2282  } else if (loc->IsPacked_pnt()) {
2283  fta_fix_seq_id(*loc, loc->SetPacked_pnt().SetId(), ibp, location, name, slip, iscon, source);
2284  } else if (loc->IsMix()) {
2285  fta_do_fix_seq_loc_id(loc->SetMix().Set(), ibp, location, name, slip, iscon, source);
2286  } else if (loc->IsEquiv()) {
2287  fta_do_fix_seq_loc_id(loc->SetEquiv().Set(), ibp, location, name, slip, iscon, source);
2288  }
2289  }
2290 }
2291 
2292 /**********************************************************/
2293 Int4 fta_fix_seq_loc_id(TSeqLocList& locs, ParserPtr pp, const char* location, const char* name, bool iscon)
2294 {
2295  SeqLocIds sli;
2296  const Char* p = nullptr;
2297  ErrSev sev;
2298  IndexblkPtr ibp;
2299  Int4 tpa;
2300  Int4 non_tpa;
2301  Int4 i = 0;
2302 
2303  ibp = pp->entrylist[pp->curindx];
2304 
2305  fta_do_fix_seq_loc_id(locs, ibp, location, name, &sli, iscon, pp->source);
2306 
2307  tpa = sli.tpg + sli.tpe + sli.tpd;
2308  non_tpa = sli.genbank + sli.embl + sli.pir + sli.swissprot + sli.other +
2309  sli.ddbj + sli.prf;
2310 
2311  if (iscon && ! sli.wgsacc && sli.wgscont &&
2312  sli.wgsscaf && ! StringEquN(sli.wgscont, sli.wgsscaf, 4))
2313  sli.wgsacc = sli.wgsscaf;
2314 
2315  if ((tpa > 0 && non_tpa > 0) || tpa > 1 || non_tpa > 1 ||
2316  (iscon && sli.wgscont && sli.wgsscaf)) {
2317  }
2318 
2319  if (tpa > 0 && non_tpa > 0) {
2320  if (! name)
2321  ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa, "The CONTIG/CO line with location \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.", location);
2322  else
2323  ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa, "The \"%s\" feature at \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.", name, location);
2324  ibp->drop = true;
2325  }
2326 
2327  if (tpa > 1 || non_tpa > 1) {
2328  if (! pp->allow_crossdb_featloc) {
2329  sev = SEV_REJECT;
2330  p = "Entry skipped.";
2331  ibp->drop = true;
2332  } else {
2333  sev = SEV_WARNING;
2334  p = "";
2335  }
2336  if (! name) {
2337  string label;
2338  if (sli.badslp)
2339  sli.badslp->GetLabel(&label);
2340 
2341  ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc, "The CONTIG/CO line refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval : \"%s\".%s", label.empty() ? location : label.c_str(), p);
2342  } else
2343  ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc, "The \"%s\" feature at \"%s\" refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval.%s", name, location, p);
2344  }
2345 
2346  if (iscon) {
2347  if (sli.wgscont && sli.wgsscaf)
2348  ErrPostEx(SEV_ERROR, ERR_LOCATION_ContigAndScaffold, "The CONTIG/CO line with location \"%s\" refers to intervals on both WGS contig and WGS scaffold records.", location);
2349 
2350  if (sli.wgsacc) {
2351  if (sli.wgscont && ! StringEquN(sli.wgscont, sli.wgsacc, 4))
2352  p = sli.wgscont;
2353  else if (sli.wgsscaf && ! StringEquN(sli.wgsscaf, sli.wgsacc, 4))
2354  p = sli.wgsscaf;
2355 
2356  if (p) {
2357  Char msga[5],
2358  msgb[5];
2359 
2360  StringNCpy(msga, sli.wgsacc, 4);
2361  StringNCpy(msgb, p, 4);
2362  msga[4] = msgb[4] = 0;
2363 
2364  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_MultipleWGSProjects, "This CON/scaffold record is assembled from the contigs of multiple WGS projects. First pair of WGS project codes is \"%s\" and \"%s\".", msgb, msga);
2365  }
2366  }
2367 
2368  i = IsNewAccessFormat(ibp->acnum);
2369  if (i == 3 || i == 7) {
2370  p = nullptr;
2371  if (sli.wgscont && ! StringEquN(sli.wgscont, ibp->acnum, 4))
2372  p = sli.wgscont;
2373  else if (sli.wgsscaf && ! StringEquN(sli.wgsscaf, ibp->acnum, 4))
2374  p = sli.wgsscaf;
2375  else if (sli.wgsacc && ! StringEquN(sli.wgsacc, ibp->acnum, 4))
2376  p = sli.wgsscaf; // ?
2377 
2378  if (p) {
2379  Char msg[5];
2380  StringNCpy(msg, p, 4);
2381  msg[4] = 0;
2382 
2383  ErrPostEx(SEV_WARNING, ERR_ACCESSION_WGSPrefixMismatch, "This WGS CON/scaffold record is assembled from the contigs of different WGS projects. First differing WGS project code is \"%s\".", msg);
2384  }
2385  }
2386  }
2387 
2388  if (sli.wgscont)
2389  sli.wgscont = nullptr;
2390  if (sli.wgsscaf)
2391  sli.wgsscaf = nullptr;
2392  if (sli.wgsacc)
2393  sli.wgsacc = nullptr;
2394 
2395  return (sli.total);
2396 }
2397 
2398 /**********************************************************/
// fta_vnp_structured_comment
// Declared as: static ValNodePtr fta_vnp_structured_comment(char* buf)
//
// Splits the body of a structured comment into a list with one node per
// "name :: value" field. Fields are separated by '~'; every '~' is turned
// into a space in the stored text and ShrinkSpaces() is applied to it.
//
// buf is modified in place (runs of '~' / ' ' following a '~' become
// spaces). Returns nullptr when buf is empty, contains no "::" at all, or
// two consecutive "::" have no '~' between them; otherwise returns the head
// of the list.
2400 {
2401  ValNodePtr res;
2402  ValNodePtr vnp;
2403  char* start;
2404  char* p;
2405  char* q;
2406  char* r;
2407  bool bad;
2408 
2409  if (! buf || *buf == '\0')
2410  return nullptr;
2411 
// Pass 1: keep the first '~' of every run, blank out the ' ' and '~'
// characters that immediately follow it.
2412  for (p = buf; *p != '\0'; p++) {
2413  if (*p != '~')
2414  continue;
2415 
2416  for (p++; *p == ' ' || *p == '~'; p++)
2417  *p = ' ';
// Step back so the outer p++ re-examines the character that ended the run.
2418  p--;
2419  }
2420 
2421  bad = false;
// "res" is a dummy head node; real nodes are appended after it.
2422  res = ValNodeNew(nullptr);
2423  vnp = res;
2424  for (start = buf;;) {
// p: the "::" of the field that begins at "start".
2425  p = StringStr(start, "::");
2426  if (! p) {
// No delimiter in the whole buffer means the comment is malformed.
2427  if (start == buf)
2428  bad = true;
2429  break;
2430  }
2431 
// q: the "::" of the following field, if there is one.
2432  q = StringStr(p + 2, "::");
2433  if (! q) {
// Last field: everything from "start" to the end of the buffer.
2434  vnp = ValNodeNew(vnp, start);
2435  for (r = vnp->data; *r != '\0'; r++)
2436  if (*r == '~')
2437  *r = ' ';
2438  ShrinkSpaces(vnp->data);
2439  break;
2440  }
2441 
// Temporarily terminate the buffer at q to find the last '~' between the
// two delimiters; that '~' separates this field's value from the next name.
2442  *q = '\0';
2443  r = StringRChr(p + 2, '~');
2444  *q = ':';
2445  if (! r) {
2446  bad = true;
2447  break;
2448  }
2449 
// Cut at the separator while the node is created, then restore it
// (presumably ValNodeNew stores its own copy of the text -- TODO confirm).
2450  *r = '\0';
2451  vnp = ValNodeNew(vnp, start);
2452  *r = '~';
2453  for (p = vnp->data; *p != '\0'; p++)
2454  if (*p == '~')
2455  *p = ' ';
2456  ShrinkSpaces(vnp->data);
2457 
// The next field starts at the separator itself.
2458  start = r;
2459  }
2460 
// Detach and release the dummy head.
2461  vnp = res->next;
2462  res->next = nullptr;
2463  ValNodeFree(res);
2464 
2465  if (! bad)
2466  return (vnp);
2467 
// Malformed input: discard whatever was collected.
2468  ValNodeFreeData(vnp);
2469  return nullptr;
2470 }
2471 
2472 /**********************************************************/
2474 {
2475  ValNodePtr vnp;
2476  ValNodePtr tvnp;
2477 
2478  char* p;
2479  char* q;
2480 
2481  CRef<CUser_object> obj;
2482 
2483  if (! tag || *tag == '\0' || ! buf || *buf == '\0')
2484  return obj;
2485 
2487  if (! vnp)
2488  return obj;
2489 
2490  obj.Reset(new CUser_object);
2491 
2492  CObject_id& id = obj->SetType();
2493  id.SetStr("StructuredComment");
2494 
2495  CRef<CUser_field> field(new CUser_field);
2496  field->SetLabel().SetStr("StructuredCommentPrefix");
2497 
2498  field->SetData().SetStr() = tag;
2499  field->SetData().SetStr() += "-START##";
2500 
2501  obj->SetData().push_back(field);
2502 
2503  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2504  p = tvnp->data;
2505  if (! p || *p == '\0')
2506  continue;
2507 
2508  q = StringStr(p, "::");
2509  if (! q)
2510  continue;
2511 
2512  if (q > p && *(q - 1) == ' ')
2513  q--;
2514 
2515  for (*q++ = '\0'; *q == ' ' || *q == ':';)
2516  q++;
2517 
2518  if (*p == '\0' || *q == '\0')
2519  continue;
2520 
2521  field.Reset(new CUser_field);
2522  field->SetLabel().SetStr(p);
2523  field->SetData().SetStr(q);
2524 
2525  obj->SetData().push_back(field);
2526  }
2527 
2528  if (obj->GetData().size() < 2) {
2529  obj.Reset();
2530  return obj;
2531  }
2532 
2533  field.Reset(new CUser_field);
2534  field->SetLabel().SetStr("StructuredCommentSuffix");
2535  field->SetData().SetStr() = tag;
2536  field->SetData().SetStr() += "-END##";
2537 
2538  obj->SetData().push_back(field);
2539 
2540  ValNodeFreeData(vnp);
2541 
2542  return obj;
2543 }
2544 
2545 /**********************************************************/
2546 void fta_parse_structured_comment(char* str, bool& bad, TUserObjVector& objs)
2547 {
2548  ValNodePtr tagvnp;
2549  ValNodePtr vnp;
2550 
2551  char* start;
2552  char* tag = nullptr;
2553  char* buf;
2554  char* p;
2555  char* q;
2556  char* r;
2557 
2558  if (! str || *str == '\0')
2559  return;
2560 
2561  tagvnp = nullptr;
2562  for (p = str;;) {
2563  p = StringStr(p, "-START##");
2564  if (! p)
2565  break;
2566  for (q = p;; q--)
2567  if (*q == '~' || (*q == '#' && q > str && *--q == '#') || q == str)
2568  break;
2569  if (q[0] != '#' || q[1] != '#') {
2570  p += 8;
2571  continue;
2572  }
2573 
2574  start = q;
2575 
2576  tag = StringSave(string_view(q, p - q));
2577 
2578  for (q = p;;) {
2579  q = StringStr(q, tag);
2580  if (! q) {
2581  bad = true;
2582  break;
2583  }
2584  size_t i = StringLen(tag);
2585  if (! StringEquN(q + i, "-END##", 6)) {
2586  q += (i + 6);
2587  continue;
2588  }
2589  r = StringStr(p + 8, "-START##");
2590  if (r && r < q) {
2591  bad = true;
2592  break;
2593  }
2594  break;
2595  }
2596 
2597  if (bad)
2598  break;
2599 
2600  if (! tagvnp) {
2601  tagvnp = ValNodeNew(nullptr, tag);
2602  } else {
2603  for (vnp = tagvnp; vnp; vnp = vnp->next) {
2604  r = vnp->data;
2605  if (StringEqu(r + 2, tag + 2)) {
2606  if (*r != ' ') {
2607  ErrPostEx(SEV_ERROR, ERR_COMMENT_SameStructuredCommentTags, "More than one structured comment with the same tag \"%s\" found.", tag + 2);
2608  *r = ' ';
2609  }
2610  break;
2611  }
2612  if (! vnp->next) {
2613  ValNodeNew(vnp, tag);
2614  break;
2615  }
2616  }
2617  }
2618 
2619  if (StringEqu(tag, "##Metadata")) {
2620  MemFree(tag);
2621  p += 8;
2622  continue;
2623  }
2624 
2625  *q = '\0';
2626  if (! StringStr(p + 8, "::")) {
2627  ErrPostStr(SEV_ERROR, ERR_COMMENT_StructuredCommentLacksDelim, "The structured comment in this record lacks the expected double-colon '::' delimiter between fields and values.");
2628  MemFree(tag);
2629  p += 8;
2630  *q = '#';
2631  continue;
2632  }
2633 
2634  buf = StringSave(p + 8);
2635  *q = '#';
2636 
2638  MemFree(buf);
2639 
2640  if (cur.Empty()) {
2641  bad = true;
2642  break;
2643  }
2644 
2645  objs.push_back(cur);
2646 
2647  fta_StringCpy(start, q + StringLen(tag) + 6);
2648  MemFree(tag);
2649  p = start;
2650  }
2651 
2652  if (bad) {
2653  ErrPostEx(SEV_REJECT, ERR_COMMENT_InvalidStructuredComment, "Incorrectly formatted structured comment with tag \"%s\" encountered. Entry dropped.", tag + 2);
2654  MemFree(tag);
2655  }
2656 
2657  if (tagvnp)
2658  ValNodeFreeData(tagvnp);
2659 }
2660 
2661 /**********************************************************/
2662 string GetQSFromFile(FILE* fd, const Indexblk* ibp)
2663 {
2664  string ret;
2665  Char buf[1024];
2666 
2667  if (! fd || ibp->qslength < 1)
2668  return ret;
2669 
2670  ret.reserve(ibp->qslength + 10);
2671  fseek(fd, static_cast<long>(ibp->qsoffset), 0);
2672  while (fgets(buf, 1023, fd)) {
2673  if (buf[0] == '>' && ret[0] != '\0')
2674  break;
2675  ret.append(buf);
2676  }
2677  return ret;
2678 }
2679 
2680 /**********************************************************/
2682 {
2683  TSeqdescList* descrs = nullptr;
2684  if (seq_entry.IsSeq()) {
2685  if (seq_entry.GetSeq().IsSetDescr())
2686  descrs = &seq_entry.SetSeq().SetDescr().Set();
2687  } else if (seq_entry.IsSet()) {
2688  if (seq_entry.GetSet().IsSetDescr())
2689  descrs = &seq_entry.SetSet().SetDescr().Set();
2690  }
2691 
2692  if (! descrs)
2693  return;
2694 
2695  for (TSeqdescList::iterator descr = descrs->begin(); descr != descrs->end();) {
2696  if (! (*descr)->IsUser()) {
2697  ++descr;
2698  continue;
2699  }
2700 
2701  const CUser_object& user_obj = (*descr)->GetUser();
2702  if (! user_obj.IsSetType() || ! user_obj.GetType().IsStr() ||
2703  user_obj.GetType().GetStr() != "NcbiCleanup") {
2704  ++descr;
2705  continue;
2706  }
2707 
2708  descr = descrs->erase(descr);
2709  break;
2710  }
2711 }
2712 
2713 /**********************************************************/
2715  bool is_tsa)
2716 {
2717  bool got_comment = false;
2718  bool got_dblink = false;
2719 
2720  for (const auto& descr : bioseq.GetDescr().Get()) {
2721  if (! descr->IsUser())
2722  continue;
2723 
2724  const CUser_object& user_obj = descr->GetUser();
2725  if (! user_obj.IsSetType() || ! user_obj.GetType().IsStr())
2726  continue;
2727 
2728  const string& user_type_str = user_obj.GetType().GetStr();
2729 
2730  if (user_type_str == "StructuredComment")
2731  got_comment = true;
2732  else if (user_type_str == "GenomeProjectsDB")
2733  got_dblink = true;
2734  else if (user_type_str == "DBLink") {
2735  for (const auto& field : user_obj.GetData()) {
2736  if (! field->IsSetLabel() || ! field->GetLabel().IsStr() ||
2737  field->GetLabel().GetStr() != "BioProject")
2738  continue;
2739  got_dblink = true;
2740  break;
2741  }
2742  }
2743  }
2744 
2745  if (! is_tsa) {
2746  if (! got_comment)
2747  ErrPostStr(SEV_WARNING, ERR_ENTRY_TLSLacksStructuredComment, "This TLS record lacks an expected structured comment.");
2748  if (! got_dblink)
2749  ErrPostStr(SEV_WARNING, ERR_ENTRY_TLSLacksBioProjectLink, "This TLS record lacks an expected BioProject or Project link.");
2750  } else {
2751  if (! got_comment)
2752  ErrPostStr(SEV_WARNING, ERR_ENTRY_TSALacksStructuredComment, "This TSA record lacks an expected structured comment.");
2753  if (! got_dblink)
2754  ErrPostStr(SEV_WARNING, ERR_ENTRY_TSALacksBioProjectLink, "This TSA record lacks an expected BioProject or Project link.");
2755  }
2756 }
2757 
2758 /**********************************************************/
// fta_set_molinfo_completeness
// Declared as: void fta_set_molinfo_completeness(CBioseq& bioseq, const Indexblk* ibp)
//
// Acts only on records whose topology is circular and, when ibp is given,
// that have no gaps (ibp->gaps is null). For those it locates the MolInfo
// descriptor of the Bioseq, creating and appending one if none exists.
// NOTE(review): the statements that modify the MolInfo object (original
// lines 2773 and 2777) are not visible here; judging by the function name
// they presumably call SetCompleteness() -- confirm against the full file.
2760 {
// Nothing to do for non-circular molecules or for records with gaps.
2761  if (bioseq.GetInst().GetTopology() != CSeq_inst::eTopology_circular || (ibp && ibp->gaps))
2762  return;
2763 
// Look for an existing MolInfo descriptor.
2764  CMolInfo* mol_info = nullptr;
2765  for (auto& descr : bioseq.SetDescr().Set()) {
2766  if (descr->IsMolinfo()) {
2767  mol_info = &descr->SetMolinfo();
2768  break;
2769  }
2770  }
2771 
2772  if (mol_info) {
// (statement on the existing MolInfo not visible -- see NOTE above)
2774  } else {
// No MolInfo yet: create a new descriptor and attach it to the Bioseq.
2775  CRef<CSeqdesc> descr(new CSeqdesc);
2776  CMolInfo& mol = descr->SetMolinfo();
// (statement on the new MolInfo not visible -- see NOTE above)
2778 
2779  bioseq.SetDescr().Set().push_back(descr);
2780  }
2781 }
2782 
2783 /**********************************************************/
2785 {
2786  if (num < 1000)
2787  return;
2788 
2789  ErrPostEx(SEV_INFO, ERR_SEQUENCE_HasManyComponents, "An OnlyNearFeatures FeatureFetchPolicy User-object has been added to this record because it is constructed from %d components, which exceeds the threshold of 999 for User-object creation.", num);
2790 
2791  CRef<CSeqdesc> descr(new CSeqdesc);
2792  descr->SetUser().SetType().SetStr("FeatureFetchPolicy");
2793 
2794  CRef<CUser_field> field(new CUser_field);
2795 
2796  field->SetLabel().SetStr("Policy");
2797  field->SetData().SetStr("OnlyNearFeatures");
2798 
2799  descr->SetUser().SetData().push_back(field);
2800 
2801  bsp.SetDescr().Set().push_back(descr);
2802 }
2803 
2804 /**********************************************************/
// Removes every "{ECO:...}" evidence tag from "str" in place.
//
// Together with the tag itself, one blank directly in front of it is
// removed, and if that leaves the tag between two identical '.' or ';'
// characters, one of the two is removed as well. An opening "{ECO:" without
// a closing '}' ends the processing and leaves the rest of the text as is.
void StripECO(string& str)
{
    static const char kEcoOpen[] = "{ECO:";

    size_t search_from = 0;
    for (;;) {
        size_t cut_begin = str.find(kEcoOpen, search_from);
        if (cut_begin == string::npos)
            return;

        const size_t closing = str.find('}', cut_begin);
        if (closing == string::npos)
            return;
        const size_t cut_end = closing + 1;

        // Swallow a single blank in front of the tag.
        if (cut_begin > 0 && str[cut_begin - 1] == ' ')
            --cut_begin;

        // "X. {ECO:..}." -> "X."  and  "X; {ECO:..};" -> "X;"
        if (cut_begin > 0 && cut_end < str.size()) {
            const char before = str[cut_begin - 1];
            const char after  = str[cut_end];
            if ((before == '.' && after == '.') || (before == ';' && after == ';'))
                --cut_begin;
        }

        str.erase(cut_begin, cut_end - cut_begin);
        search_from = cut_begin;
    }
}
2823 
2824 /**********************************************************/
2826 {
2827  if (uop.Empty() || ! uop->IsSetData() || ! uop->IsSetType() ||
2828  ! uop->GetType().IsStr() || uop->GetType().GetStr() != "DBLink")
2829  return false;
2830 
2831  bool got = false;
2832 
2833  for (const auto& field : uop->GetData()) {
2834  if (! field->IsSetData() || ! field->GetData().IsStrs() || ! field->IsSetNum() || field->GetNum() < 1 ||
2835  ! field->IsSetLabel() || ! field->GetLabel().IsStr() || field->GetLabel().GetStr() != "Sequence Read Archive")
2836  continue;
2837 
2838  for (const CStringUTF8& str : field->GetData().GetStrs()) {
2839  if (str.size() > 2 &&
2840  (str[0] == 'D' || str[0] == 'E' || str[0] == 'S') && str[1] == 'R' &&
2841  (str[2] == 'R' || str[2] == 'X' || str[2] == 'Z')) {
2842  got = true;
2843  break;
2844  }
2845  }
2846  if (got)
2847  break;
2848  }
2849  return (got);
2850 }
2851 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static bool s_IsConOrScaffold(CBioseq_Handle bsh)
Definition: add.cpp:691
USING_SCOPE(objects)
static void CreateSeqGap(CSeq_literal &seq_lit, GapFeatsPtr gfp)
Definition: add.cpp:313
static void fta_fix_seq_id(CSeq_loc &loc, CSeq_id &id, IndexblkPtr ibp, const char *location, const char *name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
Definition: add.cpp:2122
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:220
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:500
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
Definition: add.cpp:2069
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:907
#define SHORT_GAP
Definition: add.cpp:83
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2759
static void fta_validate_assembly(char *name)
Definition: add.cpp:1465
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:787
static bool fta_ranges_to_hist(const CGB_block::TExtra_accessions &extra_accs)
Definition: add.cpp:602
static int sGetPrefixLength(const CTempString &accession)
Definition: add.cpp:775
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:334
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1112
bool g_DoesNotReferencePrimary(const CDelta_ext &delta_ext, const CSeq_id &primary, CScope &scope)
Definition: add.cpp:724
static bool fta_validate_bioproject(char *name, Parser::ESource source)
Definition: add.cpp:1495
bool fta_if_valid_biosample(const Char *id, bool dblink)
Definition: add.cpp:1751
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2662
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
Definition: add.cpp:1604
bool fta_strings_same(const char *s1, const char *s2)
Definition: add.cpp:897
bool check_cds(const DataBlk &entry, Parser::EFormat format)
Definition: add.cpp:258
static bool s_IsAccession(const CSeq_id &id)
Definition: add.cpp:709
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2784
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2714
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2681
bool fta_if_valid_sra(const Char *id, bool dblink)
Definition: add.cpp:1730
static void fta_do_fix_seq_loc_id(TSeqLocList &locs, IndexblkPtr ibp, const char *location, const char *name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
Definition: add.cpp:2255
static ValNodePtr fta_tokenize_dblink(char *str, Parser::ESource source)
Definition: add.cpp:1774
#define HTG_GAP
Definition: add.cpp:82
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2825
static ValNodePtr fta_vnp_structured_comment(char *buf)
Definition: add.cpp:2399
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
Definition: add.cpp:1447
char * StringRStr(char *where, const char *what)
Definition: add.cpp:1432
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:382
void fta_get_dblink_user_object(TSeqdescList &descrs, char *offset, size_t len, Parser::ESource source, bool *drop, CRef< CUser_object > &dbuop)
Definition: add.cpp:1944
bool fta_number_is_huge(const Char *s)
Definition: add.cpp:1053
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:297
string tata_save(string_view t)
Definition: add.cpp:148
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
Definition: add.cpp:190
static void fta_tpa_block_free(FTATpaBlockPtr ftbp)
Definition: add.cpp:128
static ValNodePtr fta_tokenize_project(char *str, Parser::ESource source, bool newstyle)
Definition: add.cpp:1530
static CRef< CUser_object > fta_build_structured_comment(char *tag, char *buf)
Definition: add.cpp:2473
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2546
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, const char *location, const char *name, bool iscon)
Definition: add.cpp:2293
static void fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp, Int4 length, bool tpa)
Definition: add.cpp:978
void StripECO(string &str)
Definition: add.cpp:2805
CRef< CPatent_seq_id > MakeUsptoPatSeqId(const char *acc)
Definition: asci_blk.cpp:884
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
Definition: Seq_entry.hpp:56
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:329
size_t len
Definition: ftablock.h:330
CFlatFileData * mpData
Definition: ftablock.h:328
DataBlk * mpNext
Definition: ftablock.h:333
int mType
Definition: ftablock.h:327
@ ParFlat_FH
Definition: embl.h:52
#define ParFlat_COL_DATA_EMBL
Definition: embl.h:38
#define ERR_REFERENCE_Illegalreference
Definition: flat2err.h:287
#define ERR_LOCATION_TpaAndNonTpa
Definition: flat2err.h:401
#define ERR_TPA_SpanLengthDiff
Definition: flat2err.h:591
#define ERR_COMMENT_SameStructuredCommentTags
Definition: flat2err.h:97
#define ERR_TSA_SpanLengthDiff
Definition: flat2err.h:607
#define ERR_TPA_InvalidPrimarySeqId
Definition: flat2err.h:588
#define ERR_TSA_SpanDiffOver300bp
Definition: flat2err.h:608
#define ERR_ENTRY_TLSLacksBioProjectLink
Definition: flat2err.h:92
#define ERR_FORMAT_InvalidBioProjectAcc
Definition: flat2err.h:73
#define ERR_TPA_IncompleteCoverage
Definition: flat2err.h:590
#define ERR_DBLINK_InvalidIdentifier
Definition: flat2err.h:612
#define ERR_SEQUENCE_HasManyComponents
Definition: flat2err.h:158
#define ERR_LOCATION_CrossDatabaseFeatLoc
Definition: flat2err.h:402
#define ERR_COMMENT_StructuredCommentLacksDelim
Definition: flat2err.h:98
#define ERR_TPA_InvalidPrimarySpan
Definition: flat2err.h:587
#define ERR_ENTRY_TSALacksStructuredComment
Definition: flat2err.h:89
#define ERR_FORMAT_WrongBioProjectPrefix
Definition: flat2err.h:72
#define ERR_LOCATION_SeqIdProblem
Definition: flat2err.h:400
#define ERR_SEQUENCE_MultipleWGSProjects
Definition: flat2err.h:159
#define ERR_TSA_IncompleteCoverage
Definition: flat2err.h:606
#define ERR_ACCESSION_CannotGetDivForSecondary
Definition: flat2err.h:171
#define ERR_ENTRY_TSALacksBioProjectLink
Definition: flat2err.h:90
#define ERR_TPA_SpanDiffOver300bp
Definition: flat2err.h:592
#define ERR_FORMAT_ContigVersusAssemblyGapMissmatch
Definition: flat2err.h:71
#define ERR_TSA_InvalidPrimaryBlock
Definition: flat2err.h:605
#define ERR_TSA_InvalidPrimarySpan
Definition: flat2err.h:603
#define ERR_FEATURE_AllNsBetweenGaps
Definition: flat2err.h:368
#define ERR_FEATURE_InvalidGapSequence
Definition: flat2err.h:369
#define ERR_FORMAT_IncorrectDBLINK
Definition: flat2err.h:69
#define ERR_FEATURE_NsAbutGap
Definition: flat2err.h:367
#define ERR_ENTRY_TLSLacksStructuredComment
Definition: flat2err.h:91
#define ERR_LOCATION_ContigAndScaffold
Definition: flat2err.h:405
#define ERR_ACCESSION_WGSPrefixMismatch
Definition: flat2err.h:177
#define ERR_DBLINK_DuplicateIdentifierRemoved
Definition: flat2err.h:613
#define ERR_SEQUENCE_HTGPossibleShortGap
Definition: flat2err.h:152
#define ERR_TPA_InvalidPrimaryBlock
Definition: flat2err.h:589
#define ERR_SEQUENCE_HTGPhaseZeroHasGap
Definition: flat2err.h:154
#define ERR_COMMENT_InvalidStructuredComment
Definition: flat2err.h:96
#define ERR_KEYWORD_MultipleHTGPhases
Definition: flat2err.h:203
#define ERR_SEQUENCE_HTGWithoutGaps
Definition: flat2err.h:151
#define ERR_TSA_InvalidPrimarySeqId
Definition: flat2err.h:604
std::list< std::string > TKeywordList
Definition: ftablock.h:163
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:61
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:62
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:131
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:121
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:111
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:90
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:93
void FtaInstallPrefix(int prefix, const char *name, const char *location)
Definition: ftaerr.cpp:321
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
static const char location[]
Definition: config.c:97
@ ParFlat_FEATURES
Definition: genbank.h:51
#define ParFlat_COL_DATA
Definition: genbank.h:37
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
ErrSev
Definition: ncbierr.hpp:63
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:562
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
Definition: Seq_loc.cpp:2585
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id & GetSeq_id(void) const
Get seq_id of the current location.
Definition: Seq_loc.hpp:1028
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
Definition: Seq_loc.cpp:3467
CSeq_id_Handle GetAccVer(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get accession.version Seq-id Returns null CSeq_id_Handle if the sequence is not found or if it doesn'...
Definition: scope.cpp:413
TBioseqHandles GetBioseqHandles(const TIds &ids)
Get bioseq handles for all ids.
Definition: scope.cpp:143
const TInst_Ext & GetInst_Ext(void) const
bool IsSetInst_Ext(void) const
bool IsSetInst_Repr(void) const
TInst_Repr GetInst_Repr(void) const
TObjectType * GetNCPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1174
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
std::string CStringUTF8
Definition: ncbistl.hpp:254
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
Definition: tempstr.hpp:306
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
static const char label[]
list< string > TExtra_accessions
Definition: GB_block_.hpp:91
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void SetLabel(TLabel &value)
Assign a value to Label data member.
const TData & GetData(void) const
Get the Data member data.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
const TType & GetType(void) const
Get the Type member data.
Tdata & Set(void)
Assign a value to data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
void SetTo(TTo value)
Assign a value to To data member.
bool IsGenbank(void) const
Check if variant Genbank is selected.
Definition: Seq_id_.hpp:841
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsTpg(void) const
Check if variant Tpg is selected.
Definition: Seq_id_.hpp:928
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool IsPacked_pnt(void) const
Check if variant Packed_pnt is selected.
Definition: Seq_loc_.hpp:546
bool IsTpd(void) const
Check if variant Tpd is selected.
Definition: Seq_id_.hpp:940
bool IsOther(void) const
Check if variant Other is selected.
Definition: Seq_id_.hpp:871
void SetId(TId &value)
Assign a value to Id data member.
bool IsEmbl(void) const
Check if variant Embl is selected.
Definition: Seq_id_.hpp:847
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
void SetFrom(TFrom value)
Assign a value to From data member.
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
void SetStrand(TStrand value)
Assign a value to Strand data member.
bool IsTpe(void) const
Check if variant Tpe is selected.
Definition: Seq_id_.hpp:934
bool IsPnt(void) const
Check if variant Pnt is selected.
Definition: Seq_loc_.hpp:540
const TAccession & GetAccession(void) const
Get the Accession member data.
bool IsDdbj(void) const
Check if variant Ddbj is selected.
Definition: Seq_id_.hpp:910
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
void SetCompleteness(TCompleteness value)
Assign a value to Completeness data member.
Definition: MolInfo_.hpp:600
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
list< CRef< CSeq_align > > TAssembly
Definition: Seq_hist_.hpp:248
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_gap_.hpp:291
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
bool IsSetDescr(void) const
Descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
Definition: Seq_gap_.hpp:375
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
Definition: Seq_gap_.hpp:338
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2244
int fta_if_wgs_acc(string_view accession)
Definition: indx_blk.cpp:1190
bool isSupportedAccession(CSeq_id::E_Choice type)
Definition: indx_blk.cpp:2224
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:992
char * buf
int i
int len
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
static const BitmapCharRec ch1
Definition: ncbi_10x20.c:1827
static const BitmapCharRec ch2
Definition: ncbi_10x20.c:1819
#define fseek
EIPRangeType t
Definition: ncbi_localip.c:101
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
std::list< SeqLoc > TSeqLocList
static Format format
Definition: njn_ioutil.cpp:53
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Int4 from2
Definition: add.cpp:112
Int4 version
Definition: add.cpp:111
CSeq_id::E_Choice sicho
Definition: add.cpp:115
ENa_strand strand
Definition: add.cpp:114
Int4 from1
Definition: add.cpp:108
Int4 to2
Definition: add.cpp:113
char * accession
Definition: add.cpp:110
FTATpaBlock * next
Definition: add.cpp:116
Int4 to1
Definition: add.cpp:109
Int4 from
Definition: add.cpp:121
FTATpaSpan * next
Definition: add.cpp:123
Int4 to
Definition: add.cpp:122
Int4 from
Definition: ftablock.h:119
string gap_type
Definition: ftablock.h:125
Int4 to
Definition: ftablock.h:120
GapFeats * next
Definition: ftablock.h:130
objects::CLinkage_evidence::TLinkage_evidence asn_linkage_evidence
Definition: ftablock.h:128
objects::CSeq_gap::TType asn_gap_type
Definition: ftablock.h:127
bool assembly_gap
Definition: ftablock.h:124
bool rightNs
Definition: ftablock.h:123
Int4 estimated_length
Definition: ftablock.h:121
bool leftNs
Definition: ftablock.h:122
Char acnum[200]
Definition: ftablock.h:166
size_t qsoffset
Definition: ftablock.h:229
Int2 htg
Definition: ftablock.h:196
Int2 vernum
Definition: ftablock.h:167
bool drop
Definition: ftablock.h:182
GapFeatsPtr gaps
Definition: ftablock.h:214
Char locusname[200]
Definition: ftablock.h:170
size_t qslength
Definition: ftablock.h:230
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
const Char * wgsscaf
Definition: add.cpp:92
Int4 ddbj
Definition: add.cpp:98
const Char * wgscont
Definition: add.cpp:91
Int4 tpd
Definition: add.cpp:102
Int4 genbank
Definition: add.cpp:93
Int4 pir
Definition: add.cpp:95
CSeq_loc * badslp
Definition: add.cpp:89
Int4 tpg
Definition: add.cpp:100
Int4 tpe
Definition: add.cpp:101
Int4 total
Definition: add.cpp:103
Int4 swissprot
Definition: add.cpp:96
Int4 prf
Definition: add.cpp:99
Int4 other
Definition: add.cpp:97
Int4 embl
Definition: add.cpp:94
const Char * wgsacc
Definition: add.cpp:90
ValNode * next
Definition: valnode.h:51
char * data
Definition: valnode.h:49
Definition: type.c:6
#define _ASSERT
CScope & GetScope()
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1507
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1496
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:994
char * SrchTheStr(char *bptr, char *eptr, const char *leadstr)
Definition: utilfun.cpp:779
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
Definition: utilfun.cpp:197
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
Definition: valnode.cpp:53
ValNodePtr ValNodeFree(ValNodePtr vnp)
Definition: valnode.cpp:76
ValNodePtr ValNodeFreeData(ValNodePtr vnp)
Definition: valnode.cpp:96
Modified on Fri Sep 20 14:58:24 2024 by modify_doxy.py rev. 669887