NCBI C++ ToolKit
add.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: add.cpp 102137 2024-04-08 15:37:15Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Name: add.cpp
27  *
28  * Author: Karl Sirotkin, Hsiu-Chuan Chen
29  *
30  * File Description:
31  * Additional parser functions.
32  *
33  */
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "ftacpp.hpp"
38 #include <objects/seq/Seq_gap.hpp>
44 #include <objects/seq/MolInfo.hpp>
45 #include <objects/seq/Seq_inst.hpp>
46 #include <objects/seq/Seq_ext.hpp>
47 #include <objects/seq/Seq_hist.hpp>
63 
64 #include "index.h"
65 #include "genbank.h" /* for ParFlat_FEATURES */
66 #include "embl.h" /* for ParFlat_FH */
67 
69 #include "ftanet.h"
70 
71 #include "ftaerr.hpp"
72 #include "indx_blk.h"
73 #include "asci_blk.h"
74 #include "utilfun.h"
75 #include "add.h"
76 
77 #ifdef THIS_FILE
78 # undef THIS_FILE
79 #endif
80 #define THIS_FILE "add.cpp"
81 
82 #define HTG_GAP 100
83 #define SHORT_GAP 20
84 
87 
// Bookkeeping record about the Seq-ids met while examining a location.
// NOTE(review): the listing numbering jumps (92 -> 94, 95 -> 97), so some
// members of this struct are not visible here; the comments below cover only
// the fields that are shown.
88 struct SeqLocIds {
// Pointer to a CSeq_loc; presumably the first offending location - TODO confirm against the users of this struct.
89  CSeq_loc* badslp = nullptr;
// Non-owning C-string pointers; by their names they refer to WGS project,
// contig and scaffold accessions - TODO confirm.
90  const Char* wgsacc = nullptr;
91  const Char* wgscont = nullptr;
92  const Char* wgsscaf = nullptr;
// Integer members, all starting at zero; presumably per-source-database
// counters plus a grand total - TODO confirm.
94  Int4 embl = 0;
95  Int4 pir = 0;
97  Int4 other = 0;
98  Int4 ddbj = 0;
99  Int4 prf = 0;
100  Int4 tpg = 0;
101  Int4 tpe = 0;
102  Int4 tpd = 0;
103  Int4 total = 0;
104 };
106 
// One parsed line of a TPA/TSA "PRIMARY" block, kept in a singly linked list.
// NOTE(review): the listing numbering jumps (110 -> 112, 113 -> 116); code
// later in this file reads members named "version" and "sicho" that are not
// visible here, so they are presumably declared on the dropped lines.
107 struct FTATpaBlock {
// Span on this (TPA/TSA) record, as reported in the span diagnostics.
108  Int4 from1 = 0;
109  Int4 to1 = 0;
// Accession of the contributing record; stored via StringSave() by the parser.
110  char* accession = nullptr;
// Span on the contributing primary sequence or trace record.
112  Int4 from2 = 0;
113  Int4 to2 = 0;
// Next block in the list, or nullptr at the end.
116  FTATpaBlock* next = nullptr;
117 };
119 
120 struct FTATpaSpan {
121  Int4 from = 0;
122  Int4 to = 0;
123  FTATpaSpan* next = nullptr;
124 };
126 
127 /**********************************************************/
129 {
131 
132  for (; ftbp; ftbp = next) {
133  next = ftbp->next;
134  if (ftbp->accession)
135  MemFree(ftbp->accession);
136  delete ftbp;
137  }
138 }
139 
/**********************************************************
 *
 *   string tata_save(t):
 *
 *      Returns a cleaned-up copy of "t":
 *        - leading whitespace and commas are removed;
 *        - every newline, together with the whitespace run
 *          that follows it, is replaced by a single blank;
 *        - trailing blanks, tabs, semicolons, commas and
 *          double quotes are removed.
 *      An empty input yields an empty string.
 *
 **********************************************************/
std::string tata_save(std::string_view t)
{
    // <cctype> classifiers are undefined for negative values, so a plain
    // (possibly signed) char must be converted to unsigned char first.
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    // strip from beginning
    std::size_t pos = 0;
    while (pos < t.size() && (is_space(t[pos]) || t[pos] == ','))
        ++pos;

    // join the lines: a newline plus the indentation that follows it
    // collapses into one blank (single pass, no repeated erase)
    std::string str;
    str.reserve(t.size() - pos);
    for (; pos < t.size(); ++pos) {
        if (t[pos] != '\n') {
            str += t[pos];
            continue;
        }
        str += ' ';
        while (pos + 1 < t.size() && is_space(t[pos + 1]))
            ++pos;
    }

    // strip from end
    while (! str.empty()) {
        const char c = str.back();
        if (c != ' ' && c != ';' && c != ',' && c != '\"' && c != '\t')
            break;
        str.pop_back();
    }

    return str;
}
188 
189 /**********************************************************/
191 {
192  bool no_create = true;
193  bool no_update = true;
194 
195  for (const auto& desc : descrs) {
196  if (desc->IsCreate_date())
197  no_create = false;
198  else if (desc->IsUpdate_date())
199  no_update = false;
200 
201  if (no_create == false && no_update == false)
202  break;
203  }
204 
206  return (no_update);
207 
208  return (no_create || no_update);
209 }
210 
211 /**********************************************************
212  *
213  * bool no_reference(bsp):
214  *
215  * Search for at least one reference in bioseq->desr
216  * or in bioseq->annot.
217  * If no reference return TRUE.
218  *
219  **********************************************************/
220 bool no_reference(const CBioseq& bioseq)
221 {
222  for (const auto& desc : bioseq.GetDescr().Get()) {
223  if (desc->IsPub())
224  return false;
225  }
226 
227  for (const auto& annot : bioseq.GetAnnot()) {
228  if (! annot->IsFtable())
229  continue;
230 
231  for (const auto& feat : annot->GetData().GetFtable()) {
232  if (feat->IsSetData() && feat->GetData().IsPub())
233  return false;
234  }
235 
236  for (const auto& feat : annot->GetData().GetFtable()) {
237  if (! feat->IsSetData() || ! feat->GetData().IsImp())
238  continue;
239 
240  const CImp_feat& imp = feat->GetData().GetImp();
241  if (imp.GetKey() == "Site-ref") {
242  ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference, "The entry has only 'sites' references");
243  return false;
244  }
245  }
246  }
247 
248  return true;
249 }
250 
251 /**********************************************************
252  *
253  * bool check_cds(entry, format):
254  *
255  * Returns TRUE if CDS is in the entry.
256  *
257  **********************************************************/
259 {
260  DataBlkPtr temp;
261  DataBlkPtr dbp;
262  const char* str;
263  char* p;
264  Char ch;
265  Int2 type;
266 
267  if (format == Parser::EFormat::EMBL) {
268  type = ParFlat_FH;
269  str = "\nFT CDS ";
270  } else if (format == Parser::EFormat::GenBank) {
272  str = "\n CDS ";
273  } else
274  return false;
275 
276  for (temp = TrackNodeType(entry, type); temp; temp = temp->mpNext) {
277  if (temp->mType != type)
278  continue;
279 
280  size_t len = 0;
281  for (dbp = static_cast<DataBlk*>(temp->mpData); dbp; dbp = dbp->mpNext)
282  len += dbp->len;
283  if (len == 0)
284  continue;
285 
286  dbp = static_cast<DataBlk*>(temp->mpData);
287  ch = dbp->mOffset[len];
288  dbp->mOffset[len] = '\0';
289  p = StringStr(dbp->mOffset, str);
290  dbp->mOffset[len] = ch;
291 
292  if (p)
293  break;
294  }
295 
296  if (! temp)
297  return false;
298  return true;
299 }
300 
301 /**********************************************************/
// Installs the error-message prefix that identifies the entry described by ibp.
//   ibp    - index block of the entry; acnum, vernum and locusname are read.
//   accver - when true and vernum > 0, ".<vernum>" is appended to the accession.
// NOTE(review): listing line 306 is not visible here, so this function may do
// more than what is documented below.
302 void err_install(const Indexblk* ibp, bool accver)
303 {
304  string temp;
305 
307  temp = ibp->acnum;
308  if (accver && ibp->vernum > 0) {
309  temp += '.';
310  temp += to_string(ibp->vernum);
311  }
// No accession available: use the locus name as the label instead.
312  if (temp.empty())
313  temp = ibp->locusname;
314  FtaInstallPrefix(PREFIX_ACCESSION, temp.c_str());
315 }
316 
317 /**********************************************************/
// Turns seq_lit into a gap literal described by the gap feature gfp.
// Does nothing when gfp is null.
// NOTE(review): listing lines 329 and 333 are not visible here; the innermost
// "if" below therefore looks empty although it presumably is not.
318 static void CreateSeqGap(CSeq_literal& seq_lit, GapFeatsPtr gfp)
319 {
320  if (! gfp)
321  return;
322 
// SetGap() selects the gap variant of the literal's Seq-data.
323  CSeq_gap& sgap = seq_lit.SetSeq_data().SetGap();
324  sgap.SetType(gfp->asn_gap_type);
325 
// swap() hands the evidence list over to the Seq-gap; gfp's own list receives
// whatever the Seq-gap held before the call.
326  if (! gfp->asn_linkage_evidence.empty())
327  sgap.SetLinkage_evidence().swap(gfp->asn_linkage_evidence);
328 
// Only these three textual gap types are singled out here.
330  if (! gfp->gap_type.empty()) {
331  const string& gapType(gfp->gap_type);
332  if (gapType == "unknown" || gapType == "within scaffold" || gapType == "repeat within scaffold") {
334  }
335  }
336 }
337 
338 /**********************************************************/
339 void AssemblyGapsToDelta(CBioseq& bioseq, GapFeatsPtr gfp, bool* drop)
340 {
341  if (! bioseq.GetInst().IsSetExt() || ! bioseq.GetInst().GetExt().IsDelta() ||
342  ! gfp)
343  return;
344 
345  CDelta_ext::Tdata& deltas = bioseq.SetInst().SetExt().SetDelta();
346  CDelta_ext::Tdata::iterator delta = deltas.begin();
347  for (; delta != deltas.end(); ++delta) {
348  if (! gfp)
349  break;
350 
351  if (! (*delta)->IsLiteral()) /* not Seq-lit */
352  continue;
353 
354  CSeq_literal& literal = (*delta)->SetLiteral();
355  if (literal.GetLength() != static_cast<Uint4>(gfp->to - gfp->from + 1)) {
356  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch, "The lengths of the CONTIG/CO line gaps disagrees with the lengths of assembly_gap features. First assembly_gap with a mismatch is at \"%d..%d\".", gfp->from, gfp->to);
357  *drop = true;
358  break;
359  }
360 
361  CreateSeqGap(literal, gfp);
362 
363  gfp = gfp->next;
364  }
365 
366  if (*drop || (delta == deltas.end() && ! gfp))
367  return;
368 
369  if (delta == deltas.end() && gfp) {
370  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch, "The number of the assembly_gap features exceeds the number of CONTIG/CO line gaps. First extra assembly_gap is at \"%d..%d\".", gfp->from, gfp->to);
371  *drop = true;
372  } else if (delta != deltas.end() && ! gfp) {
373  for (; delta != deltas.end(); ++delta) {
374  if ((*delta)->IsLiteral()) /* Seq-lit */
375  break;
376  }
377 
378  if (delta == deltas.end())
379  return;
380 
381  ErrPostEx(SEV_REJECT, ERR_FORMAT_ContigVersusAssemblyGapMissmatch, "The number of the CONTIG/CO line gaps exceeds the number of assembly_gap features.");
382  *drop = true;
383  }
384 }
385 
386 /**********************************************************/
// Converts a raw sequence into a delta sequence using the gap features gfp.
// Step 1 checks every gap feature against the sequence text; step 2, skipped
// when *drop is set, builds the list of literals and installs it.
//   bioseq - bioseq whose Seq-inst is rewritten on success
//   gfp    - head of the gap feature list (from/to look 1-based: from - 1 is
//            used as the string index)
//   drop   - in/out flag; set when a gap covers something other than 'N'
// Returns without changes when gfp is null, no Seq-data is set, or the stored
// sequence is empty or its size differs from bioseq.GetLength().
// NOTE(review): listing line 456 is not visible here; "delta" used in the
// second loop is presumably declared there.
387 void GapsToDelta(CBioseq& bioseq, GapFeatsPtr gfp, bool* drop)
388 {
389  GapFeatsPtr tgfp;
390 
391  const Char* p;
392  Int4 prevto;
393  Int4 nextfrom;
394  Int4 i;
395 
396  if (! gfp || ! bioseq.GetInst().IsSetSeq_data())
397  return;
398 
// NOTE(review): GetIupacna() is called without checking the Seq-data variant -
// presumably callers guarantee IUPACna; confirm.
399  const string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
400 
401  if (sequence.empty() || sequence.size() != bioseq.GetLength())
402  return;
403 
// Step 1: validation of each gap feature, in list order.
404  for (prevto = 0, tgfp = gfp; tgfp; tgfp = tgfp->next) {
405  if (tgfp->next) {
// Scan the bases between this gap and the next one; if every one is 'N' the
// two gaps are flagged (rightNs / leftNs) and an error is posted.
406  p = sequence.c_str() + tgfp->to;
407  for (i = tgfp->to + 1; i < tgfp->next->from; p++, i++)
408  if (*p != 'N')
409  break;
410  if (i == tgfp->next->from && tgfp->next->from > tgfp->to + 1) {
411  ErrPostEx(SEV_ERROR, ERR_FEATURE_AllNsBetweenGaps, "A run of all-N sequence exists between the gap features located at \"%d..%d\" and \"%d..%d\".", tgfp->from, tgfp->to, tgfp->next->from, tgfp->next->to);
412  tgfp->rightNs = true;
413  tgfp->next->leftNs = true;
414  }
415  nextfrom = tgfp->next->from;
416  } else
417  nextfrom = bioseq.GetLength() + 1;
418 
// Ten 'N's immediately before the gap: warn about the gap boundary.
419  if (tgfp->leftNs == false && tgfp->from - prevto > 10) {
420  for (p = sequence.c_str() + tgfp->from - 11, i = 0; i < 10; p++, i++)
421  if (*p != 'N')
422  break;
423  if (i == 10) {
424  ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap, "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.", tgfp->from, tgfp->to);
425  }
426  }
427 
// Same check for the ten bases immediately after the gap.
428  if (tgfp->rightNs == false && nextfrom - tgfp->to > 10) {
429  for (p = sequence.c_str() + tgfp->to, i = 0; i < 10; p++, i++)
430  if (*p != 'N')
431  break;
432  if (i == 10) {
433  ErrPostEx(SEV_WARNING, ERR_FEATURE_NsAbutGap, "A run of N's greater or equal than 10 abuts the gap feature at \"%d..%d\" : possible problem with the boundaries of the gap.", tgfp->from, tgfp->to);
434  }
435  }
436 
// The gap itself must consist of 'N' only, otherwise the entry is rejected.
437  for (i = tgfp->from - 1, p = sequence.c_str() + i; i < tgfp->to; p++, i++)
438  if (*p != 'N')
439  break;
440  if (i < tgfp->to) {
441  ErrPostEx(SEV_REJECT, ERR_FEATURE_InvalidGapSequence, "The sequence data associated with the gap feature at \"%d..%d\" contains basepairs other than N.", tgfp->from, tgfp->to);
442  *drop = true;
443  }
444 
445  prevto = tgfp->to;
446  }
447 
448  if (*drop)
449  return;
450 
// Step 2: build the delta list - the sequence text before each gap, the gap
// literal itself, and finally the text after the last gap.
451  CDelta_ext::Tdata deltas;
452 
453  for (prevto = 0, tgfp = gfp;; tgfp = tgfp->next) {
454  Int4 len = 0;
455 
// Bases between the previous gap (or the start) and this gap.
457  if (tgfp->from - prevto - 1 > 0) {
458  len = tgfp->from - prevto - 1;
459  delta->SetLiteral().SetLength(len);
460  delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
461 
462  deltas.push_back(delta);
463 
464  delta.Reset(new CDelta_seq);
465  }
466 
// The gap literal: length only, no sequence data. estimated_length == -100 is
// treated as a special marker (fuzz "lim"); any other estimate that differs
// from the feature length is stored as a fuzz range [estimate, len].
467  len = tgfp->to - tgfp->from + 1;
468  delta->SetLiteral().SetLength(len);
469  if (tgfp->estimated_length == -100) {
470  delta->SetLiteral().SetFuzz().SetLim();
471  } else if (tgfp->estimated_length != len) {
472  delta->SetLiteral().SetFuzz().SetRange().SetMin(tgfp->estimated_length);
473  delta->SetLiteral().SetFuzz().SetRange().SetMax(len);
474  }
475 
476  if (tgfp->assembly_gap)
477  CreateSeqGap(delta->SetLiteral(), tgfp);
478 
479  deltas.push_back(delta);
480 
481  prevto = tgfp->to;
482 
// Last gap: append whatever sequence follows it, then stop.
483  if (! tgfp->next) {
484  if (bioseq.GetLength() - prevto > 0) {
485  delta.Reset(new CDelta_seq);
486 
487  len = bioseq.GetLength() - prevto;
488  delta->SetLiteral().SetLength(len);
489  delta->SetLiteral().SetSeq_data().SetIupacna().Set() = sequence.substr(prevto, len);
490 
491  deltas.push_back(delta);
492  }
493  break;
494  }
495  }
496 
// Install the delta representation and drop the raw sequence data.
497  if (! deltas.empty()) {
498  bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
499  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
500  bioseq.SetInst().ResetSeq_data();
501  }
502 }
503 
504 /**********************************************************/
// Splits a raw sequence into a delta sequence at every run of at least
// HTG_GAP consecutive 'N's, then posts HTG-related warnings.
//   bioseq - bioseq to convert in place
//   tech   - compared with 1 (the message speaks of Phase 1) and 4 (Phase 0)
// Returns without changes when no Seq-data is set, or the stored sequence is
// empty or its size differs from bioseq.GetLength().
// NOTE(review): listing lines 543 and 572 are not visible here; "delta" is
// presumably declared on them.
505 void SeqToDelta(CBioseq& bioseq, Int2 tech)
506 {
507  char* p;
508  char* q;
509  char* r;
510 
511  Int4 i;
512  Int4 j;
513  Int4 gotcha;
514 
515  if (! bioseq.GetInst().IsSetSeq_data())
516  return;
517 
518  const string& sequence = bioseq.GetInst().GetSeq_data().GetIupacna();
519  if (sequence.empty() || sequence.size() != bioseq.GetLength())
520  return;
521 
// Work on a NUL-terminated copy, because the scan below writes into the buffer.
522  vector<Char> buf(sequence.begin(), sequence.end());
523  buf.push_back(0);
524  p = &buf[0];
// gotcha: 0 = nothing seen, 1 = a run of SHORT_GAP..HTG_GAP-1 'N's was seen
// first, 2 = at least one real gap was produced.
525  gotcha = 0;
526 
527  CDelta_ext::Tdata deltas;
528 
// q marks the start of the pending non-gap segment, p is the scan position.
529  for (q = p; *p != '\0';) {
530  if (*p != 'N') {
531  p++;
532  continue;
533  }
534 
// r = first 'N' of the run, i = run length; p ends just past the run.
535  for (r = p, p++, i = 1; *p == 'N'; i++)
536  p++;
537  if (i < HTG_GAP) {
538  if (i >= SHORT_GAP && gotcha == 0)
539  gotcha = 1;
540  continue;
541  }
542 
544  gotcha = 2;
545 
// Emit the sequence that precedes the run as its own literal.
546  if (r != q) {
547  *r = '\0';
548  j = (Int4)(r - q);
549 
550  delta->SetLiteral().SetLength(j);
551  delta->SetLiteral().SetSeq_data().SetIupacna().Set(string(q, r));
552 
553  deltas.push_back(delta);
554 
555  delta.Reset(new CDelta_seq);
556 
557  *r = 'N';
558  }
559 
// Emit the gap literal (length only). A run of exactly 100 gets a fuzz "lim".
// NOTE(review): the literal 100 equals HTG_GAP here; confirm they are meant to
// stay in sync.
560  delta->SetLiteral().SetLength(i);
561  if (i == 100) {
562  delta->SetLiteral().SetFuzz().SetLim();
563  }
564 
565  deltas.push_back(delta);
566  q = p;
567  }
568 
// Trailing sequence after the last gap (or the whole sequence if no gap).
569  if (p > q) {
570  j = (Int4)(p - q);
571 
573  delta->SetLiteral().SetLength(j);
574  delta->SetLiteral().SetSeq_data().SetIupacna().Set(string(q, p));
575 
576  deltas.push_back(delta);
577  }
578 
// Convert only when there is more than one segment.
579  if (deltas.size() > 1) {
580  bioseq.SetInst().SetExt().SetDelta().Set().swap(deltas);
581  bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
582  bioseq.SetInst().ResetSeq_data();
583  }
584 
585  if (bioseq.GetInst().GetRepr() != CSeq_inst::eRepr_delta && tech == 1) {
586  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGWithoutGaps, "This Phase 1 HTG sequence has no runs of 100 "
587  "or more N's to indicate gaps between component contigs. "
588  "This could be an error, or perhaps sequencing is finished "
589  "and this record should not be Phase 1.");
590  }
591 
592  if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_delta) {
593  if (tech == 4) /* Phase 0 */
594  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGPhaseZeroHasGap, "A Phase 0 HTG record usually consists of several reads "
595  "for one contig, and hence gaps are not expected. But "
596  "this record does have one (ore more) gaps, hence it "
597  "may require review.");
// NOTE(review): gotcha is set to 2 whenever a gap literal is emitted, so with
// the code visible here this branch looks reachable only if the bioseq was
// already a delta on entry - confirm the intended placement of this warning.
598  if (gotcha == 1)
599  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_HTGPossibleShortGap, "This sequence has one or more runs "
600  "of at least 20 N's. They could indicate gaps, "
601  "but have not been treated that way because "
602  "they are below the minimum of 100 N's.");
603  }
604 }
605 
606 /**********************************************************/
607 static bool fta_ranges_to_hist(const CGB_block::TExtra_accessions& extra_accs)
608 {
609  string ppacc1;
610  string ppacc2;
611  char* master;
612  char* range;
613  char* acc1;
614  char* acc2;
615  char* p;
616  char* q;
617  Char ch1;
618  Char ch2;
619 
620  if (extra_accs.empty())
621  return false;
622 
623  if (extra_accs.size() != 2)
624  return true;
625 
626  CGB_block::TExtra_accessions::const_iterator it = extra_accs.begin();
627 
628  ppacc1 = *it;
629  ++it;
630  ppacc2 = *it;
631  acc1 = ppacc1.data();
632  acc2 = ppacc2.data();
633 
634 
635  if (! acc1 && ! acc2)
636  return false;
637  if (! acc1 || ! acc2)
638  return true;
639 
640  p = StringChr(acc1, '-');
641  q = StringChr(acc2, '-');
642 
643  if (p && q)
644  return true;
645 
646  if (! p) {
647  master = acc1;
648  range = acc2;
649  if (q)
650  *q = '\0';
651  } else {
652  master = acc2;
653  range = acc1;
654  if (p) // ?
655  *p = '\0';
656  }
657 
658  if (fta_if_wgs_acc(master) != 0 || fta_if_wgs_acc(range) != 1) {
659  if (p)
660  *p = '-';
661  if (q)
662  *q = '-';
663  return true;
664  }
665 
666  if (p)
667  *p = '-';
668  if (q)
669  *q = '-';
670 
671  for (p = master; *p != '\0' && (*p < '0' || *p > '9');)
672  p++;
673  if (*p != '\0')
674  p++;
675  if (*p != '\0')
676  p++;
677  ch1 = *p;
678  *p = '\0';
679 
680  for (q = range; *q != '\0' && (*q < '0' || *q > '9');)
681  q++;
682  if (*q != '\0')
683  q++;
684  if (*q != '\0')
685  q++;
686  ch2 = *q;
687  *q = '\0';
688 
689  bool ret = (master != range);
690  *p = ch1;
691  *q = ch2;
692 
693  return ret;
694 }
695 
696 
698 {
699  if (bsh &&
700  bsh.IsSetInst_Repr() &&
702  bsh.IsSetInst_Ext()) {
703  const auto& ext = bsh.GetInst_Ext();
704  if (ext.IsDelta() &&
705  ext.GetDelta().IsSet()) {
706  const auto& delta = ext.GetDelta().Get();
707  return any_of(begin(delta),
708  end(delta),
709  [](CRef<CDelta_seq> pDeltaSeq) { return (pDeltaSeq && pDeltaSeq->IsLoc()); });
710  }
711  }
712  return false;
713 }
714 
// Returns false for the Seq-id kinds listed in the switch (local, general, GI)
// and true for every other kind, which the callers treat as an accession.
// NOTE(review): listing line 722 is not visible here; it presumably holds one
// more "case" label that also returns false.
715 static bool s_IsAccession(const CSeq_id& id)
716 {
717  const auto idType = id.Which();
718  switch (idType) {
719  case CSeq_id::e_Local:
720  case CSeq_id::e_General:
721  case CSeq_id::e_Gi:
723  return false;
724  default:
725  return true;
726  }
727 }
728 
729 
730 bool g_DoesNotReferencePrimary(const CDelta_ext& delta_ext, const CSeq_id& primary, CScope& scope)
731 {
732  const auto primaryType = primary.Which();
733  string primaryString = primary.GetSeqIdString();
734  const bool primaryIsAccession = s_IsAccession(primary);
735  const bool primaryIsGi = primaryIsAccession ? false : (primaryType == CSeq_id::e_Gi);
736 
737  unique_ptr<string> pPrimaryAccessionString;
738 
739  for (const auto& pDeltaSeq : delta_ext.Get()) {
740  if (pDeltaSeq && pDeltaSeq->IsLoc()) {
741  auto pId = pDeltaSeq->GetLoc().GetId();
742  const auto& deltaIdType = pId->Which();
743  if (deltaIdType == primaryType) {
744  if (pId->GetSeqIdString() == primaryString) {
745  return false;
746  }
747  } else {
748  if (primaryIsAccession && deltaIdType == CSeq_id::e_Gi) {
749  auto deltaHandle = CSeq_id_Handle::GetHandle(pId->GetGi());
750  auto deltaAccessionHandle = scope.GetAccVer(deltaHandle);
751  if (! deltaAccessionHandle) {
752  return false;
753  }
754 
755  if (deltaAccessionHandle.GetSeqId()->GetSeqIdString() ==
756  primaryString) {
757  return false;
758  }
759  } else if (primaryIsGi && s_IsAccession(*pId)) {
760  if (! pPrimaryAccessionString) {
761  auto primaryGiHandle = CSeq_id_Handle::GetHandle(primary.GetGi());
762  auto primaryAccessionHandle = scope.GetAccVer(primaryGiHandle);
763  if (! primaryAccessionHandle) {
764  return false;
765  }
766  pPrimaryAccessionString =
767  make_unique<string>(primaryAccessionHandle.GetSeqId()->GetSeqIdString());
768  }
769 
770  if (*pPrimaryAccessionString == pId->GetSeqIdString()) {
771  return false;
772  }
773  }
774  }
775  }
776  }
777  return true;
778 }
779 
780 
781 static int sGetPrefixLength(const CTempString& accession)
782 {
783  auto it = find_if(begin(accession),
784  end(accession),
785  [](char c) { return ! (isalpha(c) || c == '_'); });
786 
787  _ASSERT(it != accession.end());
788  return int(distance(accession.begin(), it));
789 }
790 
791 
792 /**********************************************************/
// Fills Seq-hist.replaces of bioseq from the entry's secondary accessions.
//   pp         - parser state; accver, histacc, source, entrez_fetch and
//                allow_uwsec are read
//   bioseq     - bioseq that receives the history
//   extra_accs - secondary accessions (may contain ranges)
//   source     - nothing is done unless pp->source equals it
//   acctype    - Seq-id kind compared with each secondary's kind
//   pricon     - true when the primary is itself treated as a CON/scaffold
//                record (inferred from its use with IsConOrScaffold below)
//   acc        - primary accession
// NOTE(review): listing line 805 is not visible here; "hist" is presumably
// declared there.
793 void fta_add_hist(ParserPtr pp, CBioseq& bioseq, CGB_block::TExtra_accessions& extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char* acc)
794 {
795  Int4 pri_acc;
796  Int4 sec_acc;
797 
// History is built only with accession.version mode, history generation and
// sequence fetching enabled, and only for the given source.
798  if (pp->accver == false || pp->histacc == false ||
799  pp->source != source || pp->entrez_fetch == 0)
800  return;
801 
802  if (! fta_ranges_to_hist(extra_accs))
803  return;
804 
806  UnwrapAccessionRange(extra_accs, hist);
807  if (hist.empty())
808  return;
809 
810  // IndexblkPtr ibp = pp->entrylist[pp->curindx];
811 
// fta_if_wgs_acc() result, as annotated further down: 0 = project,
// 1 = contig, 2 = scaffold WGS accession.
812  pri_acc = fta_if_wgs_acc(acc);
813 
814  CTempString primaryAccession(acc);
// Computed on first need; SIZE_TYPE is presumably unsigned, so "<= 0" below
// effectively means "== 0" - confirm.
815  SIZE_TYPE prefixLength = 0;
816 
817 
818  // bulk load sequences
// The three vectors are filled in parallel: same index, same candidate.
819  vector<string> candidatesAccs;
820  vector<CRef<CSeq_id>> candidatesIds;
821  vector<CSeq_id_Handle> candidatesIdhs;
822 
823  list<CRef<CSeq_id>> replaces;
824 
// Pass 1: filter the secondary accessions and collect the candidates.
825  for (const auto& accessionString : hist) {
826  if (accessionString.empty())
827  continue;
828 
829  const auto idChoice = GetNucAccOwner(accessionString);
830  if (idChoice == CSeq_id::e_not_set) {
831  continue;
832  }
833  sec_acc = fta_if_wgs_acc(accessionString);
834  if (sec_acc == 0) { // Project WGS accession
835  continue;
836  }
837 
838  if (sec_acc == 1) // Contig WGS accession
839  {
840  if (pri_acc == 0 || pri_acc == 2) { // A project WGS accession or
841  continue; // a scaffold WGS accession
842  }
843 
844  if (pri_acc == 1) { // Contig WGS accession
845  if (prefixLength <= 0) {
846  prefixLength = sGetPrefixLength(primaryAccession);
847  }
848 
// A contig secondary is skipped unless it starts with the primary's prefix
// followed by a digit; pp->allow_uwsec turns this check off. The length test
// comes first, so the index into accessionString is never out of range.
849  if ((accessionString.length() <= prefixLength ||
850  ! NStr::EqualNocase(accessionString, 0, prefixLength, primaryAccession.substr(0, prefixLength)) ||
851  ! isdigit(accessionString[prefixLength])) &&
852  ! pp->allow_uwsec) {
853  continue;
854  }
855  }
856  }
857 
858  CRef<CSeq_id> id(new CSeq_id(idChoice, accessionString));
859  candidatesAccs.push_back(accessionString);
860  candidatesIds.push_back(id);
861  candidatesIdhs.push_back(CSeq_id_Handle::GetHandle(*id));
862  }
863 
// Pass 2: fetch all candidates in one call, then decide for each one.
864  vector<CBioseq_Handle> secondaryBshs = GetScope().GetBioseqHandles(candidatesIdhs);
865  for ( size_t i = 0; i < candidatesIdhs.size(); ++i ) {
866  auto& accessionString = candidatesAccs[i];
867  auto id = candidatesIds[i];
868  auto idChoice = id->Which();
869  auto secondaryBsh = secondaryBshs[i];
870  bool IsConOrScaffold = false;
871  try {
872  IsConOrScaffold = s_IsConOrScaffold(secondaryBsh);
873  } catch (...) {
// NOTE(review): the message says "Entry dropped." but the code visible here
// only skips this secondary accession - confirm which is intended.
874  ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary, "Failed to determine division code for secondary accession \"%s\". Entry dropped.", accessionString.c_str());
875  continue;
876  }
877 
// CON primary with a non-CON secondary of the same id kind: not recorded.
878  if (! IsConOrScaffold && pricon && idChoice == acctype) {
879  continue;
880  }
881 
// Non-CON primary with a CON/scaffold secondary: recorded only when the
// secondary's delta does not reference the primary.
882  if (IsConOrScaffold && ! pricon) {
883  CRef<CSeq_id> pPrimary(new CSeq_id(primaryAccession));
884  if (g_DoesNotReferencePrimary(secondaryBsh.GetInst_Ext().GetDelta(),
885  *pPrimary,
886  GetScope())) {
887  replaces.push_back(id);
888  }
889  continue;
890  }
891 
892  replaces.push_back(id);
893  }
894 
895 
// splice() moves the collected ids to the end of the history list.
896  if (! replaces.empty()) {
897  auto& hist_replaces_ids = bioseq.SetInst().SetHist().SetReplaces().SetIds();
898  hist_replaces_ids.splice(hist_replaces_ids.end(), replaces);
899  }
900 }
901 
902 /**********************************************************/
903 bool fta_strings_same(const char* s1, const char* s2)
904 {
905  if (! s1 && ! s2)
906  return true;
907  if (! s1 || ! s2 || ! StringEqu(s1, s2))
908  return false;
909  return true;
910 }
911 
912 /**********************************************************/
914 {
915  bool deldiv = false;
916 
917  for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();) {
918  bool delnode = false;
919  bool errpost = false;
920  if (*key == "HTGS_PHASE0") {
921  if (ibp->htg != 0 && ibp->htg != 5) {
922  delnode = true;
923  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 3)
924  errpost = true;
925  } else {
926  ibp->htg = 4;
928  }
929  deldiv = true;
930  } else if (*key == "HTGS_PHASE1") {
931  if (ibp->htg != 0 && ibp->htg != 5) {
932  delnode = true;
933  if (ibp->htg == 2 || ibp->htg == 3 || ibp->htg == 4)
934  errpost = true;
935  } else {
936  ibp->htg = 1;
938  }
939  deldiv = true;
940  } else if (*key == "HTGS_PHASE2") {
941  if (ibp->htg != 0 && ibp->htg != 5) {
942  delnode = true;
943  if (ibp->htg == 1 || ibp->htg == 3 || ibp->htg == 4)
944  errpost = true;
945  } else {
946  ibp->htg = 2;
948  }
949  deldiv = true;
950  } else if (*key == "HTGS_PHASE3") {
951  if (ibp->htg != 0 && ibp->htg != 5) {
952  delnode = true;
953  if (ibp->htg == 1 || ibp->htg == 2 || ibp->htg == 4)
954  errpost = true;
955  } else {
956  ibp->htg = 3;
958  }
959  deldiv = true;
960  } else if (*key == "HTG") {
961  if (ibp->htg == 0) {
962  ibp->htg = 5;
964  }
965  deldiv = true;
966  }
967 
968  if (errpost) {
969  ErrPostEx(SEV_ERROR, ERR_KEYWORD_MultipleHTGPhases, "This entry has multiple HTG-related keywords, for differing HTG phases. Ignoring all but the first.");
970  }
971 
972  if (delnode)
973  key = kwds.erase(key);
974  else
975  ++key;
976  }
977  if (ibp->htg == 5)
978  ibp->htg = 3;
979 
980  return deldiv;
981 }
982 
983 /**********************************************************/
984 static void fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp, Int4 length, bool tpa)
985 {
986  FTATpaBlockPtr tftbp;
987  FTATpaSpanPtr ftsp;
988  FTATpaSpanPtr tftsp;
989  Int4 i1;
990  Int4 i2;
991  Int4 j;
992 
993  if (! ftbp || length < 1)
994  return;
995 
996  ftsp = new FTATpaSpan;
997  ftsp->from = ftbp->from1;
998  ftsp->to = ftbp->to1;
999  ftsp->next = nullptr;
1000  tftsp = ftsp;
1001  for (tftbp = ftbp; tftbp; tftbp = tftbp->next) {
1002  i1 = tftbp->to1 - tftbp->from1;
1003  i2 = tftbp->to2 - tftbp->from2;
1004  j = (i2 > i1) ? (i2 - i1) : (i1 - i2);
1005  i1++;
1006 
1007  if (i1 < 3000 && j * 10 > i1) {
1008  if (tpa)
1009  ErrPostEx(SEV_ERROR, ERR_TPA_SpanLengthDiff, "Span \"%d..%d\" of this TPA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1010  else
1011  ErrPostEx(SEV_ERROR, ERR_TSA_SpanLengthDiff, "Span \"%d..%d\" of this TSA record differs from the span \"%d..%d\" of the contributing primary sequence or trace record by more than 10 percent.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1012  }
1013 
1014  if (i1 >= 3000 && j > 300) {
1015  if (tpa)
1016  ErrPostEx(SEV_ERROR, ERR_TPA_SpanDiffOver300bp, "Span \"%d..%d\" of this TPA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1017  else
1018  ErrPostEx(SEV_ERROR, ERR_TSA_SpanDiffOver300bp, "Span \"%d..%d\" of this TSA record differs from span \"%d..%d\" of the contributing primary sequence or trace record by more than 300 basepairs.", tftbp->from1, tftbp->to1, tftbp->from2, tftbp->to2);
1019  }
1020 
1021  if (tftbp->from1 <= tftsp->to + 1) {
1022  if (tftbp->to1 > tftsp->to)
1023  tftsp->to = tftbp->to1;
1024  continue;
1025  }
1026 
1027  tftsp->next = new FTATpaSpan;
1028  tftsp = tftsp->next;
1029  tftsp->from = tftbp->from1;
1030  tftsp->to = tftbp->to1;
1031  }
1032 
1033  if (ftsp->from - 1 > 50) {
1034  if (tpa)
1035  ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage, "This TPA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->from - 1);
1036  else
1037  ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage, "This TSA record contains a sequence region \"1..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->from - 1);
1038  }
1039 
1040  for (; ftsp; ftsp = tftsp) {
1041  tftsp = ftsp->next;
1042  if (tftsp && tftsp->from - ftsp->to - 1 > 50) {
1043  if (tpa)
1044  ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage, "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, tftsp->from - 1);
1045  else
1046  ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage, "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, tftsp->from - 1);
1047  } else if (! tftsp && length - ftsp->to > 50) {
1048  if (tpa)
1049  ErrPostEx(SEV_ERROR, ERR_TPA_IncompleteCoverage, "This TPA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, length);
1050  else
1051  ErrPostEx(SEV_ERROR, ERR_TSA_IncompleteCoverage, "This TSA record contains a sequence region \"%d..%d\" greater than 50 basepairs long that is not accounted for by a contributing primary sequence or trace record.", ftsp->to + 1, length);
1052  }
1053 
1054  delete ftsp;
1055  }
1056 }
1057 
1058 /**********************************************************/
1059 bool fta_number_is_huge(const Char* s)
1060 {
1061  size_t i = StringLen(s);
1062  if (i > 10)
1063  return true;
1064  else if (i < 10)
1065  return false;
1066 
1067  if (*s > '2')
1068  return true;
1069  else if (*s < '2')
1070  return false;
1071 
1072  if (*++s > '1')
1073  return true;
1074  else if (*s < '1')
1075  return false;
1076 
1077  if (*++s > '4')
1078  return true;
1079  else if (*s < '4')
1080  return false;
1081 
1082  if (*++s > '7')
1083  return true;
1084  else if (*s < '7')
1085  return false;
1086 
1087  if (*++s > '4')
1088  return true;
1089  else if (*s < '4')
1090  return false;
1091 
1092  if (*++s > '8')
1093  return true;
1094  else if (*s < '8')
1095  return false;
1096 
1097  if (*++s > '3')
1098  return true;
1099  else if (*s < '3')
1100  return false;
1101 
1102  if (*++s > '6')
1103  return true;
1104  else if (*s < '6')
1105  return false;
1106 
1107  if (*++s > '4')
1108  return true;
1109  else if (*s < '4')
1110  return false;
1111 
1112  if (*++s > '7')
1113  return true;
1114  return false;
1115 }
1116 
1117 /**********************************************************/
// Parses the text of a PRIMARY block and stores the result as the Seq-hist
// assembly of bioseq.
//
//   bioseq   - record whose Seq-inst history assembly is replaced
//   offset   - start of the block text (modified in place, see below)
//   acnum    - accession of the record itself; together with vernum it
//              forms the first Seq-id of every Dense-seg
//   vernum   - version for acnum; only set on the id when > 0
//   len      - number of characters of offset that belong to the block
//              (used when col_data != 0)
//   col_data - column at which the data start on each line; 0 is treated
//              as the XML format
//   tpa      - true for a TPA record, false for TSA: selects the error
//              codes and additionally permits e_Tpg primary accessions
//
// Returns false when offset or acnum is null, len < 2, the text holds no
// newline, or a line fails validation (a SEV_REJECT error is posted in the
// latter case). Returns true otherwise.
1118 bool fta_parse_tpa_tsa_block(CBioseq& bioseq, char* offset, char* acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
1119 {
1120  FTATpaBlockPtr ftbp;
1121  FTATpaBlockPtr tftbp;
1122  FTATpaBlockPtr ft;
1123 
1124  string buf;
1125  char* p;
1126  char* q;
1127  char* r;
1128  char* t;
1129  const char* bad_accession;
1130  bool bad_line;
1131  bool bad_interval;
1132  Char ch;
1133  Int4 from1;
1134  Int4 to1;
1135  Int4 len1;
1136  Int4 len2;
1137 
1138  CSeq_id::E_Choice choice;
1139 
1140  if (! offset || ! acnum || len < 2)
1141  return false;
1142 
1143  choice = GetNucAccOwner(acnum);
1144 
      // XML: '~' stands for a line break. Convert in place, skip everything
      // up to the first newline and make sure the working copy ends with one.
1145  if (col_data == 0) /* HACK: XML format */
1146  {
1147  for (p = offset; *p != '\0'; p++)
1148  if (*p == '~')
1149  *p = '\n';
1150  p = StringChr(offset, '\n');
1151  if (! p)
1152  return false;
1153  buf.assign(p + 1);
1154  buf.append("\n");
1155  } else {
      // Flat file: temporarily terminate the text at len so that the search
      // and the copy stay inside the block; the saved character is put back
      // on every path.
1156  ch = offset[len];
1157  offset[len] = '\0';
1158  p = StringChr(offset, '\n');
1159  if (! p) {
1160  offset[len] = ch;
1161  return false;
1162  }
1163  buf.assign(p + 1);
1164  offset[len] = ch;
1165  }
1166 
      // ftbp is a placeholder head node; parsed spans are linked after it.
1167  ftbp = new FTATpaBlock;
1168 
1169  bad_line = false;
1170  bad_interval = false;
1171  bad_accession = nullptr;
1172  p = buf.data();
      // One pass per newline-terminated line of buf. Fields are cut out by
      // writing '\0' over their delimiters inside buf.
1173  for (q = StringChr(p, '\n'); q; p = q + 1, q = StringChr(p, '\n')) {
1174  *q = '\0';
      // A line shorter than the data column ends the parsing without
      // setting any error flag.
1175  if ((Int2)StringLen(p) < col_data)
1176  break;
1177  for (p += col_data; *p == ' ';)
1178  p++;
      // First span "from1-to1": digits, a dash, digits.
1179  for (r = p; *p >= '0' && *p <= '9';)
1180  p++;
1181  if (*p != '-') {
1182  bad_interval = true;
1183  break;
1184  }
1185 
1186  *p++ = '\0';
1187  from1 = atoi(r);
1188 
1189  for (r = p; *p >= '0' && *p <= '9';)
1190  p++;
1191  if (*p != ' ' && *p != '\n' && *p != '\0') {
1192  bad_interval = true;
1193  break;
1194  }
1195  if (*p != '\0')
1196  *p++ = '\0';
1197  to1 = atoi(r);
1198 
1199  if (from1 >= to1) {
1200  bad_interval = true;
1201  break;
1202  }
1203 
      // Find the insertion point that keeps the list ordered by from1 and
      // then by to1.
1204  for (ft = ftbp; ft->next; ft = ft->next)
1205  if ((ft->next->from1 > from1) ||
1206  (ft->next->from1 == from1 && ft->next->to1 > to1))
1207  break;
1208  tftbp = new FTATpaBlock;
1209  tftbp->next = ft->next;
1210  ft->next = tftbp;
1211 
1212  tftbp->from1 = from1;
1213  tftbp->to1 = to1;
1214 
      // Accession token, optionally followed by ".version" where the
      // version part must consist of digits only.
1215  while (*p == ' ')
1216  p++;
1217  for (r = p; *p != '\0' && *p != ' ' && *p != '\n';)
1218  p++;
1219  if (*p != '\0')
1220  *p++ = '\0';
1221  tftbp->accession = StringSave(r);
1222  r = StringChr(tftbp->accession, '.');
1223  if (r) {
1224  *r++ = '\0';
1225  for (t = r; *t >= '0' && *t <= '9';)
1226  t++;
1227  if (*t != '\0') {
      // Put the dot back so that the error message shows the full token.
1228  *--r = '.';
1229  bad_accession = tftbp->accession;
1230  break;
1231  }
1232  tftbp->version = atoi(r);
1233  }
1234 
      // Identifiers matched by StringEquNI against "ti" must continue with
      // digits only and must not be all zeroes; anything else has to belong
      // to one of the accepted owners (e_Tpg only when tpa is true).
1235  if (StringEquNI(tftbp->accession, "ti", 2)) {
1236  for (r = tftbp->accession + 2; *r == '0';)
1237  r++;
1238  if (*r == '\0') {
1239  bad_accession = tftbp->accession;
1240  break;
1241  }
1242  while (*r >= '0' && *r <= '9')
1243  r++;
1244  if (*r != '\0') {
1245  bad_accession = tftbp->accession;
1246  break;
1247  }
1248  } else {
1249  tftbp->sicho = GetNucAccOwner(tftbp->accession);
1250  if ((tftbp->sicho != CSeq_id::e_Genbank && tftbp->sicho != CSeq_id::e_Embl &&
1251  tftbp->sicho != CSeq_id::e_Ddbj &&
1252  (tftbp->sicho != CSeq_id::e_Tpg || tpa == false))) {
1253  bad_accession = tftbp->accession;
1254  break;
1255  }
1256  }
1257 
1258  while (*p == ' ')
1259  p++;
1260 
      // Second span: either the literal "not_available" (stored as 1-1) or
      // "from2-to2" with the same rules as the first span.
1261  if (StringEquNI(p, "not_available", 13)) {
1262  p += 13;
1263  tftbp->from2 = 1;
1264  tftbp->to2 = 1;
1265  } else {
1266  for (r = p; *p >= '0' && *p <= '9';)
1267  p++;
1268  if (*p != '-') {
1269  bad_interval = true;
1270  break;
1271  }
1272  *p++ = '\0';
1273  tftbp->from2 = atoi(r);
1274 
1275  for (r = p; *p >= '0' && *p <= '9';)
1276  p++;
1277  if (*p != ' ' && *p != '\n' && *p != '\0') {
1278  bad_interval = true;
1279  break;
1280  }
1281  if (*p != '\0')
1282  *p++ = '\0';
1283  tftbp->to2 = atoi(r);
1284 
1285  if (tftbp->from2 >= tftbp->to2) {
1286  bad_interval = true;
1287  break;
1288  }
1289  }
1290 
      // An optional trailing 'c' selects the minus strand; any other
      // leftover text makes the line invalid.
1291  while (*p == ' ')
1292  p++;
1293  if (*p == 'c') {
1294  tftbp->strand = eNa_strand_minus;
1295  for (p++; *p == ' ';)
1296  p++;
1297  } else
1298  tftbp->strand = eNa_strand_plus;
1299  if (*p != '\0') {
1300  bad_line = true;
1301  break;
1302  }
1303  }
1304 
1305  buf.clear();
      // Report one problem only: interval first, then accession, then the
      // generic block error. The list is released before returning.
1306  if (bad_line || bad_interval || bad_accession) {
1307  if (bad_interval) {
1308  if (tpa)
1309  ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimarySpan, "Intervals from primary records on which a TPA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1310  else
1311  ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimarySpan, "Intervals from primary records on which a TSA record is based must be of form X-Y, where X is less than Y and both X and Y are integers. Entry dropped.");
1312  } else if (bad_accession) {
1313  if (tpa)
1314  ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimarySeqId, "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.", bad_accession);
1315  else
1316  ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimarySeqId, "\"%s\" is not a GenBank/EMBL/DDBJ/Trace sequence identifier. Entry dropped.", bad_accession);
1317  } else {
1318  if (tpa)
1319  ErrPostEx(SEV_REJECT, ERR_TPA_InvalidPrimaryBlock, "Supplied PRIMARY block for TPA record is incorrect. Cannot parse. Entry dropped.");
1320  else
1321  ErrPostEx(SEV_REJECT, ERR_TSA_InvalidPrimaryBlock, "Supplied PRIMARY block for TSA record is incorrect. Cannot parse. Entry dropped.");
1322  }
1323 
1324  if (ftbp)
1325  fta_tpa_block_free(ftbp);
1326  return false;
1327  }
1328 
      // Drop the placeholder head; from here on ftbp and tftbp both point at
      // the first real span (or are null when no line was parsed).
1329  tftbp = ftbp->next;
1330  ftbp->next = nullptr;
1331  delete ftbp;
1332  ftbp = tftbp;
1333 
1334  fta_check_tpa_tsa_coverage(ftbp, bioseq.GetLength(), tpa);
1335 
      // Start from an empty assembly and collect all spans in a single
      // alignment whose segments are of the "disc" kind.
1336  CSeq_hist::TAssembly& assembly = bioseq.SetInst().SetHist().SetAssembly();
1337  if (! assembly.empty())
1338  assembly.clear();
1339 
1340  CRef<CSeq_align> root_align(new CSeq_align);
1341 
1342  root_align->SetType(CSeq_align::eType_not_set);
1343  CSeq_align_set& align_set = root_align->SetSegs().SetDisc();
1344 
1345  for (; tftbp; tftbp = tftbp->next) {
      // Lengths of the span on this record (len1) and on the primary
      // record (len2).
1346  len1 = tftbp->to1 - tftbp->from1 + 1;
1347  len2 = tftbp->to2 - tftbp->from2 + 1;
1348 
1349  CRef<CSeq_align> align(new CSeq_align);
      // NOTE(review): the embedded listing numbers jump from 1349 to 1351
      // here, so one source line may be absent from this text.
1351  align->SetDim(2);
1352 
1353  CSeq_align::C_Segs::TDenseg& seg = align->SetSegs().SetDenseg();
1354 
1355  seg.SetDim(2);
1356  seg.SetNumseg((len1 == len2) ? 1 : 2);
1357 
      // Starts are stored zero-based.
1358  seg.SetStarts().push_back(tftbp->from1 - 1);
1359  seg.SetStarts().push_back(tftbp->from2 - 1);
1360 
      // Unequal lengths: a second segment covers the surplus of the longer
      // row, with -1 as the start on the shorter row.
1361  if (len1 != len2) {
1362  if (len1 < len2) {
1363  seg.SetStarts().push_back(-1);
1364  seg.SetStarts().push_back(tftbp->from2 - 1 + len1);
1365  } else {
1366  seg.SetStarts().push_back(tftbp->from1 - 1 + len2);
1367  seg.SetStarts().push_back(-1);
1368  }
1369  }
1370 
1371  if (len1 == len2)
1372  seg.SetLens().push_back(len1);
1373  else if (len1 < len2) {
1374  seg.SetLens().push_back(len1);
1375  seg.SetLens().push_back(len2 - len1);
1376  } else {
1377  seg.SetLens().push_back(len2);
1378  seg.SetLens().push_back(len1 - len2);
1379  }
1380 
      // One strand pair per segment: plus for this record, the parsed
      // strand for the primary record.
1381  seg.SetStrands().push_back(eNa_strand_plus);
1382  seg.SetStrands().push_back(tftbp->strand);
1383 
1384  if (len1 != len2) {
1385  seg.SetStrands().push_back(eNa_strand_plus);
1386  seg.SetStrands().push_back(tftbp->strand);
1387  }
1388 
      // First id: this record, typed by the owner of acnum.
1389  CRef<CTextseq_id> text_id(new CTextseq_id);
1390  text_id->SetAccession(acnum);
1391 
1392  if (vernum > 0)
1393  text_id->SetVersion(vernum);
1394 
1395  CRef<CSeq_id> id(new CSeq_id),
1396  aux_id;
1397  SetTextId(choice, *id, *text_id);
1398  seg.SetIds().push_back(id);
1399 
      // Second id: a general tag with db "ti" (integer unless
      // fta_number_is_huge reports the number as huge, then a string), or a
      // text id typed by tftbp->sicho.
1400  if (StringEquNI(tftbp->accession, "ti", 2)) {
1401  CRef<CSeq_id> gen_id(new CSeq_id);
1402  CDbtag& tag = gen_id->SetGeneral();
1403 
1404  for (r = tftbp->accession + 2; *r == '0';)
1405  r++;
1406  if (fta_number_is_huge(r) == false)
1407  tag.SetTag().SetId(atoi(r));
1408  else
1409  tag.SetTag().SetStr(r);
1410 
1411  tag.SetDb("ti");
1412  seg.SetIds().push_back(gen_id);
1413  } else {
1414  CRef<CTextseq_id> otext_id(new CTextseq_id);
1415  otext_id->SetAccession(tftbp->accession);
1416 
1417  if (tftbp->version > 0)
1418  otext_id->SetVersion(tftbp->version);
1419 
1420  aux_id.Reset(new CSeq_id);
1421  SetTextId(tftbp->sicho, *aux_id, *otext_id);
1422  }
1423 
1424  if (aux_id.NotEmpty())
1425  seg.SetIds().push_back(aux_id);
1426 
1427  align_set.Set().push_back(align);
1428  }
1429 
1430  assembly.push_back(root_align);
1431 
1432  if (ftbp)
1433  fta_tpa_block_free(ftbp);
1434  return true;
1435 }
1436 
1437 /**********************************************************/
1438 char* StringRStr(char* where, const char* what)
1439 {
1440  if (! where || ! what || *where == '\0' || *what == '\0')
1441  return nullptr;
1442 
1443  size_t i = StringLen(what);
1444  char* res = nullptr;
1445  for (char* p = where; *p != '\0'; p++)
1446  if (StringEquN(p, what, i))
1447  res = p;
1448 
1449  return (res);
1450 }
1451 
1452 /**********************************************************/
// NOTE(review): the signature line of this function (listing line 1453) is
// absent from this text. The body reads "seq_id" and "len", presumably its
// parameters, and returns a CRef<CSeq_loc> - confirm against the header.
//
// Builds a Seq-loc of the interval kind that runs from position 0 to
// len - 1 on seq_id. An empty CRef is returned when len < 1.
1454 {
1455  CRef<CSeq_loc> ret;
1456 
1457  if (len < 1)
1458  return ret;
1459 
1460  ret.Reset(new CSeq_loc);
1461  CSeq_interval& interval = ret->SetInt();
1462 
      // Positions are zero-based, so the last one is len - 1.
1463  interval.SetFrom(0);
1464  interval.SetTo(static_cast<TSeqPos>(len) - 1);
1465  interval.SetId(seq_id);
1466 
1467  return ret;
1468 }
1469 
1470 /**********************************************************/
1471 static void fta_validate_assembly(char* name)
1472 {
1473  bool bad_format = false;
1474 
1475  char* p = name;
1476  if (! p || *p == '\0' || StringLen(p) < 7)
1477  bad_format = true;
1478  else if (p[0] != 'G' || p[1] != 'C' || (p[2] != 'F' && p[2] != 'A') ||
1479  p[3] != '_' || p[4] < '0' || p[4] > '9')
1480  bad_format = true;
1481  else {
1482  for (p += 5; *p != '\0'; p++)
1483  if (*p < '0' || *p > '9')
1484  break;
1485  if (*p != '.' || p[1] < '0' || p[1] > '9')
1486  bad_format = true;
1487  else {
1488  for (p++; *p != '\0'; p++)
1489  if (*p < '0' || *p > '9')
1490  break;
1491  if (*p != '\0')
1492  bad_format = true;
1493  }
1494  }
1495 
1496  if (bad_format)
1497  ErrPostEx(SEV_WARNING, ERR_DBLINK_InvalidIdentifier, "\"%s\" is not a validly formatted identifier for the Assembly resource.", name);
1498 }
1499 
1500 /**********************************************************/
// NOTE(review): the signature line (listing line 1501) is absent from this
// text. Calls of the form fta_validate_bioproject(q, source) elsewhere in
// this file match the names "name" and "source" used in the body.
//
// Checks the shape of a BioProject accession: "PRJ", one of E/N/D, one
// capital letter, then digits only (at least one) up to the end.
// A malformed accession is reported with SEV_REJECT and false is returned.
// A well-formed one yields true; in addition a SEV_WARNING is posted when
// its prefix does not agree with source.
1502 {
1503  char* p;
1504  bool bad_format = false;
1505 
1506  if (StringLen(name) < 6)
1507  bad_format = true;
1508  else if (name[0] != 'P' || name[1] != 'R' || name[2] != 'J' ||
1509  (name[3] != 'E' && name[3] != 'N' && name[3] != 'D') ||
1510  name[4] < 'A' || name[4] > 'Z' || name[5] < '0' || name[5] > '9')
1511  bad_format = true;
1512  else {
      // Everything after the first digit must be digits as well.
1513  for (p = name + 6; *p != '\0'; p++)
1514  if (*p < '0' || *p > '9')
1515  break;
1516  if (*p != '\0')
1517  bad_format = true;
1518  }
1519 
1520  if (bad_format) {
1521  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "BioProject accession number is not validly formatted: \"%s\". Entry dropped.", name);
1522  return false;
1523  }
1524 
      // Expected fourth letter: 'N' for NCBI; 'D' for DDBJ and 'E' for EMBL,
      // where the two letters "NA" are accepted as well.
1525  if ((source == Parser::ESource::NCBI && name[3] != 'N') ||
1526  (source == Parser::ESource::DDBJ && name[3] != 'D' &&
1527  (name[3] != 'N' || name[4] != 'A')) ||
1528  (source == Parser::ESource::EMBL && name[3] != 'E' &&
1529  (name[3] != 'N' || name[4] != 'A')))
1530  ErrPostEx(SEV_WARNING, ERR_FORMAT_WrongBioProjectPrefix, "BioProject accession number does not agree with this record's database of origin: \"%s\".", name);
1531 
1532  return true;
1533 }
1534 
1535 /**********************************************************/
// NOTE(review): the signature line (listing line 1536) is absent from this
// text. The call fta_tokenize_project(str + len, source, newstyle) seen
// elsewhere in this file matches the names used in the body.
//
// Splits str into blank-separated tokens and validates each of them.
// ';', ',' and tab characters are first rewritten to spaces inside str.
// With newstyle false a token must consist of digits only; with newstyle
// true it is checked by fta_validate_bioproject(). Returns the list of
// tokens, or nullptr when str is null, empty or blank, or when a token is
// invalid (a SEV_REJECT error has been posted in those cases).
1537 {
1538  ValNodePtr vnp;
1539  ValNodePtr tvnp;
1540  char* p;
1541  char* q;
1542  char* r;
1543  bool bad;
1544  Char ch;
1545 
1546  if (! str || *str == '\0') {
1547  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "Empty PROJECT/PR line type supplied. Entry dropped.");
1548  return nullptr;
1549  }
1550 
1551  for (p = str; *p != '\0'; p++)
1552  if (*p == ';' || *p == ',' || *p == '\t')
1553  *p = ' ';
1554 
1555  for (p = str; *p == ' ';)
1556  p++;
1557  if (*p == '\0') {
1558  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "Empty PROJECT/PR line type supplied. Entry dropped.");
1559  return nullptr;
1560  }
1561 
      // vnp is a placeholder head; tvnp tracks the node last added.
1562  vnp = ValNodeNew(nullptr);
1563  tvnp = vnp;
1564 
1565  for (bad = false, p = str; *p != '\0';) {
1566  while (*p == ' ')
1567  p++;
1568 
1569  if (*p == '\0')
1570  break;
1571 
1572  for (q = p; *p != ' ' && *p != '\0';)
1573  p++;
1574 
      // Terminate the token at p for the checks; ch is written back on
      // both the failure and the success path.
1575  ch = *p;
1576  *p = '\0';
1577  if (! newstyle) {
1578  for (r = q; *r >= '0' && *r <= '9';)
1579  r++;
1580  if (*r != '\0') {
1581  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "BioProject accession number is not validly formatted: \"%s\". Entry dropped.", q);
1582  bad = true;
1583  }
1584  } else if (fta_validate_bioproject(q, source) == false)
1585  bad = true;
1586 
1587  if (bad) {
1588  *p = ch;
1589  break;
1590  }
1591 
      // Presumably ValNodeNew(tvnp, q) appends a node holding its own copy
      // of q, since the terminator is removed right afterwards - confirm.
1592  tvnp = ValNodeNew(tvnp, q);
1593  *p = ch;
1594  }
1595 
      // Detach and release the placeholder head.
1596  tvnp = vnp->next;
1597  delete vnp;
1598 
1599  if (! tvnp)
1600  return nullptr;
1601 
1602  if (! bad)
1603  return (tvnp);
1604 
1605  ValNodeFreeData(tvnp);
1606  return nullptr;
1607 }
1608 
1609 /**********************************************************/
// NOTE(review): several lines of this function are absent from this text
// (listing lines 1610, 1626, 1631 and 1643, judging by the gaps in the
// embedded numbering), among them the signature and the conditions that
// open the first two if-blocks. The comments below describe only what the
// remaining lines show.
//
// Reads project identifiers from the text at offset and records them in
// descrs as a user object: a "BioProject" field inside a "DBLink" object
// for new-style accessions, or a "GenomeProjectsDB" object holding
// ProjectID/ParentID pairs for numeric ids. *drop is set to true and the
// function returns early when the expected tag is missing or when
// fta_tokenize_project() yields nothing.
1611 {
1612  ValNodePtr vnp;
1613  ValNodePtr tvnp;
1614 
1615  const Char* name;
1616 
1617  char* str;
1618  char* p;
1619  Char ch;
1620  Int4 i;
1621 
1622  if (! offset)
1623  return;
1624 
      // Two variants follow: tag "GenomeProject:" with the value ending at a
      // newline, or tag "Project:" with the value ending at ';'.
1625  bool newstyle = false;
1627  i = ParFlat_COL_DATA;
1628  name = "GenomeProject:";
1629  ch = '\n';
1630  } else {
1632  name = "Project:";
1633  ch = ';';
1634  }
1635 
      // Work on a private copy that starts at column i and is cut at the
      // first occurrence of ch.
1636  size_t len = StringLen(name);
1637  str = StringSave(offset + i);
1638  p = StringChr(str, ch);
1639  if (p)
1640  *p = '\0';
1641 
1642  if (! StringEquN(str, name, len)) {
      // NOTE(review): str is passed to ErrPostEx below although the message
      // contains no conversion for it.
1644  ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidBioProjectAcc, "PROJECT line is missing \"GenomeProject:\" tag. Entry dropped.", str);
1645  MemFree(str);
1646  *drop = true;
1647  return;
1648  }
1649  newstyle = true;
1650  len = 0;
1651  } else if (format == Parser::EFormat::EMBL && str[len] == 'P')
1652  newstyle = true;
1653 
1654  vnp = fta_tokenize_project(str + len, source, newstyle);
1655  if (! vnp) {
1656  *drop = true;
1657  MemFree(str);
1658  return;
1659  }
1660 
1661  CUser_object* user_obj_ptr;
1662  bool got = false;
1663 
      // Look for a user descriptor that has data and whose type is the
      // string "DBLink"; on a hit user_obj_ptr keeps pointing at it.
1664  for (auto& descr : descrs) {
1665  if (! descr->IsUser() || ! descr->GetUser().IsSetData())
1666  continue;
1667 
1668  user_obj_ptr = &(descr->SetUser());
1669 
1670  CObject_id* obj_id = nullptr;
1671  if (user_obj_ptr->IsSetType())
1672  obj_id = &(user_obj_ptr->SetType());
1673 
1674  if (obj_id && obj_id->IsStr() && obj_id->GetStr() == "DBLink") {
1675  got = true;
1676  break;
1677  }
1678  }
1679 
1680  CRef<CUser_object> user_obj;
1681  if (newstyle) {
      // Count the tokens; the count becomes the "num" of the field.
1682  for (i = 0, tvnp = vnp; tvnp; tvnp = tvnp->next)
1683  i++;
1684 
1685  if (! got) {
1686  user_obj.Reset(new CUser_object);
1687  user_obj_ptr = user_obj.GetNCPointer();
1688 
1689  CObject_id& id = user_obj_ptr->SetType();
1690  id.SetStr("DBLink");
1691  }
1692 
1693  CRef<CUser_field> user_field(new CUser_field);
1694  user_field->SetLabel().SetStr("BioProject");
1695  user_field->SetNum(i);
1696 
1697  for (tvnp = vnp; tvnp; tvnp = tvnp->next)
1698  user_field->SetData().SetStrs().push_back(tvnp->data);
1699 
1700  user_obj_ptr->SetData().push_back(user_field);
1701  } else {
      // Numeric ids never reuse a DBLink object: clearing got makes the new
      // object below get its own descriptor.
1702  got = false;
1703 
1704  user_obj.Reset(new CUser_object);
1705  user_obj_ptr = user_obj.GetNCPointer();
1706 
1707  CObject_id& id = user_obj_ptr->SetType();
1708  id.SetStr("GenomeProjectsDB");
1709 
1710  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
1711 
1712  CRef<CUser_field> user_field(new CUser_field);
1713  user_field->SetLabel().SetStr("ProjectID");
1714  user_field->SetData().SetInt(atoi(tvnp->data));
1715  user_obj_ptr->SetData().push_back(user_field);
1716 
1717 
      // Every ProjectID is followed by a ParentID of 0.
1718  user_field.Reset(new CUser_field);
1719  user_field->SetLabel().SetStr("ParentID");
1720  user_field->SetData().SetInt(0);
1721  user_obj_ptr->SetData().push_back(user_field);
1722  }
1723  }
1724 
1725  if (! got) {
1726  CRef<CSeqdesc> descr(new CSeqdesc);
1727  descr->SetUser(*user_obj_ptr);
1728  descrs.push_back(descr);
1729  }
1730 
1731  MemFree(str);
      // NOTE(review): other functions in this file release such lists with
      // ValNodeFreeData(); confirm that ValNodeFree() also releases the
      // token strings.
1732  ValNodeFree(vnp);
1733 }
1734 
1735 /**********************************************************/
1736 bool fta_if_valid_sra(const Char* id, bool dblink)
1737 {
1738  const Char* p = id;
1739 
1740  if (p && StringLen(p) > 3 &&
1741  (p[0] == 'E' || p[0] == 'S' || p[0] == 'D') && p[1] == 'R' &&
1742  (p[2] == 'A' || p[2] == 'P' || p[2] == 'R' || p[2] == 'S' ||
1743  p[2] == 'X' || p[2] == 'Z')) {
1744  for (p += 3; *p >= '0' && *p <= '9';)
1745  p++;
1746  if (*p == '\0')
1747  return true;
1748  }
1749 
1750  if (dblink)
1751  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Incorrectly formatted DBLINK Sequence Read Archive value: \"%s\". Entry dropped.", id);
1752 
1753  return false;
1754 }
1755 
1756 /**********************************************************/
1757 bool fta_if_valid_biosample(const Char* id, bool dblink)
1758 {
1759  const Char* p = id;
1760 
1761  if (p && StringLen(p) > 5 && p[0] == 'S' && p[1] == 'A' &&
1762  p[2] == 'M' && (p[3] == 'N' || p[3] == 'E' || p[3] == 'D')) {
1763  if (p[4] == 'A' || p[4] == 'G')
1764  p += 5;
1765  else
1766  p += 4;
1767  while (*p >= '0' && *p <= '9')
1768  p++;
1769  if (*p == '\0')
1770  return true;
1771  }
1772 
1773  if (dblink)
1774  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Incorrectly formatted DBLINK BioSample value: \"%s\". Entry dropped.", id);
1775 
1776  return false;
1777 }
1778 
1779 /**********************************************************/
// NOTE(review): the signature line (listing line 1780) and listing line
// 1925 are absent from this text. The call fta_tokenize_dblink(str1, source)
// elsewhere in this file matches the names "str" and "source" used here.
//
// Tokenizes the text of a DBLINK block into a list in which each tag
// ("Project:", "Assembly:", "BioSample:", "BioProject:",
// "Sequence Read Archive:" or "Trace Assembly Archive:") is followed by its
// values. ';' and tab characters in str are rewritten to spaces first.
// Returns the list, or nullptr when str is null or empty, when nothing was
// collected, or when a problem was found (a SEV_REJECT error has been
// posted for every such problem).
1781 {
1782  ValNodePtr vnp;
1783  ValNodePtr tvnp;
1784  ValNodePtr uvnp;
1785  ValNodePtr tagvnp;
1786 
1787  bool got_nl;
1788  bool bad;
1789  bool sra;
1790  bool assembly;
1791  bool biosample;
1792  bool bioproject;
1793 
1794  char* p;
1795  char* q;
1796  char* r = nullptr;
1797  char* t;
1798  char* u;
1799  Char ch;
1800 
1801  if (! str || *str == '\0') {
1802  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Empty DBLINK line type supplied. Entry dropped.");
1803  return nullptr;
1804  }
1805 
1806  for (p = str; *p != '\0'; p++)
1807  if (*p == ';' || *p == '\t')
1808  *p = ' ';
1809 
      // vnp is a placeholder head; tvnp tracks the node last added and
      // tagvnp the node of the tag currently being filled.
1810  vnp = ValNodeNew(nullptr);
1811  tvnp = vnp;
1812  bad = false;
1813  got_nl = true;
1814  sra = false;
1815  assembly = false;
1816  biosample = false;
1817  bioproject = false;
1818  tagvnp = nullptr;
1819 
      // got_nl is true on the first pass and whenever a newline was skipped
      // below; the loop increment clears it after every pass.
1820  for (p = str; *p != '\0'; got_nl = false) {
1821  while (*p == ' ' || *p == '\n' || *p == ':' || *p == ',') {
1822  if (*p == '\n')
1823  got_nl = true;
1824  p++;
1825  }
1826 
      // At the start of a line the text up to the first ':' is a tag,
      // provided that neither a ',' nor a newline comes before that colon.
1827  if (got_nl) {
1828  t = StringChr(p, ':');
1829  if (t) {
1830  r = StringChr(p, '\n');
1831  u = StringChr(p, ',');
1832 
1833  if ((! u || u > t) && (! r || r > t)) {
      // Terminate right behind the colon so that p reads as the tag with
      // its colon; ch is written back once the tag has been stored.
1834  ch = *++t;
1835  *t = '\0';
1836 
1837  if (! StringEqu(p, "Project:") &&
1838  ! StringEqu(p, "Assembly:") &&
1839  ! StringEqu(p, "BioSample:") &&
1840  ! StringEqu(p, "BioProject:") &&
1841  ! StringEqu(p, "Sequence Read Archive:") &&
1842  ! StringEqu(p, "Trace Assembly Archive:")) {
1843  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Invalid DBLINK tag encountered: \"%s\". Entry dropped.", p);
1844  bad = true;
1845  break;
1846  }
1847 
      // Remember which validator applies to the values of this tag.
1848  bioproject = StringEqu(p, "BioProject:");
1849  sra = StringEqu(p, "Sequence Read Archive:");
1850  biosample = StringEqu(p, "BioSample:");
1851  assembly = StringEqu(p, "Assembly:");
1852 
      // The node added last still being a tag means that tag got no value.
1853  if (tvnp->data && StringChr(tvnp->data, ':')) {
1854  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Found DBLINK tag with no value: \"%s\". Entry dropped.", tvnp->data);
1855  bad = true;
1856  break;
1857  }
1858 
      // The same tag may not appear twice.
1859  for (uvnp = vnp->next; uvnp; uvnp = uvnp->next)
1860  if (StringEqu(uvnp->data, p)) {
1861  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Multiple DBLINK tags found: \"%s\". Entry dropped.", p);
1862  bad = true;
1863  break;
1864  }
1865  if (bad)
1866  break;
1867 
1868  tvnp = ValNodeNew(tvnp, p);
1869  tagvnp = tvnp;
1870  *t = ch;
1871  p = t;
1872  continue;
1873  }
1874  }
1875  }
1876 
      // A value runs up to the next ',', newline, ':' or the end.
1877  q = p;
1878  while (*p != ',' && *p != '\n' && *p != ':' && *p != '\0')
1879  p++;
      // A colon inside a value is an error; the message quotes text that is
      // located through r.
      // NOTE(review): r is only assigned in the tag detection above and is
      // nullptr when no newline followed at that time, in which case the
      // first loop below would read through a null pointer - confirm that
      // callers always pass text ending in a newline.
1880  if (*p == ':') {
1881  while (*p != '\0' && *p != '\n')
1882  p++;
1883  ch = *p;
1884  *p = '\0';
1885  while (*r != '\n' && r > str)
1886  r--;
1887  while (*r == ' ' || *r == '\n')
1888  r++;
1889  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Too many delimiters/fields for DBLINK line: \"%s\". Entry dropped.", r);
1890  *p = ch;
1891  bad = true;
1892  break;
1893  }
1894 
1895  if (q == p)
1896  continue;
1897 
1898  ch = *p;
1899  *p = '\0';
1900 
      // A value already stored for the current tag is skipped with a
      // warning.
1901  if (tagvnp && tagvnp->data) {
1902  for (uvnp = tagvnp->next; uvnp; uvnp = uvnp->next) {
1903  if (! uvnp->data || ! StringEqu(uvnp->data, q))
1904  continue;
1905 
1906  ErrPostEx(SEV_WARNING, ERR_DBLINK_DuplicateIdentifierRemoved, "Duplicate identifier \"%s\" from \"%s\" link removed.", q, tagvnp->data);
1907  break;
1908  }
1909 
1910  if (uvnp) {
1911  *p = ch;
1912  continue;
1913  }
1914  }
1915 
      // A failed check only sets bad; there is no break here, so the scan
      // goes on with the remaining text.
1916  if ((bioproject &&
1917  fta_validate_bioproject(q, source) == false) ||
1918  (biosample && fta_if_valid_biosample(q, true) == false) ||
1919  (sra && fta_if_valid_sra(q, true) == false)) {
1920  *p = ch;
1921  bad = true;
1922  }
1923 
      // NOTE(review): listing line 1925 is absent from this text; as printed
      // here the if below would govern the ValNodeNew statement that
      // follows. Presumably the missing line is the statement that belongs
      // to the if - confirm against the original source.
1924  if (assembly)
1926 
1927  tvnp = ValNodeNew(tvnp, q);
1928  *p = ch;
1929  }
1930 
      // The last node being a tag means that the final tag got no value.
1931  if (! bad && tvnp->data && StringChr(tvnp->data, ':')) {
1932  ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectDBLINK, "Found DBLINK tag with no value: \"%s\". Entry dropped.", tvnp->data);
1933  bad = true;
1934  }
1935 
      // Detach and release the placeholder head.
1936  tvnp = vnp->next;
1937  delete vnp;
1938 
1939  if (! tvnp)
1940  return nullptr;
1941 
1942  if (! bad)
1943  return (tvnp);
1944 
1945  ValNodeFreeData(tvnp);
1946  return nullptr;
1947 }
1948 
1949 /**********************************************************/
// NOTE(review): the signature line (listing line 1950) is absent from this
// text. The body uses offset, len, source, drop, descrs and dbuop, which
// are presumably its parameters.
//
// Turns the text of a DBLINK block into user-object descriptors appended
// to descrs:
//   - the numeric values of the "Project:" tag go into a
//     "GenomeProjectsDB" object as ProjectID/ParentID pairs;
//   - every other tag becomes a string-list field of one "DBLink" object,
//     which is also stored in dbuop.
// *drop is set to true when fta_tokenize_dblink() returns nothing.
1951 {
1952  ValNodePtr vnp;
1953  ValNodePtr tvnp;
1954  ValNodePtr uvnp;
1955 
1956  const char* str;
1957  Int4 i;
1958 
1959  if (! offset)
1960  return;
1961 
      // Tokenize a private copy that starts at the data column and is cut
      // at len.
      // NOTE(review): the write below assumes that the copy holds at least
      // len - ParFlat_COL_DATA characters - confirm with the callers.
1962  char* str1 = StringSave(offset + ParFlat_COL_DATA);
1963  str1[len - ParFlat_COL_DATA] = '\0';
1964  vnp = fta_tokenize_dblink(str1, source);
1965  MemFree(str1);
1966 
1967  if (! vnp) {
1968  *drop = true;
1969  return;
1970  }
1971 
1972  CRef<CUser_object> user_obj;
1973  CRef<CUser_field> user_field;
1974 
      // First pass: collect the values that follow the "Project:" tag.
      // Nodes containing ':' are tags; the pass stops at the first tag met
      // after the object has been created.
1975  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
1976  if (StringChr(tvnp->data, ':')) {
1977  if (user_obj.NotEmpty())
1978  break;
1979 
1980  if (StringEqu(tvnp->data, "Project:")) {
1981  user_obj.Reset(new CUser_object);
1982  CObject_id& id = user_obj->SetType();
1983 
1984  id.SetStr("GenomeProjectsDB");
1985  }
1986  continue;
1987  }
1988 
1989  if (user_obj.Empty())
1990  continue;
1991 
1992  str = tvnp->data;
1993  if (! str || *str == '\0')
1994  continue;
1995 
      // Accept digits only; a value starting with '0' is not scanned at all
      // and therefore counts as invalid.
1996  if (*str != '0')
1997  while (*str >= '0' && *str <= '9')
1998  str++;
1999  if (*str != '\0') {
2000  ErrPostEx(SEV_ERROR, ERR_FORMAT_IncorrectDBLINK, "Skipping invalid \"Project:\" value on the DBLINK line: \"%s\".", tvnp->data);
2001  continue;
2002  }
2003 
2004  user_field.Reset(new CUser_field);
2005 
2006  user_field->SetLabel().SetStr("ProjectID");
2007  user_field->SetData().SetInt(atoi(tvnp->data));
2008  user_obj->SetData().push_back(user_field);
2009 
      // Every ProjectID is followed by a ParentID of 0.
2010  user_field.Reset(new CUser_field);
2011  user_field->SetLabel().SetStr("ParentID");
2012  user_field->SetData().SetInt(0);
2013 
2014  user_obj->SetData().push_back(user_field);
2015  }
2016 
      // An object that received no data is discarded.
2017  if (user_obj.NotEmpty() && ! user_obj->IsSetData()) {
2018  user_obj.Reset();
2019  }
2020 
2021  if (user_obj.NotEmpty()) {
2022  CRef<CSeqdesc> descr(new CSeqdesc);
2023  descr->SetUser(*user_obj);
2024  descrs.push_back(descr);
2025  }
2026 
2027  user_obj.Reset();
2028  user_field.Reset();
2029 
      // Second pass: every tag other than "Project:" becomes one field of a
      // single "DBLink" object; inpr marks the values of "Project:" so that
      // they are skipped here.
2030  bool inpr = false;
2031  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2032  if (StringChr(tvnp->data, ':')) {
2033  if (StringEqu(tvnp->data, "Project:")) {
2034  inpr = true;
2035  continue;
2036  }
2037 
2038  inpr = false;
2039 
2040  if (user_obj.Empty()) {
2041  user_obj.Reset(new CUser_object);
2042  user_obj->SetType().SetStr("DBLink");
2043  }
2044 
      // Count the values that follow this tag, up to the next tag.
2045  for (i = 0, uvnp = tvnp->next; uvnp; uvnp = uvnp->next, i++)
2046  if (StringChr(uvnp->data, ':'))
2047  break;
2048 
2049  user_field.Reset(new CUser_field);
2050 
      // The label is the tag without its last character (the colon).
2051  string lstr(tvnp->data);
2052  lstr = lstr.substr(0, lstr.size() - 1);
2053  user_field->SetLabel().SetStr(lstr);
2054  user_field->SetNum(i);
2055  user_field->SetData().SetStrs();
2056 
2057  user_obj->SetData().push_back(user_field);
2058  } else if (! inpr && user_obj.NotEmpty()) {
2059  user_field->SetData().SetStrs().push_back(tvnp->data);
2060  }
2061  }
2062 
2063  ValNodeFreeData(vnp);
2064 
2065  if (user_obj.NotEmpty()) {
2066  CRef<CSeqdesc> descr(new CSeqdesc);
2067  descr->SetUser(*user_obj);
2068  descrs.push_back(descr);
2069 
2070  dbuop = user_obj;
2071  }
2072 }
2073 
2074 /**********************************************************/
// NOTE(review): the signature line (listing line 2075) and listing line
// 2093 are absent from this text. The missing line 2093 presumably declares
// "loc", which is used right after it - confirm against the original.
//
// Decides whether a delta sequence qualifies for CMolInfo::eTech_wgs.
// That value is returned when bioseq has a delta representation with a
// delta extension, at least one id was accepted, and no location was
// rejected. In every other case CMolInfo::eTech_unknown is returned.
2076 {
2077  if (bioseq.GetInst().GetRepr() != CSeq_inst::eRepr_delta || ! bioseq.GetInst().IsSetExt() || ! bioseq.GetInst().GetExt().IsDelta())
2078  return CMolInfo::eTech_unknown;
2079 
      // good: an id passed all checks; finished: no inner scan was cut short.
2080  bool good = false;
2081  bool finished = true;
2082 
2083  for (const auto& delta : bioseq.GetInst().GetExt().GetDelta().Get()) {
      // Only components given as locations are inspected.
2084  if (! delta->IsLoc())
2085  continue;
2086 
2087  const CSeq_loc& locs = delta->GetLoc();
2088  CSeq_loc_CI ci(locs);
2089 
2090  for (; ci; ++ci) {
2091  const CSeq_id* id = nullptr;
2092 
      // Locations of other kinds are passed over.
2094  if (loc->IsEmpty() || loc->IsWhole() || loc->IsInt() || loc->IsPnt() || loc->IsPacked_pnt())
2095  id = &ci.GetSeq_id();
2096  else
2097  continue;
2098 
2099  if (! id)
2100  break;
2101 
      // The id has to be of one of the listed text-id kinds.
2102  if (! id->IsGenbank() && ! id->IsEmbl() &&
2103  ! id->IsOther() && ! id->IsDdbj() &&
2104  ! id->IsTpg() && ! id->IsTpe() && ! id->IsTpd())
2105  break;
2106 
      // It also needs a non-empty accession for which fta_if_wgs_acc()
      // returns 1.
2107  const CTextseq_id* text_id = id->GetTextseq_Id();
2108  if (! text_id || ! text_id->IsSetAccession() ||
2109  text_id->GetAccession().empty() ||
2110  fta_if_wgs_acc(text_id->GetAccession().c_str()) != 1)
2111  break;
2112  good = true;
2113  }
2114 
      // The iterator still being valid means the inner loop stopped at a
      // rejected location.
2115  if (ci) {
2116  finished = false;
2117  break;
2118  }
2119  }
2120 
2121  if (good && finished)
2122  return CMolInfo::eTech_wgs;
2123 
2124  return CMolInfo::eTech_unknown;
2125 }
2126 
2127 /**********************************************************/
// Checks one Seq-id taken from a location, possibly rewrites it, and
// records in slip which kinds of ids have been seen.
//
//   loc      - location that holds id; its address may be stored in
//              slip->badslp
//   id       - the Seq-id to check; may be modified
//   ibp      - index entry; ibp->drop is set when the id is rejected.
//              Nothing is done when ibp is null.
//   location - text of the location; quoted in error messages, shortened
//              to 50 characters for the duration of the message
//   name     - feature name for the messages; null selects the CONTIG/CO
//              wording
//   slip     - accumulator for counters, flags and accession pointers
//   iscon    - when true, the accession is also classified through
//              IsNewAccessFormat()
//   source   - source of the record being parsed
//
// NOTE(review): ten lines of this function are absent from this text
// (listing lines 2198, 2199, 2230, 2235, 2245, 2250, 2255, 2265, 2270 and
// 2275, judging by the gaps in the embedded numbering). They appear to be
// the declaration of "type" with the opening of its if-block and the first
// halves of several conditions - confirm against the original source.
2128 static void fta_fix_seq_id(CSeq_loc& loc, CSeq_id& id, IndexblkPtr ibp, char* location, const char* name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
2129 {
2130  Int4 i;
2131  Char ch;
2132 
2133  if (! ibp)
2134  return;
2135 
      // Local ids are left alone.
2136  if (id.IsLocal()) {
2137  return;
2138  }
2139 
      // So are general ids with db "SeqLit" or "UnkSeqLit" when no feature
      // name was given.
2140  if (! name && id.IsGeneral()) {
2141  const CDbtag& tag = id.GetGeneral();
2142  if (tag.GetDb() == "SeqLit" || tag.GetDb() == "UnkSeqLit")
2143  return;
2144  }
2145 
      // Any id kind outside the list below is rejected.
2146  if (! id.IsGenbank() && ! id.IsEmbl() && ! id.IsPir() &&
2147  ! id.IsSwissprot() && ! id.IsOther() && ! id.IsDdbj() && ! id.IsPrf() &&
2148  ! id.IsTpg() && ! id.IsTpe() && ! id.IsTpd()) {
      // Shorten location to 50 characters for the message; the overwritten
      // character is put back afterwards.
2149  if (StringLen(location) > 50) {
2150  ch = location[50];
2151  location[50] = '\0';
2152  } else
2153  ch = '\0';
2154 
2155  if (! name)
2156  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty or unsupported Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.", location);
2157  else
2158  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty or unsupported Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.", name, location);
2159  if (ch != '\0')
2160  location[50] = ch;
2161  ibp->drop = true;
2162  return;
2163  }
2164 
      // An id without an accession is rejected as well.
2165  const CTextseq_id* text_id = id.GetTextseq_Id();
2166  if (! text_id || ! text_id->IsSetAccession()) {
2167  if (StringLen(location) > 50) {
2168  ch = location[50];
2169  location[50] = '\0';
2170  } else
2171  ch = '\0';
2172  if (! name)
2173  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty Seq-id found in CONTIG/CO line at location: \"%s\". Entry skipped.", location);
2174  else
2175  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Empty Seq-id found in feature \"%s\" at location \"%s\". Entry skipped.", name, location);
2176  if (ch != '\0')
2177  location[50] = ch;
2178  ibp->drop = true;
2179  return;
2180  }
2181 
      // For CONTIG data remember the first accession of format 3 (wgscont)
      // and of format 7 (wgsscaf); a later accession of the same format
      // whose first four characters differ goes into wgsacc, once.
2182  const Char* accession = text_id->GetAccession().c_str();
2183  if (iscon) {
2184  i = IsNewAccessFormat(accession);
2185  if (i == 3) {
2186  if (! slip->wgscont)
2187  slip->wgscont = accession;
2188  else if (! slip->wgsacc && ! StringEquN(slip->wgscont, accession, 4))
2189  slip->wgsacc = accession;
2190  } else if (i == 7) {
2191  if (! slip->wgsscaf)
2192  slip->wgsscaf = accession;
2193  else if (! slip->wgsacc && ! StringEquN(slip->wgsscaf, accession, 4))
2194  slip->wgsacc = accession;
2195  }
2196  }
2197 
      // When "type" differs from the current kind of id, the id is rebuilt
      // with that type from a copy of its text id.
2200  if (type != id.Which()) {
2201  CRef<CTextseq_id> new_text_id(new CTextseq_id);
2202  new_text_id->Assign(*text_id);
2203  SetTextId(type, id, *new_text_id);
2204  }
2205  } else if (source == Parser::ESource::Flybase) {
2206  id.SetGeneral().SetDb("FlyBase");
2207  id.SetGeneral().SetTag().SetStr(accession);
2208  } else if (source == Parser::ESource::USPTO) {
2209  CRef<CPatent_seq_id> pat_id = MakeUsptoPatSeqId(accession);
2210  id.SetPatent(*pat_id);
2211  } else {
2212  if (StringLen(location) > 50) {
2213  ch = location[50];
2214  location[50] = '\0';
2215  } else
2216  ch = '\0';
2217  if (! name)
2218  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Invalid accession found in CONTIG/CO line at location: \"%s\". Entry skipped.", location);
2219  else
2220  ErrPostEx(SEV_REJECT, ERR_LOCATION_SeqIdProblem, "Invalid accession found in feature \"%s\" at location \"%s\". Entry skipped.", name, location);
2221  if (ch != '\0')
2222  location[50] = ch;
2223  ibp->drop = true;
2224  return;
2225  }
2226 
2227  slip->total++;
2228 
      // Flag the kind of id in slip. Under a condition on source (partly
      // missing from this text) the first such location is also remembered
      // in slip->badslp.
2229  if (id.IsGenbank()) {
2231  source != Parser::ESource::LANL && ! slip->badslp)
2232  slip->badslp = &loc;
2233  slip->genbank = 1;
2234  } else if (id.IsEmbl()) {
2236  ! slip->badslp)
2237  slip->badslp = &loc;
2238  slip->embl = 1;
2239  } else if (id.IsPir()) {
2240  if (source != Parser::ESource::All &&
2241  ! slip->badslp)
2242  slip->badslp = &loc;
2243  slip->pir = 1;
2244  } else if (id.IsSwissprot()) {
2246  ! slip->badslp)
2247  slip->badslp = &loc;
2248  slip->swissprot = 1;
2249  } else if (id.IsOther()) {
2251  ! slip->badslp)
2252  slip->badslp = &loc;
2253  slip->other = 1;
2254  } else if (id.IsDdbj()) {
2256  ! slip->badslp)
2257  slip->badslp = &loc;
2258  slip->ddbj = 1;
2259  } else if (id.IsPrf()) {
2260  if (source != Parser::ESource::All &&
2261  ! slip->badslp)
2262  slip->badslp = &loc;
2263  slip->prf = 1;
2264  } else if (id.IsTpg()) {
2266  source != Parser::ESource::LANL && ! slip->badslp)
2267  slip->badslp = &loc;
2268  slip->tpg = 1;
2269  } else if (id.IsTpe()) {
2271  ! slip->badslp)
2272  slip->badslp = &loc;
2273  slip->tpe = 1;
2274  } else if (id.IsTpd()) {
2276  ! slip->badslp)
2277  slip->badslp = &loc;
2278  slip->tpd = 1;
2279  }
2280 }
2281 
2282 /**********************************************************/
2283 static void fta_do_fix_seq_loc_id(TSeqLocList& locs, IndexblkPtr ibp, char* location, const char* name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
2284 {
2285  for (auto& loc : locs) {
2286  if (loc->IsEmpty()) {
2287  fta_fix_seq_id(*loc, loc->SetEmpty(), ibp, location, name, slip, iscon, source);
2288  } else if (loc->IsWhole()) {
2289  fta_fix_seq_id(*loc, loc->SetWhole(), ibp, location, name, slip, iscon, source);
2290  } else if (loc->IsInt()) {
2291  fta_fix_seq_id(*loc, loc->SetInt().SetId(), ibp, location, name, slip, iscon, source);
2292  } else if (loc->IsPnt()) {
2293  fta_fix_seq_id(*loc, loc->SetPnt().SetId(), ibp, location, name, slip, iscon, source);
2294  if (iscon && ! loc->GetPnt().IsSetFuzz()) {
2295  int point = loc->GetPnt().GetPoint();
2296  CRef<CSeq_interval> interval(new CSeq_interval);
2297  interval->SetFrom(point);
2298  interval->SetTo(point);
2299 
2300  if (loc->GetPnt().IsSetStrand())
2301  interval->SetStrand(loc->GetPnt().GetStrand());
2302 
2303  interval->SetId(loc->SetPnt().SetId());
2304  loc->SetInt(*interval);
2305  }
2306  } else if (loc->IsPacked_int()) {
2307  for (auto& interval : loc->SetPacked_int().Set()) {
2308  fta_fix_seq_id(*loc, interval->SetId(), ibp, location, name, slip, iscon, source);
2309  }
2310  } else if (loc->IsPacked_pnt()) {
2311  fta_fix_seq_id(*loc, loc->SetPacked_pnt().SetId(), ibp, location, name, slip, iscon, source);
2312  } else if (loc->IsMix()) {
2313  fta_do_fix_seq_loc_id(loc->SetMix().Set(), ibp, location, name, slip, iscon, source);
2314  } else if (loc->IsEquiv()) {
2315  fta_do_fix_seq_loc_id(loc->SetEquiv().Set(), ibp, location, name, slip, iscon, source);
2316  }
2317  }
2318 }
2319 
2320 /**********************************************************/
2321 Int4 fta_fix_seq_loc_id(TSeqLocList& locs, ParserPtr pp, char* location, const char* name, bool iscon)
2322 {
2323  SeqLocIds sli;
2324  const Char* p = nullptr;
2325  ErrSev sev;
2326  IndexblkPtr ibp;
2327  Char ch;
2328  Int4 tpa;
2329  Int4 non_tpa;
2330  Int4 i = 0;
2331 
2332  ibp = pp->entrylist[pp->curindx];
2333 
2334  fta_do_fix_seq_loc_id(locs, ibp, location, name, &sli, iscon, pp->source);
2335 
2336  tpa = sli.tpg + sli.tpe + sli.tpd;
2337  non_tpa = sli.genbank + sli.embl + sli.pir + sli.swissprot + sli.other +
2338  sli.ddbj + sli.prf;
2339 
2340  if (iscon && ! sli.wgsacc && sli.wgscont &&
2341  sli.wgsscaf && ! StringEquN(sli.wgscont, sli.wgsscaf, 4))
2342  sli.wgsacc = sli.wgsscaf;
2343 
2344  ch = '\0';
2345  if ((tpa > 0 && non_tpa > 0) || tpa > 1 || non_tpa > 1 ||
2346  (iscon && sli.wgscont && sli.wgsscaf)) {
2347  if (StringLen(location) > 50) {
2348  ch = location[50];
2349  location[50] = '\0';
2350  }
2351  }
2352 
2353  if (tpa > 0 && non_tpa > 0) {
2354  if (! name)
2355  ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa, "The CONTIG/CO line with location \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.", location);
2356  else
2357  ErrPostEx(SEV_REJECT, ERR_LOCATION_TpaAndNonTpa, "The \"%s\" feature at \"%s\" refers to intervals on both primary and third-party sequence records. Entry skipped.", name, location);
2358  ibp->drop = true;
2359  }
2360 
2361  if (tpa > 1 || non_tpa > 1) {
2362  if (! pp->allow_crossdb_featloc) {
2363  sev = SEV_REJECT;
2364  p = "Entry skipped.";
2365  ibp->drop = true;
2366  } else {
2367  sev = SEV_WARNING;
2368  p = "";
2369  }
2370  if (! name) {
2371  string label;
2372  if (sli.badslp)
2373  sli.badslp->GetLabel(&label);
2374 
2375  ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc, "The CONTIG/CO line refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval : \"%s\".%s", label.empty() ? location : label.c_str(), p);
2376  } else
2377  ErrPostEx(sev, ERR_LOCATION_CrossDatabaseFeatLoc, "The \"%s\" feature at \"%s\" refers to intervals on records from two or more INSDC databases. This is not allowed without review and approval.%s", name, location, p);
2378  }
2379 
2380  if (iscon) {
2381  if (sli.wgscont && sli.wgsscaf)
2382  ErrPostEx(SEV_ERROR, ERR_LOCATION_ContigAndScaffold, "The CONTIG/CO line with location \"%s\" refers to intervals on both WGS contig and WGS scaffold records.", location);
2383 
2384  if (sli.wgsacc) {
2385  if (sli.wgscont && ! StringEquN(sli.wgscont, sli.wgsacc, 4))
2386  p = sli.wgscont;
2387  else if (sli.wgsscaf && ! StringEquN(sli.wgsscaf, sli.wgsacc, 4))
2388  p = sli.wgsscaf;
2389 
2390  if (p) {
2391  Char msga[5],
2392  msgb[5];
2393 
2394  StringNCpy(msga, sli.wgsacc, 4);
2395  StringNCpy(msgb, p, 4);
2396  msga[4] = msgb[4] = 0;
2397 
2398  ErrPostEx(SEV_WARNING, ERR_SEQUENCE_MultipleWGSProjects, "This CON/scaffold record is assembled from the contigs of multiple WGS projects. First pair of WGS project codes is \"%s\" and \"%s\".", msgb, msga);
2399  }
2400  }
2401 
2402  i = IsNewAccessFormat(ibp->acnum);
2403  if (i == 3 || i == 7) {
2404  p = nullptr;
2405  if (sli.wgscont && ! StringEquN(sli.wgscont, ibp->acnum, 4))
2406  p = sli.wgscont;
2407  else if (sli.wgsscaf && ! StringEquN(sli.wgsscaf, ibp->acnum, 4))
2408  p = sli.wgsscaf;
2409  else if (sli.wgsacc && ! StringEquN(sli.wgsacc, ibp->acnum, 4))
2410  p = sli.wgsscaf; // ?
2411 
2412  if (p) {
2413  Char msg[5];
2414  StringNCpy(msg, p, 4);
2415  msg[4] = 0;
2416 
2417  ErrPostEx(SEV_WARNING, ERR_ACCESSION_WGSPrefixMismatch, "This WGS CON/scaffold record is assembled from the contigs of different WGS projects. First differing WGS project code is \"%s\".", msg);
2418  }
2419  }
2420  }
2421 
2422  if (ch != '\0')
2423  location[50] = ch;
2424 
2425  if (sli.wgscont)
2426  sli.wgscont = nullptr;
2427  if (sli.wgsscaf)
2428  sli.wgsscaf = nullptr;
2429  if (sli.wgsacc)
2430  sli.wgsacc = nullptr;
2431 
2432  return (sli.total);
2433 }
2434 
2435 /**********************************************************/
2437 {
2438  ValNodePtr res;
2439  ValNodePtr vnp;
2440  char* start;
2441  char* p;
2442  char* q;
2443  char* r;
2444  bool bad;
2445 
2446  if (! buf || *buf == '\0')
2447  return nullptr;
2448 
2449  for (p = buf; *p != '\0'; p++) {
2450  if (*p != '~')
2451  continue;
2452 
2453  for (p++; *p == ' ' || *p == '~'; p++)
2454  *p = ' ';
2455  p--;
2456  }
2457 
2458  bad = false;
2459  res = ValNodeNew(nullptr);
2460  vnp = res;
2461  for (start = buf;;) {
2462  p = StringStr(start, "::");
2463  if (! p) {
2464  if (start == buf)
2465  bad = true;
2466  break;
2467  }
2468 
2469  q = StringStr(p + 2, "::");
2470  if (! q) {
2471  vnp = ValNodeNew(vnp, start);
2472  for (r = vnp->data; *r != '\0'; r++)
2473  if (*r == '~')
2474  *r = ' ';
2475  ShrinkSpaces(vnp->data);
2476  break;
2477  }
2478 
2479  *q = '\0';
2480  r = StringRChr(p + 2, '~');
2481  *q = ':';
2482  if (! r) {
2483  bad = true;
2484  break;
2485  }
2486 
2487  *r = '\0';
2488  vnp = ValNodeNew(vnp, start);
2489  *r = '~';
2490  for (p = vnp->data; *p != '\0'; p++)
2491  if (*p == '~')
2492  *p = ' ';
2493  ShrinkSpaces(vnp->data);
2494 
2495  start = r;
2496  }
2497 
2498  vnp = res->next;
2499  res->next = nullptr;
2500  ValNodeFree(res);
2501 
2502  if (! bad)
2503  return (vnp);
2504 
2505  ValNodeFreeData(vnp);
2506  return nullptr;
2507 }
2508 
2509 /**********************************************************/
2511 {
2512  ValNodePtr vnp;
2513  ValNodePtr tvnp;
2514 
2515  char* p;
2516  char* q;
2517 
2518  CRef<CUser_object> obj;
2519 
2520  if (! tag || *tag == '\0' || ! buf || *buf == '\0')
2521  return obj;
2522 
2524  if (! vnp)
2525  return obj;
2526 
2527  obj.Reset(new CUser_object);
2528 
2529  CObject_id& id = obj->SetType();
2530  id.SetStr("StructuredComment");
2531 
2532  CRef<CUser_field> field(new CUser_field);
2533  field->SetLabel().SetStr("StructuredCommentPrefix");
2534 
2535  field->SetData().SetStr() = tag;
2536  field->SetData().SetStr() += "-START##";
2537 
2538  obj->SetData().push_back(field);
2539 
2540  for (tvnp = vnp; tvnp; tvnp = tvnp->next) {
2541  p = tvnp->data;
2542  if (! p || *p == '\0')
2543  continue;
2544 
2545  q = StringStr(p, "::");
2546  if (! q)
2547  continue;
2548 
2549  if (q > p && *(q - 1) == ' ')
2550  q--;
2551 
2552  for (*q++ = '\0'; *q == ' ' || *q == ':';)
2553  q++;
2554 
2555  if (*p == '\0' || *q == '\0')
2556  continue;
2557 
2558  field.Reset(new CUser_field);
2559  field->SetLabel().SetStr(p);
2560  field->SetData().SetStr(q);
2561 
2562  obj->SetData().push_back(field);
2563  }
2564 
2565  if (obj->GetData().size() < 2) {
2566  obj.Reset();
2567  return obj;
2568  }
2569 
2570  field.Reset(new CUser_field);
2571  field->SetLabel().SetStr("StructuredCommentSuffix");
2572  field->SetData().SetStr() = tag;
2573  field->SetData().SetStr() += "-END##";
2574 
2575  obj->SetData().push_back(field);
2576 
2577  ValNodeFreeData(vnp);
2578 
2579  return obj;
2580 }
2581 
2582 /**********************************************************/
2583 void fta_parse_structured_comment(char* str, bool& bad, TUserObjVector& objs)
2584 {
2585  ValNodePtr tagvnp;
2586  ValNodePtr vnp;
2587 
2588  char* start;
2589  char* tag = nullptr;
2590  char* buf;
2591  char* p;
2592  char* q;
2593  char* r;
2594 
2595  if (! str || *str == '\0')
2596  return;
2597 
2598  tagvnp = nullptr;
2599  for (p = str;;) {
2600  p = StringStr(p, "-START##");
2601  if (! p)
2602  break;
2603  for (q = p;; q--)
2604  if (*q == '~' || (*q == '#' && q > str && *--q == '#') || q == str)
2605  break;
2606  if (q[0] != '#' || q[1] != '#') {
2607  p += 8;
2608  continue;
2609  }
2610 
2611  start = q;
2612 
2613  tag = StringSave(string_view(q, p - q));
2614 
2615  for (q = p;;) {
2616  q = StringStr(q, tag);
2617  if (! q) {
2618  bad = true;
2619  break;
2620  }
2621  size_t i = StringLen(tag);
2622  if (! StringEquN(q + i, "-END##", 6)) {
2623  q += (i + 6);
2624  continue;
2625  }
2626  r = StringStr(p + 8, "-START##");
2627  if (r && r < q) {
2628  bad = true;
2629  break;
2630  }
2631  break;
2632  }
2633 
2634  if (bad)
2635  break;
2636 
2637  if (! tagvnp) {
2638  tagvnp = ValNodeNew(nullptr, tag);
2639  } else {
2640  for (vnp = tagvnp; vnp; vnp = vnp->next) {
2641  r = vnp->data;
2642  if (StringEqu(r + 2, tag + 2)) {
2643  if (*r != ' ') {
2644  ErrPostEx(SEV_ERROR, ERR_COMMENT_SameStructuredCommentTags, "More than one structured comment with the same tag \"%s\" found.", tag + 2);
2645  *r = ' ';
2646  }
2647  break;
2648  }
2649  if (! vnp->next) {
2650  ValNodeNew(vnp, tag);
2651  break;
2652  }
2653  }
2654  }
2655 
2656  if (StringEqu(tag, "##Metadata")) {
2657  MemFree(tag);
2658  p += 8;
2659  continue;
2660  }
2661 
2662  *q = '\0';
2663  if (! StringStr(p + 8, "::")) {
2664  ErrPostEx(SEV_ERROR, ERR_COMMENT_StructuredCommentLacksDelim, "The structured comment in this record lacks the expected double-colon '::' delimiter between fields and values.");
2665  MemFree(tag);
2666  p += 8;
2667  *q = '#';
2668  continue;
2669  }
2670 
2671  buf = StringSave(p + 8);
2672  *q = '#';
2673 
2675  MemFree(buf);
2676 
2677  if (cur.Empty()) {
2678  bad = true;
2679  break;
2680  }
2681 
2682  objs.push_back(cur);
2683 
2684  fta_StringCpy(start, q + StringLen(tag) + 6);
2685  MemFree(tag);
2686  p = start;
2687  }
2688 
2689  if (bad) {
2690  ErrPostEx(SEV_REJECT, ERR_COMMENT_InvalidStructuredComment, "Incorrectly formatted structured comment with tag \"%s\" encountered. Entry dropped.", tag + 2);
2691  MemFree(tag);
2692  }
2693 
2694  if (tagvnp)
2695  ValNodeFreeData(tagvnp);
2696 }
2697 
2698 /**********************************************************/
2699 string GetQSFromFile(FILE* fd, const Indexblk* ibp)
2700 {
2701  string ret;
2702  Char buf[1024];
2703 
2704  if (! fd || ibp->qslength < 1)
2705  return ret;
2706 
2707  ret.reserve(ibp->qslength + 10);
2708  fseek(fd, static_cast<long>(ibp->qsoffset), 0);
2709  while (fgets(buf, 1023, fd)) {
2710  if (buf[0] == '>' && ret[0] != '\0')
2711  break;
2712  ret.append(buf);
2713  }
2714  return ret;
2715 }
2716 
2717 /**********************************************************/
2719 {
2720  TSeqdescList* descrs = nullptr;
2721  if (seq_entry.IsSeq()) {
2722  if (seq_entry.GetSeq().IsSetDescr())
2723  descrs = &seq_entry.SetSeq().SetDescr().Set();
2724  } else if (seq_entry.IsSet()) {
2725  if (seq_entry.GetSet().IsSetDescr())
2726  descrs = &seq_entry.SetSet().SetDescr().Set();
2727  }
2728 
2729  if (! descrs)
2730  return;
2731 
2732  for (TSeqdescList::iterator descr = descrs->begin(); descr != descrs->end();) {
2733  if (! (*descr)->IsUser()) {
2734  ++descr;
2735  continue;
2736  }
2737 
2738  const CUser_object& user_obj = (*descr)->GetUser();
2739  if (! user_obj.IsSetType() || ! user_obj.GetType().IsStr() ||
2740  user_obj.GetType().GetStr() != "NcbiCleanup") {
2741  ++descr;
2742  continue;
2743  }
2744 
2745  descr = descrs->erase(descr);
2746  break;
2747  }
2748 }
2749 
2750 /**********************************************************/
2752  bool is_tsa)
2753 {
2754  bool got_comment = false;
2755  bool got_dblink = false;
2756 
2757  for (const auto& descr : bioseq.GetDescr().Get()) {
2758  if (! descr->IsUser())
2759  continue;
2760 
2761  const CUser_object& user_obj = descr->GetUser();
2762  if (! user_obj.IsSetType() || ! user_obj.GetType().IsStr())
2763  continue;
2764 
2765  const string& user_type_str = user_obj.GetType().GetStr();
2766 
2767  if (user_type_str == "StructuredComment")
2768  got_comment = true;
2769  else if (user_type_str == "GenomeProjectsDB")
2770  got_dblink = true;
2771  else if (user_type_str == "DBLink") {
2772  for (const auto& field : user_obj.GetData()) {
2773  if (! field->IsSetLabel() || ! field->GetLabel().IsStr() ||
2774  field->GetLabel().GetStr() != "BioProject")
2775  continue;
2776  got_dblink = true;
2777  break;
2778  }
2779  }
2780  }
2781 
2782  if (! is_tsa) {
2783  if (! got_comment)
2784  ErrPostEx(SEV_WARNING, ERR_ENTRY_TLSLacksStructuredComment, "This TLS record lacks an expected structured comment.");
2785  if (! got_dblink)
2786  ErrPostEx(SEV_WARNING, ERR_ENTRY_TLSLacksBioProjectLink, "This TLS record lacks an expected BioProject or Project link.");
2787  } else {
2788  if (! got_comment)
2789  ErrPostEx(SEV_WARNING, ERR_ENTRY_TSALacksStructuredComment, "This TSA record lacks an expected structured comment.");
2790  if (! got_dblink)
2791  ErrPostEx(SEV_WARNING, ERR_ENTRY_TSALacksBioProjectLink, "This TSA record lacks an expected BioProject or Project link.");
2792  }
2793 }
2794 
2795 /**********************************************************/
2797 {
2798  if (bioseq.GetInst().GetTopology() != CSeq_inst::eTopology_circular || (ibp && ibp->gaps))
2799  return;
2800 
2801  CMolInfo* mol_info = nullptr;
2802  for (auto& descr : bioseq.SetDescr().Set()) {
2803  if (descr->IsMolinfo()) {
2804  mol_info = &descr->SetMolinfo();
2805  break;
2806  }
2807  }
2808 
2809  if (mol_info) {
2811  } else {
2812  CRef<CSeqdesc> descr(new CSeqdesc);
2813  CMolInfo& mol = descr->SetMolinfo();
2815 
2816  bioseq.SetDescr().Set().push_back(descr);
2817  }
2818 }
2819 
2820 /**********************************************************/
2822 {
2823  if (num < 1000)
2824  return;
2825 
2826  ErrPostEx(SEV_INFO, ERR_SEQUENCE_HasManyComponents, "An OnlyNearFeatures FeatureFetchPolicy User-object has been added to this record because it is constructed from %d components, which exceeds the threshold of 999 for User-object creation.", num);
2827 
2828  CRef<CSeqdesc> descr(new CSeqdesc);
2829  descr->SetUser().SetType().SetStr("FeatureFetchPolicy");
2830 
2831  CRef<CUser_field> field(new CUser_field);
2832 
2833  field->SetLabel().SetStr("Policy");
2834  field->SetData().SetStr("OnlyNearFeatures");
2835 
2836  descr->SetUser().SetData().push_back(field);
2837 
2838  bsp.SetDescr().Set().push_back(descr);
2839 }
2840 
2841 /**********************************************************/
/*
 * Deletes every "{ECO:...}" evidence tag from `str`, in place.
 *
 * Along with the tag itself the following are removed:
 *   - one blank directly in front of the tag;
 *   - the '.' or ';' in front of the (blank and) tag when the character
 *     right after the tag is the same punctuation mark, so the mark is
 *     not doubled.
 * An opening "{ECO:" without a closing '}' ends the processing and is
 * left as it is.
 */
void StripECO(string& str)
{
    size_t scan = 0;

    for (;;) {
        const size_t open = str.find("{ECO:", scan);
        if (open == string::npos)
            return;

        const size_t close = str.find('}', open);
        if (close == string::npos)
            return;

        const size_t stop  = close + 1; /* first character kept after the tag */
        size_t       first = open;      /* first character to be removed */

        if (first > 0 && str[first - 1] == ' ')
            --first;

        if (first > 0 && stop < str.size()) {
            const char before = str[first - 1];
            const char after  = str[stop];
            if (before == after && (before == '.' || before == ';'))
                --first;
        }

        str.erase(first, stop - first);

        /* The text that followed the tag now starts at `first`. */
        scan = first;
    }
}
2860 
2861 /**********************************************************/
2863 {
2864  if (uop.Empty() || ! uop->IsSetData() || ! uop->IsSetType() ||
2865  ! uop->GetType().IsStr() || uop->GetType().GetStr() != "DBLink")
2866  return false;
2867 
2868  bool got = false;
2869 
2870  for (const auto& field : uop->GetData()) {
2871  if (! field->IsSetData() || ! field->GetData().IsStrs() || ! field->IsSetNum() || field->GetNum() < 1 ||
2872  ! field->IsSetLabel() || ! field->GetLabel().IsStr() || field->GetLabel().GetStr() != "Sequence Read Archive")
2873  continue;
2874 
2875  for (const CStringUTF8& str : field->GetData().GetStrs()) {
2876  if (str.size() > 2 &&
2877  (str[0] == 'D' || str[0] == 'E' || str[0] == 'S') && str[1] == 'R' &&
2878  (str[2] == 'R' || str[2] == 'X' || str[2] == 'Z')) {
2879  got = true;
2880  break;
2881  }
2882  }
2883  if (got)
2884  break;
2885  }
2886  return (got);
2887 }
2888 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static bool s_IsConOrScaffold(CBioseq_Handle bsh)
Definition: add.cpp:697
USING_SCOPE(objects)
static void CreateSeqGap(CSeq_literal &seq_lit, GapFeatsPtr gfp)
Definition: add.cpp:318
bool no_reference(const CBioseq &bioseq)
Definition: add.cpp:220
void SeqToDelta(CBioseq &bioseq, Int2 tech)
Definition: add.cpp:505
CMolInfo::TTech fta_check_con_for_wgs(CBioseq &bioseq)
Definition: add.cpp:2075
bool fta_check_htg_kwds(TKeywordList &kwds, IndexblkPtr ibp, CMolInfo &mol_info)
Definition: add.cpp:913
#define SHORT_GAP
Definition: add.cpp:83
void fta_set_molinfo_completeness(CBioseq &bioseq, const Indexblk *ibp)
Definition: add.cpp:2796
static void fta_validate_assembly(char *name)
Definition: add.cpp:1471
void fta_add_hist(ParserPtr pp, CBioseq &bioseq, CGB_block::TExtra_accessions &extra_accs, Parser::ESource source, CSeq_id::E_Choice acctype, bool pricon, const char *acc)
Definition: add.cpp:793
static bool fta_ranges_to_hist(const CGB_block::TExtra_accessions &extra_accs)
Definition: add.cpp:607
static int sGetPrefixLength(const CTempString &accession)
Definition: add.cpp:781
void AssemblyGapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:339
bool fta_parse_tpa_tsa_block(CBioseq &bioseq, char *offset, char *acnum, Int2 vernum, size_t len, Int2 col_data, bool tpa)
Definition: add.cpp:1118
bool g_DoesNotReferencePrimary(const CDelta_ext &delta_ext, const CSeq_id &primary, CScope &scope)
Definition: add.cpp:730
static bool fta_validate_bioproject(char *name, Parser::ESource source)
Definition: add.cpp:1501
bool fta_if_valid_biosample(const Char *id, bool dblink)
Definition: add.cpp:1757
static void fta_fix_seq_id(CSeq_loc &loc, CSeq_id &id, IndexblkPtr ibp, char *location, const char *name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
Definition: add.cpp:2128
string GetQSFromFile(FILE *fd, const Indexblk *ibp)
Definition: add.cpp:2699
void fta_get_project_user_object(TSeqdescList &descrs, char *offset, Parser::EFormat format, bool *drop, Parser::ESource source)
Definition: add.cpp:1610
bool fta_strings_same(const char *s1, const char *s2)
Definition: add.cpp:903
bool check_cds(const DataBlk &entry, Parser::EFormat format)
Definition: add.cpp:258
static bool s_IsAccession(const CSeq_id &id)
Definition: add.cpp:715
void fta_create_far_fetch_policy_user_object(CBioseq &bsp, Int4 num)
Definition: add.cpp:2821
void fta_tsa_tls_comment_dblink_check(const CBioseq &bioseq, bool is_tsa)
Definition: add.cpp:2751
void fta_remove_cleanup_user_object(CSeq_entry &seq_entry)
Definition: add.cpp:2718
bool fta_if_valid_sra(const Char *id, bool dblink)
Definition: add.cpp:1736
static ValNodePtr fta_tokenize_dblink(char *str, Parser::ESource source)
Definition: add.cpp:1780
#define HTG_GAP
Definition: add.cpp:82
bool fta_dblink_has_sra(const CRef< CUser_object > &uop)
Definition: add.cpp:2862
static ValNodePtr fta_vnp_structured_comment(char *buf)
Definition: add.cpp:2436
CRef< CSeq_loc > fta_get_seqloc_int_whole(CSeq_id &seq_id, size_t len)
Definition: add.cpp:1453
char * StringRStr(char *where, const char *what)
Definition: add.cpp:1438
void GapsToDelta(CBioseq &bioseq, GapFeatsPtr gfp, bool *drop)
Definition: add.cpp:387
void fta_get_dblink_user_object(TSeqdescList &descrs, char *offset, size_t len, Parser::ESource source, bool *drop, CRef< CUser_object > &dbuop)
Definition: add.cpp:1950
bool fta_number_is_huge(const Char *s)
Definition: add.cpp:1059
static void fta_do_fix_seq_loc_id(TSeqLocList &locs, IndexblkPtr ibp, char *location, const char *name, SeqLocIdsPtr slip, bool iscon, Parser::ESource source)
Definition: add.cpp:2283
void err_install(const Indexblk *ibp, bool accver)
Definition: add.cpp:302
Int4 fta_fix_seq_loc_id(TSeqLocList &locs, ParserPtr pp, char *location, const char *name, bool iscon)
Definition: add.cpp:2321
string tata_save(string_view t)
Definition: add.cpp:148
bool no_date(Parser::EFormat format, const TSeqdescList &descrs)
Definition: add.cpp:190
static void fta_tpa_block_free(FTATpaBlockPtr ftbp)
Definition: add.cpp:128
static ValNodePtr fta_tokenize_project(char *str, Parser::ESource source, bool newstyle)
Definition: add.cpp:1536
static CRef< CUser_object > fta_build_structured_comment(char *tag, char *buf)
Definition: add.cpp:2510
void fta_parse_structured_comment(char *str, bool &bad, TUserObjVector &objs)
Definition: add.cpp:2583
static void fta_check_tpa_tsa_coverage(FTATpaBlockPtr ftbp, Int4 length, bool tpa)
Definition: add.cpp:984
void StripECO(string &str)
Definition: add.cpp:2842
CRef< CPatent_seq_id > MakeUsptoPatSeqId(const char *acc)
Definition: asci_blk.cpp:884
void ShrinkSpaces(char *line)
Definition: asci_blk.cpp:118
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
Definition: Dbtag.hpp:53
CDelta_seq –.
Definition: Delta_seq.hpp:66
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
Definition: Seq_entry.hpp:56
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
char * mOffset
Definition: ftablock.h:332
size_t len
Definition: ftablock.h:333
CFlatFileData * mpData
Definition: ftablock.h:331
DataBlk * mpNext
Definition: ftablock.h:336
int mType
Definition: ftablock.h:330
@ ParFlat_FH
Definition: embl.h:52
#define ParFlat_COL_DATA_EMBL
Definition: embl.h:38
#define ERR_REFERENCE_Illegalreference
Definition: flat2err.h:287
#define ERR_LOCATION_TpaAndNonTpa
Definition: flat2err.h:401
#define ERR_TPA_SpanLengthDiff
Definition: flat2err.h:591
#define ERR_COMMENT_SameStructuredCommentTags
Definition: flat2err.h:97
#define ERR_TSA_SpanLengthDiff
Definition: flat2err.h:607
#define ERR_TPA_InvalidPrimarySeqId
Definition: flat2err.h:588
#define ERR_TSA_SpanDiffOver300bp
Definition: flat2err.h:608
#define ERR_ENTRY_TLSLacksBioProjectLink
Definition: flat2err.h:92
#define ERR_FORMAT_InvalidBioProjectAcc
Definition: flat2err.h:73
#define ERR_TPA_IncompleteCoverage
Definition: flat2err.h:590
#define ERR_DBLINK_InvalidIdentifier
Definition: flat2err.h:612
#define ERR_SEQUENCE_HasManyComponents
Definition: flat2err.h:158
#define ERR_LOCATION_CrossDatabaseFeatLoc
Definition: flat2err.h:402
#define ERR_COMMENT_StructuredCommentLacksDelim
Definition: flat2err.h:98
#define ERR_TPA_InvalidPrimarySpan
Definition: flat2err.h:587
#define ERR_ENTRY_TSALacksStructuredComment
Definition: flat2err.h:89
#define ERR_FORMAT_WrongBioProjectPrefix
Definition: flat2err.h:72
#define ERR_LOCATION_SeqIdProblem
Definition: flat2err.h:400
#define ERR_SEQUENCE_MultipleWGSProjects
Definition: flat2err.h:159
#define ERR_TSA_IncompleteCoverage
Definition: flat2err.h:606
#define ERR_ACCESSION_CannotGetDivForSecondary
Definition: flat2err.h:171
#define ERR_ENTRY_TSALacksBioProjectLink
Definition: flat2err.h:90
#define ERR_TPA_SpanDiffOver300bp
Definition: flat2err.h:592
#define ERR_FORMAT_ContigVersusAssemblyGapMissmatch
Definition: flat2err.h:71
#define ERR_TSA_InvalidPrimaryBlock
Definition: flat2err.h:605
#define ERR_TSA_InvalidPrimarySpan
Definition: flat2err.h:603
#define ERR_FEATURE_AllNsBetweenGaps
Definition: flat2err.h:368
#define ERR_FEATURE_InvalidGapSequence
Definition: flat2err.h:369
#define ERR_FORMAT_IncorrectDBLINK
Definition: flat2err.h:69
#define ERR_FEATURE_NsAbutGap
Definition: flat2err.h:367
#define ERR_ENTRY_TLSLacksStructuredComment
Definition: flat2err.h:91
#define ERR_LOCATION_ContigAndScaffold
Definition: flat2err.h:405
#define ERR_ACCESSION_WGSPrefixMismatch
Definition: flat2err.h:177
#define ERR_DBLINK_DuplicateIdentifierRemoved
Definition: flat2err.h:613
#define ERR_SEQUENCE_HTGPossibleShortGap
Definition: flat2err.h:152
#define ERR_TPA_InvalidPrimaryBlock
Definition: flat2err.h:589
#define ERR_SEQUENCE_HTGPhaseZeroHasGap
Definition: flat2err.h:154
#define ERR_COMMENT_InvalidStructuredComment
Definition: flat2err.h:96
#define ERR_KEYWORD_MultipleHTGPhases
Definition: flat2err.h:203
#define ERR_SEQUENCE_HTGWithoutGaps
Definition: flat2err.h:151
#define ERR_TSA_InvalidPrimarySeqId
Definition: flat2err.h:604
std::list< std::string > TKeywordList
Definition: ftablock.h:166
std::list< CRef< objects::CSeqdesc > > TSeqdescList
Definition: ftablock.h:60
std::vector< CRef< objects::CUser_object > > TUserObjVector
Definition: ftablock.h:61
bool StringEquNI(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:125
bool StringEquN(const char *s1, const char *s2, size_t n)
Definition: ftacpp.hpp:115
bool StringEqu(const char *s1, const char *s2)
Definition: ftacpp.hpp:105
void StringNCpy(char *d, const char *s, size_t n)
Definition: ftacpp.hpp:84
void MemFree(char *p)
Definition: ftacpp.hpp:55
size_t StringLen(const char *s)
Definition: ftacpp.hpp:60
char * StringRChr(char *s, const char c)
Definition: ftacpp.hpp:87
void FtaInstallPrefix(int prefix, const char *name, const char *location)
Definition: ftaerr.cpp:319
#define PREFIX_LOCUS
Definition: ftaerr.hpp:15
#define PREFIX_ACCESSION
Definition: ftaerr.hpp:14
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int type
Definition: getdata.c:31
static const char * str(char *buf, int n)
Definition: stats.c:84
int offset
Definition: replacements.h:160
static const char location[]
Definition: config.c:97
@ ParFlat_FEATURES
Definition: genbank.h:51
#define ParFlat_COL_DATA
Definition: genbank.h:37
#define SEV_INFO
Definition: gicache.c:89
#define SEV_WARNING
Definition: gicache.c:90
#define SEV_ERROR
Definition: gicache.c:91
#define SEV_REJECT
Definition: gicache.c:92
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define StringStr
Definition: ncbistr.hpp:322
#define StringSave
Definition: ncbistr.hpp:326
#define ErrPostStr
Definition: ncbierr.hpp:68
#define StringChr
Definition: ncbistr.hpp:317
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
ErrSev
Definition: ncbierr.hpp:63
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
static E_Choice GetAccType(EAccessionInfo info)
Definition: Seq_id.hpp:562
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
Definition: Seq_loc.cpp:2585
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id & GetSeq_id(void) const
Get seq_id of the current location.
Definition: Seq_loc.hpp:1028
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
Definition: Seq_loc.cpp:3467
CSeq_id_Handle GetAccVer(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get accession.version Seq-id Returns null CSeq_id_Handle if the sequence is not found or if it doesn'...
Definition: scope.cpp:413
TBioseqHandles GetBioseqHandles(const TIds &ids)
Get bioseq handles for all ids.
Definition: scope.cpp:143
const TInst_Ext & GetInst_Ext(void) const
bool IsSetInst_Ext(void) const
bool IsSetInst_Repr(void) const
TInst_Repr GetInst_Repr(void) const
TObjectType * GetNCPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1174
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
char Char
Alias for char.
Definition: ncbitype.h:93
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
std::string CStringUTF8
Definition: ncbistl.hpp:254
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
const_iterator end() const
Return an iterator to the string's ending position (one past the end of the represented sequence)
Definition: tempstr.hpp:306
CTempString literal(const char(&str)[Size])
Templatized initialization from a string literal.
Definition: tempstr.hpp:441
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:776
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
static const char label[]
list< string > TExtra_accessions
Definition: GB_block_.hpp:91
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void SetLabel(TLabel &value)
Assign a value to Label data member.
const TData & GetData(void) const
Get the Data member data.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
const TType & GetType(void) const
Get the Type member data.
Tdata & Set(void)
Assign a value to data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Seq_align_.hpp:865
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
void SetTo(TTo value)
Assign a value to To data member.
bool IsGenbank(void) const
Check if variant Genbank is selected.
Definition: Seq_id_.hpp:841
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsTpg(void) const
Check if variant Tpg is selected.
Definition: Seq_id_.hpp:928
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool IsPacked_pnt(void) const
Check if variant Packed_pnt is selected.
Definition: Seq_loc_.hpp:546
bool IsTpd(void) const
Check if variant Tpd is selected.
Definition: Seq_id_.hpp:940
bool IsOther(void) const
Check if variant Other is selected.
Definition: Seq_id_.hpp:871
void SetId(TId &value)
Assign a value to Id data member.
bool IsEmbl(void) const
Check if variant Embl is selected.
Definition: Seq_id_.hpp:847
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
void SetFrom(TFrom value)
Assign a value to From data member.
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
void SetStrand(TStrand value)
Assign a value to Strand data member.
bool IsTpe(void) const
Check if variant Tpe is selected.
Definition: Seq_id_.hpp:934
bool IsPnt(void) const
Check if variant Pnt is selected.
Definition: Seq_loc_.hpp:540
const TAccession & GetAccession(void) const
Get the Accession member data.
bool IsDdbj(void) const
Check if variant Ddbj is selected.
Definition: Seq_id_.hpp:910
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Named_annot_track
Internal named annotation tracking ID.
Definition: Seq_id_.hpp:114
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetDescr(void) const
Check if a value has been assigned to Descr data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
void SetDescr(TDescr &value)
Assign a value to Descr data member.
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
void SetCompleteness(TCompleteness value)
Assign a value to Completeness data member.
Definition: MolInfo_.hpp:600
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
list< CRef< CSeq_align > > TAssembly
Definition: Seq_hist_.hpp:248
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
TTopology GetTopology(void) const
Get the Topology member data.
Definition: Seq_inst_.hpp:733
const TIupacna & GetIupacna(void) const
Get the variant data.
Definition: Seq_data_.hpp:510
const TAnnot & GetAnnot(void) const
Get the Annot member data.
Definition: Bioseq_.hpp:366
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_gap_.hpp:291
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
bool IsSetDescr(void) const
Descriptors. Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
TLinkage_evidence & SetLinkage_evidence(void)
Assign a value to Linkage_evidence data member.
Definition: Seq_gap_.hpp:375
void SetLinkage(TLinkage value)
Assign a value to Linkage data member.
Definition: Seq_gap_.hpp:338
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
void SetTech(TTech value)
Assign a value to Tech data member.
Definition: MolInfo_.hpp:503
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
TMolinfo & SetMolinfo(void)
Select the variant.
Definition: Seqdesc_.cpp:594
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eTech_htgs_2
ordered High Throughput sequence contig
Definition: MolInfo_.hpp:138
@ eTech_htgs_3
finished High Throughput sequence
Definition: MolInfo_.hpp:139
@ eTech_htgs_1
unordered High Throughput sequence contig
Definition: MolInfo_.hpp:137
@ eTech_wgs
whole genome shotgun sequencing
Definition: MolInfo_.hpp:143
@ eTech_htgs_0
single genomic reads for coordination
Definition: MolInfo_.hpp:141
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
CSeq_id::E_Choice GetNucAccOwner(const CTempString &acc)
Definition: indx_blk.cpp:2271
bool isSupportedAccession(CSeq_id::E_Choice type)
Definition: indx_blk.cpp:2251
Int4 IsNewAccessFormat(const Char *acnum)
Definition: indx_blk.cpp:995
int fta_if_wgs_acc(const CTempString &accession)
Definition: indx_blk.cpp:1193
char * buf
int i
int len
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
static const BitmapCharRec ch1
Definition: ncbi_10x20.c:1827
static const BitmapCharRec ch2
Definition: ncbi_10x20.c:1819
#define fseek
EIPRangeType t
Definition: ncbi_localip.c:101
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
std::list< SeqLoc > TSeqLocList
static Format format
Definition: njn_ioutil.cpp:53
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Int4 from2
Definition: add.cpp:112
Int4 version
Definition: add.cpp:111
CSeq_id::E_Choice sicho
Definition: add.cpp:115
ENa_strand strand
Definition: add.cpp:114
Int4 from1
Definition: add.cpp:108
Int4 to2
Definition: add.cpp:113
char * accession
Definition: add.cpp:110
FTATpaBlock * next
Definition: add.cpp:116
Int4 to1
Definition: add.cpp:109
Int4 from
Definition: add.cpp:121
FTATpaSpan * next
Definition: add.cpp:123
Int4 to
Definition: add.cpp:122
Int4 from
Definition: ftablock.h:118
string gap_type
Definition: ftablock.h:124
Int4 to
Definition: ftablock.h:119
GapFeats * next
Definition: ftablock.h:129
objects::CLinkage_evidence::TLinkage_evidence asn_linkage_evidence
Definition: ftablock.h:127
objects::CSeq_gap::TType asn_gap_type
Definition: ftablock.h:126
bool assembly_gap
Definition: ftablock.h:123
bool rightNs
Definition: ftablock.h:122
Int4 estimated_length
Definition: ftablock.h:120
bool leftNs
Definition: ftablock.h:121
Char acnum[200]
Definition: ftablock.h:169
size_t qsoffset
Definition: ftablock.h:232
Int2 htg
Definition: ftablock.h:199
Int2 vernum
Definition: ftablock.h:170
bool drop
Definition: ftablock.h:185
GapFeatsPtr gaps
Definition: ftablock.h:217
Char locusname[200]
Definition: ftablock.h:173
size_t qslength
Definition: ftablock.h:233
vector< IndexblkPtr > entrylist
bool allow_crossdb_featloc
const Char * wgsscaf
Definition: add.cpp:92
Int4 ddbj
Definition: add.cpp:98
const Char * wgscont
Definition: add.cpp:91
Int4 tpd
Definition: add.cpp:102
Int4 genbank
Definition: add.cpp:93
Int4 pir
Definition: add.cpp:95
CSeq_loc * badslp
Definition: add.cpp:89
Int4 tpg
Definition: add.cpp:100
Int4 tpe
Definition: add.cpp:101
Int4 total
Definition: add.cpp:103
Int4 swissprot
Definition: add.cpp:96
Int4 prf
Definition: add.cpp:99
Int4 other
Definition: add.cpp:97
Int4 embl
Definition: add.cpp:94
const Char * wgsacc
Definition: add.cpp:90
ValNode * next
Definition: valnode.h:51
char * data
Definition: valnode.h:49
Definition: type.c:6
#define _ASSERT
CScope & GetScope()
bool SetTextId(Uint1 seqtype, CSeq_id &seqId, CTextseq_id &textId)
Definition: utilfun.cpp:1596
void fta_StringCpy(char *dst, const char *src)
Definition: utilfun.cpp:1585
DataBlkPtr TrackNodeType(const DataBlk &entry, Int2 type)
Definition: utilfun.cpp:1083
void UnwrapAccessionRange(const CGB_block::TExtra_accessions &extra_accs, CGB_block::TExtra_accessions &hist)
Definition: utilfun.cpp:197
ValNodePtr ValNodeNew(ValNodePtr prev, const char *data)
Definition: valnode.cpp:53
ValNodePtr ValNodeFree(ValNodePtr vnp)
Definition: valnode.cpp:76
ValNodePtr ValNodeFreeData(ValNodePtr vnp)
Definition: valnode.cpp:96
Modified on Sun Apr 21 03:41:52 2024 by modify_doxy.py rev. 669887