NCBI C++ ToolKit
utilities.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utilities.cpp 101601 2024-01-10 00:08:13Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mati Shomrat
27  *
28  * File Description:
29  * Implementation of utility classes and functions.
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <corelib/ncbistr.hpp>
35 
36 #include <serial/enumvalues.hpp>
37 #include <serial/serialimpl.hpp>
38 
44 #include <objects/seq/Bioseq.hpp>
48 #include <objmgr/bioseq_handle.hpp>
49 #include <objmgr/scope.hpp>
50 #include <objmgr/seq_vector.hpp>
51 #include <objmgr/util/sequence.hpp>
53 #include <objmgr/bioseq_ci.hpp>
54 #include <objmgr/seqdesc_ci.hpp>
55 #include <objmgr/align_ci.hpp>
63 
64 #include <vector>
65 #include <algorithm>
66 #include <list>
67 
68 
71 BEGIN_SCOPE(validator)
72 
73 
74 // =============================================================================
75 // Functions
76 // =============================================================================
77 
78 
80 {
81  for ( CTypeConstIterator <CBioseq_set> si(se); si; ++si ) {
82  if ( si->GetClass() == clss ) {
83  return true;
84  }
85  }
86  return false;
87 }
88 
89 
90 bool IsDeltaOrFarSeg(const CSeq_loc& loc, CScope* scope)
91 {
92  CBioseq_Handle bsh = BioseqHandleFromLocation(scope, loc);
94 
95  if ( bsh.IsSetInst_Repr() ) {
97  if ( repr == CSeq_inst::eRepr_delta ) {
99  return true;
100  }
101  }
102  if ( repr == CSeq_inst::eRepr_seg ) {
104  return true;
105  }
106  }
107  }
108 
109  return false;
110 }
111 
112 
113 // Check if string is either empty or contains just white spaces
114 bool IsBlankStringList(const list< string >& str_list)
115 {
116  ITERATE( list< string >, str, str_list ) {
117  if ( !NStr::IsBlank(*str) ) {
118  return false;
119  }
120  }
121  return true;
122 }
123 
124 
126 {
127  TGi gi = ZERO_GI;
129  scope->AddDefaults();
130 
131  try {
133  gi = scope->GetGi (idh);
134  } catch (CException &) {
135  } catch (std::exception &) {
136  }
137  return gi;
138 }
139 
140 
141 
143 {
144  CScope::TIds id_list;
145  CSeq_id tmp_id;
146  tmp_id.SetGi(gi);
148  scope->AddDefaults();
149 
150  try {
151  id_list = scope->GetIds(tmp_id);
152 
153  } catch (CException &) {
154  } catch (std::exception &) {
155  }
156  return id_list;
157 }
158 
159 bool IsFarLocation(const CSeq_loc& loc, const CSeq_entry_Handle& seh)
160 {
161  CScope& scope = seh.GetScope();
162  for ( CSeq_loc_CI citer(loc); citer; ++citer ) {
163  CConstRef<CSeq_id> id(&citer.GetSeq_id());
164  if ( id ) {
165  CBioseq_Handle near_seq = scope.GetBioseqHandleFromTSE(*id, seh);
166  if ( !near_seq ) {
167  return true;
168  }
169  }
170  }
171 
172  return false;
173 }
174 
176  const CSeq_loc& loc,
177  CScope& scope)
178 {
179  CNcbiOstrstream oss;
180  CFastaOstream fasta_ostr(oss);
183  string s;
184 
185  try {
186  for (CSeq_loc_CI citer (loc); citer; ++citer) {
187  const CSeq_loc& part = citer.GetEmbeddingSeq_loc();
188  CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, part);
189  if (bsh) {
190  fasta_ostr.WriteSequence (bsh, &part);
191  }
192  }
193  s = CNcbiOstrstreamToString(oss);
194  NStr::ReplaceInPlace(s, "\n", "");
195  } catch (CException&) {
196  s = kEmptyStr;
197  }
198 
199  return s;
200 }
201 
202 
204  const CSeq_loc& loc,
205  CScope& scope,
207 {
209  CSeqMap::CreateSeqMapForSeq_loc(loc, &scope);
210  return CSeqVector(*map, scope, coding, eNa_strand_plus);
211 }
212 
213 
215  const CSeq_feat& feat,
216  CScope& scope,
218  bool product)
219 {
220 
221  if ( (product && !feat.CanGetProduct()) ||
222  (!product && !feat.CanGetLocation()) ) {
223  return CSeqVector();
224  }
225 
226  const CSeq_loc* loc = product ? &feat.GetProduct() : &feat.GetLocation();
227  return GetSequenceFromLoc(*loc, scope, coding);
228 }
229 
230 
231 /***** Calculate Accession for a given object *****/
232 
233 
234 static string s_GetBioseqAcc(const CSeq_id& id, int* version)
235 {
236  try {
237  string label;
238  id.GetLabel(&label, version, CSeq_id::eFasta);
239  return label;
240  } catch (CException&) {
241  return kEmptyStr;
242  }
243 }
244 
245 
246 static string s_GetBioseqAcc(const CBioseq_Handle& handle, int* version)
247 {
248  if (handle) {
249  CConstRef<CSeq_id> seqid = sequence::GetId(handle, sequence::eGetId_Best).GetSeqId();
250  if (seqid) {
251  return s_GetBioseqAcc(*seqid, version);
252  }
253  }
254  return kEmptyStr;
255 }
256 
257 
258 static string s_GetSeq_featAcc(const CSeq_feat& feat, CScope& scope, int* version)
259 {
260  CBioseq_Handle seq = BioseqHandleFromLocation (&scope, feat.GetLocation());
261  if (seq) {
263  if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_parts) {
264  parent = parent.GetParentBioseq_set();
265  if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_segset) {
266  CBioseq_CI m(parent);
267  if (m) {
268  return s_GetBioseqAcc(*m, version);
269  }
270  }
271  }
272  }
273 
274  return s_GetBioseqAcc(seq, version);
275 }
276 
277 
278 //static string s_GetBioseqAcc(const CBioseq& seq, CScope& scope, int* version)
279 //{
280 // CBioseq_Handle handle = scope.GetBioseqHandle(seq);
281 // return s_GetBioseqAcc(handle, version);
282 //}
283 
284 
285 static string s_GetBioseqAcc(const CBioseq& seq, int* version)
286 {
287  auto seqid = sequence::GetId(seq, sequence::eGetId_Best).GetSeqId();
288  if (seqid) {
289  return s_GetBioseqAcc(*seqid, version);
290  }
291  return kEmptyStr;
292 }
293 
294 
295 static const CBioseq* s_GetSeqFromSet(const CBioseq_set& bsst)
296 {
297  if (!bsst.IsSetSeq_set()) {
298  return nullptr;
299  }
300 
301  switch (bsst.GetClass()) {
303  // find the genomic bioseq
304  for (auto pSubEntry : bsst.GetSeq_set()) {
305  if (pSubEntry->IsSeq()) {
306  const auto& inst = pSubEntry->GetSeq().GetInst();
307  if (inst.IsSetMol() && inst.GetMol() == CSeq_inst::eMol_dna) {
308  return &(pSubEntry->GetSeq());
309  }
310  }
311  }
312  break;
314  // find the nucleotide bioseq
315  for (auto pSubEntry : bsst.GetSeq_set()) {
316  if (pSubEntry->IsSeq() && pSubEntry->GetSeq().IsNa()) {
317  return &pSubEntry->GetSeq();
318  } else if (pSubEntry->IsSet() &&
319  pSubEntry->GetSet().IsSetClass() &&
320  pSubEntry->GetSet().GetClass() == CBioseq_set::eClass_segset) {
321  return s_GetSeqFromSet(pSubEntry->GetSet());
322  }
323  }
324 
325  for (auto pSubEntry : bsst.GetSeq_set()) {
326  if (pSubEntry->IsSeq()) {
327  return &pSubEntry->GetSeq();
328  }
329  }
330  break;
332  for (auto pSubEntry : bsst.GetSeq_set()) {
333  if (pSubEntry->IsSeq()) {
334  return &pSubEntry->GetSeq();
335  }
336  }
337  break;
338 
339  default:
340  break;
341  }
342 
343  // In this case, return the first bioseq in the set
345  if (seqit) {
346  return &(*seqit);
347  }
348  return nullptr;
349 }
350 
351 
352 static bool s_IsDescOnSeqEntry (const CSeq_entry& entry, const CSeqdesc& desc)
353 {
354  if (entry.IsSetDescr()) {
355  const auto& descs = entry.GetDescr();
356  for (auto& it : descs.Get()) {
357  if (it->Equals(desc)) {
358  return true;
359  }
360  }
361  }
362  return false;
363 }
364 
365 static string s_GetAccessionForSeqdesc (const CSeq_entry_Handle& seh, const CSeqdesc& desc, int* version)
366 {
367  if (!seh) {
368  return kEmptyStr;\
369  } else if (seh.IsSeq()) {
370  return s_GetBioseqAcc(seh.GetSeq(), version);
371  //return s_GetBioseqAcc(*(seh.GetSeq().GetCompleteBioseq()), version);
372  } else if (s_IsDescOnSeqEntry (*(seh.GetCompleteSeq_entry()), desc)) {
373  const CBioseq* seq = s_GetSeqFromSet(*(seh.GetSet().GetCompleteBioseq_set()));
374  if (seq) {
375  return s_GetBioseqAcc(*seq, version);
376  }
377  } else {
378  CSeq_entry_Handle parent = seh.GetParentEntry();
379  if (parent) {
380  return s_GetAccessionForSeqdesc(parent, desc, version);
381  }
382  }
383  return kEmptyStr;
384 }
385 
386 static
387 bool IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle& bsh, const CSeq_align& align, CScope& scope)
388 {
390  for (CAlign_CI align_it(seh); align_it; ++align_it) {
391  if (&(*align_it) == &align) {
392  return true;
393  }
394  }
395  return false;
396 }
397 
398 
400 {
401  // temporary - to match C Toolkit
402  if (align.IsSetSegs() && align.GetSegs().IsStd()) {
403  return CConstRef<CSeq_id>();
404  }
405  try {
406  if (align.IsSetDim()) {
407  for (int i = 0; i < align.GetDim(); ++i) {
408  const CSeq_id& id = align.GetSeq_id(i);
409  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
410  if (bsh && IsBioseqInSameSeqEntryAsAlign(bsh, align, scope)) {
411  return CConstRef<CSeq_id>(&id);
412  }
413  }
414  } else if (align.IsSetSegs() && align.GetSegs().IsDendiag()) {
415  const CSeq_id& id = *(align.GetSegs().GetDendiag().front()->GetIds()[0]);
416  return CConstRef<CSeq_id>(&id);
417  }
418  // failed to find resolvable ID, use bare ID
419  const CSeq_id& id = align.GetSeq_id(0);
420  return CConstRef<CSeq_id>(&id);
421  } catch (CException& ) {
422  }
423  return CConstRef<CSeq_id>();
424 }
425 
426 
427 string GetAccessionFromBioseq(const CBioseq& bioseq, int* version)
428 {
429  return s_GetBioseqAcc(bioseq, version);
430 }
431 
432 
434 {
435  const CBioseq* seq = s_GetSeqFromSet(bsst);
436  if (seq) {
437  return s_GetBioseqAcc(*seq, version);
438  }
439  return kEmptyStr;
440 }
441 
442 
443 string GetAccessionFromObjects(const CSerialObject* obj, const CSeq_entry* ctx, CScope& scope, int* version)
444 {
445  string empty_acc;
446 
447  if (obj && obj->GetThisTypeInfo() == CSeqdesc::GetTypeInfo() && ctx) {
449  const CSeqdesc& desc = dynamic_cast<const CSeqdesc&>(*obj);
450  string acc = s_GetAccessionForSeqdesc(seh, desc, version);
451  if (!NStr::IsBlank(acc)) {
452  return acc;
453  }
454  }
455 
456  if (ctx) {
457  if (ctx->IsSeq()) {
458  return s_GetBioseqAcc(ctx->GetSeq(), version);
459  } else if (ctx->IsSet()) {
460  const CBioseq* seq = s_GetSeqFromSet(ctx->GetSet());
461  if (seq) {
462  return s_GetBioseqAcc(*seq, version);
463  }
464  }
465  } else if (obj) {
466  if (obj->GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
467  const CSeq_feat& feat = dynamic_cast<const CSeq_feat&>(*obj);
468  return s_GetSeq_featAcc(feat, scope, version);
469  } else if (obj->GetThisTypeInfo() == CBioseq::GetTypeInfo()) {
470  const CBioseq& seq = dynamic_cast<const CBioseq&>(*obj);
471  return s_GetBioseqAcc(seq, version);
472  } else if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
473  const CBioseq_set& bsst = dynamic_cast<const CBioseq_set&>(*obj);
474  const CBioseq* seq = s_GetSeqFromSet(bsst);
475  if (seq) {
476  return s_GetBioseqAcc(*seq, version);
477  }
478  } else if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
479  const CSeq_entry& entry = dynamic_cast<const CSeq_entry&>(*obj);
480  if (entry.IsSeq()) {
481  return s_GetBioseqAcc(entry.GetSeq(), version);
482  } else if (entry.IsSet()) {
483  const CBioseq* seq = s_GetSeqFromSet(entry.GetSet());
484  if (seq) {
485  return s_GetBioseqAcc(*seq, version);
486  }
487  }
488  } else if (obj->GetThisTypeInfo() == CSeq_annot::GetTypeInfo()) {
489  CSeq_annot_Handle ah = scope.GetSeq_annotHandle (dynamic_cast<const CSeq_annot&>(*obj));
490  if (ah) {
492  if (seh) {
493  if (seh.IsSeq()) {
494  return s_GetBioseqAcc(seh.GetSeq(), version);
495  } else if (seh.IsSet()) {
496  CBioseq_set_Handle bsh = seh.GetSet();
497  const CBioseq_set& bsst = *(bsh.GetCompleteBioseq_set());
498  const CBioseq* seq = s_GetSeqFromSet(bsst);
499  if (seq) {
500  return s_GetBioseqAcc(*seq, version);
501  }
502  }
503  }
504  }
505  } else if (obj->GetThisTypeInfo() == CSeq_align::GetTypeInfo()) {
506  const CSeq_align& align = dynamic_cast<const CSeq_align&>(*obj);
508  if (id) {
509  CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
510  if (bsh) {
511  return s_GetBioseqAcc(bsh, version);
512  } else {
513  return s_GetBioseqAcc(*id, version);
514  }
515  }
516  } else if (obj->GetThisTypeInfo() == CSeq_graph::GetTypeInfo()) {
517  const CSeq_graph& graph = dynamic_cast<const CSeq_graph&>(*obj);
518  try {
519  const CSeq_loc& loc = graph.GetLoc();
520  const CSeq_id *id = loc.GetId();
521  if (id) {
522  return s_GetBioseqAcc (*id, version);
523  }
524  } catch (CException& ) {
525  }
526  }
527  }
528  return empty_acc;
529 }
530 
531 
533 {
534  CBioseq_set_Handle gps;
535 
536  CSeq_entry_Handle parent = set.GetParentEntry();
537  if (!parent) {
538  return gps;
539  } else if (!(parent = parent.GetParentEntry())) {
540  return gps;
541  } else if (!parent.IsSet()) {
542  return gps;
543  } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
544  return parent.GetSet();
545  } else {
546  return GetSetParent (parent.GetSet(), set_class);
547  }
548 }
549 
550 
552 {
554 
555  CSeq_entry_Handle parent = bioseq.GetParentEntry();
556  if (!parent) {
557  return set;
558  } else if (!(parent = parent.GetParentEntry())) {
559  return set;
560  } else if (!parent.IsSet()) {
561  return set;
562  } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
563  return parent.GetSet();
564  } else {
565  return GetSetParent (parent.GetSet(), set_class);
566  }
567 }
568 
569 
571 {
573 }
574 
576 {
578 }
579 
580 
582 {
584 }
585 
586 
588 {
590 
591  if (!bioseq_set) {
592  return nuc;
593  }
594  CBioseq_CI bit(bioseq_set, CSeq_inst::eMol_na);
595  if (bit) {
596  nuc = *bit;
597  } else {
598  CSeq_entry_Handle parent = bioseq_set.GetParentEntry();
599  if (parent && (parent = parent.GetParentEntry())
600  && parent.IsSet()) {
601  nuc = GetNucBioseq (parent.GetSet());
602  }
603  }
604  return nuc;
605 }
606 
607 
609 {
611 
612  if (bioseq.IsNucleotide()) {
613  return bioseq;
614  }
615  CSeq_entry_Handle parent = bioseq.GetParentEntry();
616  if (parent && (parent = parent.GetParentEntry())
617  && parent.IsSet()) {
618  nuc = GetNucBioseq (parent.GetSet());
619  }
620  return nuc;
621 }
622 
623 
624 EAccessionFormatError ValidateAccessionString (const string& accession, bool require_version)
625 {
626  if (NStr::IsBlank (accession)) {
627  return eAccessionFormat_null;
628  } else if (accession.length() >= 16) {
630  } else if (accession.length() < 3
631  || ! isalpha (accession.c_str()[0])
632  || ! isupper (accession.c_str()[0])) {
634  }
635 
636  string str = accession;
637  if (NStr::StartsWith (str, "NZ_")) {
638  str = str.substr(3);
639  }
640 
641  const char *cp = str.c_str();
642  int numAlpha = 0;
643 
644  while (isalpha (*cp)) {
645  numAlpha++;
646  cp++;
647  }
648 
649  int numUndersc = 0;
650 
651  while (*cp == '_') {
652  numUndersc++;
653  cp++;
654  }
655 
656  int numDigits = 0;
657  while (isdigit (*cp)) {
658  numDigits++;
659  cp++;
660  }
661 
662  if ((*cp != '\0' && *cp != ' ' && *cp != '.') || numUndersc > 1) {
664  }
665 
666  if (require_version) {
667  if (*cp != '.') {
669  }
670  cp++;
671  int numVersion = 0;
672  while (isdigit (*cp)) {
673  numVersion++;
674  cp++;
675  }
676  if (numVersion < 1) {
678  } else if (*cp != '\0' && *cp != ' ') {
680  }
681  }
682 
683 
684  if (numUndersc == 0) {
685  if ((numAlpha == 1 && numDigits == 5)
686  || (numAlpha == 2 && numDigits == 6)
687  || (numAlpha == 3 && numDigits == 5)
688  || (numAlpha == 4 && numDigits == 8)
689  || (numAlpha == 5 && numDigits == 7)) {
690  return eAccessionFormat_valid;
691  }
692  } else {
693  if (numAlpha != 2 || (numDigits != 6 && numDigits != 8 && numDigits != 9)) {
695  }
696  char first_letter = accession.c_str()[0];
697  char second_letter = accession.c_str()[1];
698  if (first_letter == 'N' || first_letter == 'X' || first_letter == 'Z') {
699  if (second_letter == 'M' || second_letter == 'C'
700  || second_letter == 'T' || second_letter == 'P'
701  || second_letter == 'G' || second_letter == 'R'
702  || second_letter == 'S' || second_letter == 'W'
703  || second_letter == 'Z') {
704  return eAccessionFormat_valid;
705  }
706  }
707  if ((first_letter == 'A' || first_letter == 'Y')
708  && second_letter == 'P') {
709  return eAccessionFormat_valid;
710  }
711  }
712 
714 }
715 
716 
717 bool s_FeatureIdsMatch (const CFeat_id& f1, const CFeat_id& f2)
718 {
719  if (!f1.IsLocal() || !f2.IsLocal()) {
720  return false;
721  }
722 
723  return 0 == f1.GetLocal().Compare(f2.GetLocal());
724 }
725 
726 
// Reports whether `str` contains a parenthesised PubMed reference: the
// first occurrence of the literal "(PMID " followed, before the end of the
// string, by a ')' with at least one decimal digit in between. Characters
// other than digits between the tag and the ')' are tolerated.
//
// An explicit blank-string check is unnecessary: a string without the tag
// (which includes every empty or whitespace-only string) is rejected by
// the search itself.
bool s_StringHasPMID (const std::string& str)
{
    static const char kPmidTag[] = "(PMID ";
    const std::string::size_type kPmidTagLen = sizeof(kPmidTag) - 1;

    const std::string::size_type tag_pos = str.find(kPmidTag);
    if (tag_pos == std::string::npos) {
        return false;
    }

    bool saw_digit = false;
    const char* ptr = str.c_str() + tag_pos + kPmidTagLen;
    for (; *ptr != '\0' && *ptr != ')'; ++ptr) {
        // the cast keeps isdigit well-defined for bytes >= 0x80
        if (isdigit(static_cast<unsigned char>(*ptr))) {
            saw_digit = true;
        }
    }

    return *ptr == ')' && saw_digit;
}
753 
754 
// True when `str` contains at least one of the characters '?', '!', '~'
// or '|'. A single pass over the string replaces four separate searches.
bool HasBadCharacter (const std::string& str)
{
    return str.find_first_of("?!~|") != std::string::npos;
}
766 
767 
// True when the last character of `str` is one of '_', '.', ',', ':' or
// ';'. An empty string has no last character and yields false.
bool EndsWithBadCharacter (const std::string& str)
{
    if (str.empty()) {
        return false;
    }

    switch (str.back()) {
    case '_':
    case '.':
    case ',':
    case ':':
    case ';':
        return true;
    default:
        return false;
    }
}
778 
779 
780 int CheckDate (const CDate& date, bool require_full_date)
781 {
782  int rval = eDateValid_valid;
783 
784  if (date.IsStr()) {
785  if (NStr::IsBlank (date.GetStr()) || NStr::Equal (date.GetStr(), "?")) {
786  rval |= eDateValid_bad_str;
787  }
788  } else if (date.IsStd()) {
789  const auto& sdate = date.GetStd();
790  if (!sdate.IsSetYear() || sdate.GetYear() < 1000) {
791  rval |= eDateValid_bad_year;
792  }
793  if (sdate.IsSetMonth() && sdate.GetMonth() > 12) {
794  rval |= eDateValid_bad_month;
795  }
796  if (sdate.IsSetDay() && sdate.GetDay() > 31) {
797  rval |= eDateValid_bad_day;
798  }
799  if (require_full_date) {
800  if (!sdate.IsSetMonth() || sdate.GetMonth() == 0) {
801  rval |= eDateValid_bad_month;
802  }
803  if (!sdate.IsSetDay() || sdate.GetDay() == 0) {
804  rval |= eDateValid_bad_day;
805  }
806  }
807  if (sdate.IsSetSeason() && !NStr::IsBlank (sdate.GetSeason())) {
808  const char * cp = sdate.GetSeason().c_str();
809  while (*cp != 0) {
810  if (isalpha (*cp) || *cp == '-') {
811  // these are the only acceptable characters
812  } else {
813  rval |= eDateValid_bad_season;
814  break;
815  }
816  ++cp;
817  }
818  }
819  } else {
820  rval |= eDateValid_bad_other;
821  }
822  return rval;
823 }
824 
825 
826 bool IsDateInPast(const CDate& date)
827 {
828  time_t t;
829  time(&t);
830  struct tm *tm;
831  tm = localtime(&t);
832 
833  bool in_past = false;
834  if (!date.IsStd()) {
835  return false;
836  }
837  const auto & sdate = date.GetStd();
838  if (sdate.GetYear() < tm->tm_year + 1900) {
839  in_past = true;
840  } else if (sdate.GetYear() == tm->tm_year + 1900
841  && sdate.IsSetMonth()) {
842  if (sdate.GetMonth() < tm->tm_mon + 1) {
843  in_past = true;
844  } else if (sdate.GetMonth() == tm->tm_mon + 1
845  && sdate.IsSetDay()) {
846  if (sdate.GetDay() < tm->tm_mday) {
847  in_past = true;
848  }
849  }
850  }
851  return in_past;
852 }
853 
854 
856 {
857  string reasons;
858 
860  reasons += "EMPTY_DATE ";
861  }
862  if (flags & eDateValid_bad_str) {
863  reasons += "BAD_STR ";
864  }
865  if (flags & eDateValid_bad_year) {
866  reasons += "BAD_YEAR ";
867  }
868  if (flags & eDateValid_bad_month) {
869  reasons += "BAD_MONTH ";
870  }
871  if (flags & eDateValid_bad_day) {
872  reasons += "BAD_DAY ";
873  }
875  reasons += "BAD_SEASON ";
876  }
877  if (flags & eDateValid_bad_other) {
878  reasons += "BAD_OTHER ";
879  }
880  return reasons;
881 }
882 
883 
884 bool IsBioseqTSA (const CBioseq& seq, CScope* scope)
885 {
886  if (!scope) {
887  return false;
888  }
889  bool is_tsa = false;
890  CBioseq_Handle bsh = scope->GetBioseqHandle(seq);
891  if (bsh) {
892  CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_Molinfo);
893  while (desc_ci && !is_tsa) {
894  if (desc_ci->GetMolinfo().IsSetTech() && desc_ci->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
895  is_tsa = true;
896  }
897  ++desc_ci;
898  }
899  }
900  return is_tsa;
901 }
902 
903 
904 #if 0
905 // disabled for now
906 bool IsNCBIFILESeqId (const CSeq_id& id)
907 {
908  if (!id.IsGeneral() || !id.GetGeneral().IsSetDb()
909  || !NStr::Equal(id.GetGeneral().GetDb(), "NCBIFILE")) {
910  return false;
911  } else {
912  return true;
913  }
914 }
915 #endif
916 
917 
918 bool IsAccession(const CSeq_id& id)
919 {
920  if (id.GetTextseq_Id()) {
921  return true;
922  } else {
923  return false;
924  }
925 }
926 
927 
928 static void UpdateToBestId(CSeq_loc& loc, CScope& scope)
929 {
930  bool any_change = false;
931  CSeq_loc_I it(loc);
932  for (; it; ++it) {
933  const CSeq_id& id = it.GetSeq_id();
934  if (!IsAccession(id)) {
935  CConstRef<CSeq_id> best_id;
936  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
937  if (bsh) {
938  const auto& ids = bsh.GetCompleteBioseq()->GetId();
939  for (auto& id_it : ids) {
940  if (IsAccession(*id_it)) {
941  best_id = id_it;
942  break;
943  }
944  }
945  }
946  if (best_id) {
947  it.SetSeq_id(*best_id);
948  any_change = true;
949  }
950  }
951  }
952  if (any_change) {
953  loc.Assign(*it.MakeSeq_loc());
954  }
955 }
956 
957 
958 string GetValidatorLocationLabel (const CSeq_loc& loc, CScope& scope)
959 {
960  string loc_label;
961  if (loc.IsWhole()) {
962  CBioseq_Handle bsh = scope.GetBioseqHandle(loc.GetWhole());
963  if (bsh) {
964  loc_label = GetBioseqIdLabel(*(bsh.GetCompleteBioseq()));
965  NStr::ReplaceInPlace(loc_label, "[", "");
966  NStr::ReplaceInPlace(loc_label, "]", "");
967  }
968  }
969  if (NStr::IsBlank(loc_label)) {
970  CSeq_loc tweaked_loc;
971  tweaked_loc.Assign(loc);
972  UpdateToBestId(tweaked_loc, scope);
973  tweaked_loc.GetLabel(&loc_label);
974  NStr::ReplaceInPlace(loc_label, "[", "(");
975  NStr::ReplaceInPlace(loc_label, "]", ")");
976  }
977  return loc_label;
978 }
979 
980 
981 string GetBioseqIdLabel(const CBioseq& sq)
982 {
983  string content;
984  int num_ids_found = 0;
985  bool id_found = false;
986 
987  const auto& id_list = sq.GetId();
988 
989  /* find first gi */
990  for (auto id_it : id_list) {
991  if (id_it->IsGi()) {
992  CNcbiOstrstream os;
993  id_it->WriteAsFasta(os);
994  string s = CNcbiOstrstreamToString(os);
995  content += s;
996  num_ids_found ++;
997  break;
998  }
999  }
1000  /* find first accession */
1001  for (auto id_it : id_list) {
1002  if (id_it->IsGenbank()
1003  || id_it->IsDdbj()
1004  || id_it->IsEmbl()
1005  || id_it->IsSwissprot()
1006  || id_it->IsOther()
1007  || id_it->IsTpd()
1008  || id_it->IsTpe()
1009  || id_it->IsTpg()) {
1010  if (num_ids_found > 0) {
1011  content += "|";
1012  }
1013  CNcbiOstrstream os;
1014  id_it->WriteAsFasta(os);
1015  string s = CNcbiOstrstreamToString(os);
1016  content += s;
1017  num_ids_found++;
1018  break;
1019  }
1020  }
1021 
1022  if (num_ids_found == 0) {
1023  /* find first general */
1024  for (auto id_it : id_list) {
1025  if (id_it->IsGeneral()) {
1026  if (num_ids_found > 0) {
1027  content += "|";
1028  }
1029  CNcbiOstrstream os;
1030  id_it->WriteAsFasta(os);
1031  string s = CNcbiOstrstreamToString(os);
1032  content += s;
1033  num_ids_found++;
1034  break;
1035  }
1036  }
1037  }
1038  // didn't find any? print them all, but only the first local
1039  if (num_ids_found == 0) {
1040  bool found_local = false;
1041  for (auto id_it : id_list) {
1042  if (id_it->IsLocal()) {
1043  if (found_local) {
1044  continue;
1045  } else {
1046  found_local = true;
1047  }
1048  }
1049  if (id_found) {
1050  content += "|";
1051  }
1052  CNcbiOstrstream os;
1053  id_it->WriteAsFasta(os);
1054  string s = CNcbiOstrstreamToString(os);
1055  content += s;
1056  id_found = true;
1057  }
1058  }
1059 
1060  return content;
1061 }
1062 
1063 
1064 void AppendBioseqLabel(string& str, const CBioseq& sq, bool supress_context)
1065 {
1066  str += "BIOSEQ: ";
1067 
1068  string content = GetBioseqIdLabel (sq);
1069 
1070  if (!supress_context) {
1071  if (!content.empty()) {
1072  content += ": ";
1073  }
1074 
1075  const CEnumeratedTypeValues* tv;
1076  tv = CSeq_inst::ENUM_METHOD_NAME(ERepr)();
1077  const CSeq_inst& inst = sq.GetInst();
1078  content += tv->FindName(inst.GetRepr(), true) + ", ";
1079  tv = CSeq_inst::ENUM_METHOD_NAME(EMol)();
1080  content += tv->FindName(inst.GetMol(), true);
1081  if (inst.IsSetLength()) {
1082  content += string(" len= ") + NStr::IntToString(inst.GetLength());
1083  }
1084  }
1085  str += content;
1086 }
1087 
// Scans `str` for something shaped like an EC number: a run of fields
// separated by exactly three '.' characters, terminated by any other
// character or by the end of the string.
//
// Scanner state:
//   numdigits / numdashes - digits and ambiguity marks ('-' or 'n') seen in
//                           the field currently being read
//   numperiods            - '.' separators accepted so far in this run
//   is_ambig              - an ambiguity mark has been seen and not yet
//                           cleared by a reset
//
// A '.' closing a field that is empty, mixes digits with ambiguity marks,
// or holds more than one ambiguity mark restarts the run. A digit arriving
// while is_ambig is set restarts it as well. The run is accepted when three
// separators have been counted and the last field is either digits only or
// exactly one ambiguity mark.
//
// Empty and whitespace-only strings never match: whitespace is an ordinary
// terminator and no separators will have been counted.
bool HasECnumberPattern (const std::string& str)
{
    if (str.empty()) {
        return false;
    }

    bool is_ambig = false;
    int numdashes = 0;
    int numdigits = 0;
    int numperiods = 0;

    // Does the run read so far form a complete pattern, given that the
    // current field has just ended?
    const auto run_is_complete = [&]() {
        return numperiods == 3
            && ((numdigits > 0 && numdashes == 0)
                || (numdigits == 0 && numdashes == 1));
    };

    for (const char ch : str) {
        // the cast keeps isdigit well-defined for bytes >= 0x80
        if (isdigit (static_cast<unsigned char>(ch))) {
            numdigits++;
            if (is_ambig) {
                is_ambig = false;
                numperiods = 0;
                numdashes = 0;
            }
        } else if (ch == '-' || ch == 'n') {
            numdashes++;
            is_ambig = true;
        } else if (ch == '.') {
            numperiods++;
            const bool bad_field = (numdigits > 0 && numdashes > 0)
                                || (numdigits == 0 && numdashes == 0)
                                || numdashes > 1;
            if (bad_field) {
                is_ambig = false;
                numperiods = 0;
            }
            numdigits = 0;
            numdashes = 0;
        } else {
            if (run_is_complete()) {
                return true;
            }
            is_ambig = false;
            numperiods = 0;
            numdigits = 0;
            numdashes = 0;
        }
    }

    // the string may end right after the last field
    return run_is_complete();
}
1153 
1154 
1155 bool SeqIsPatent (const CBioseq& seq)
1156 {
1157  bool is_patent = false;
1158 
1159  // some tests are suppressed if a patent ID is present
1160  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
1161  if ((*id_it)->IsPatent()) {
1162  is_patent = true;
1163  break;
1164  }
1165  }
1166  return is_patent;
1167 }
1168 
1169 
1170 bool SeqIsPatent (const CBioseq_Handle& seq)
1171 {
1172  return SeqIsPatent (*(seq.GetCompleteBioseq()));
1173 }
1174 
1175 
1177  CScope* scope,
1178  const CSeq_loc& loc,
1179  unsigned int tag,
1180  bool only_gap
1181 )
1182 
1183 {
1185  return false;
1186  }
1187 
1189  for ( CSeq_loc_CI sl_iter(loc); sl_iter; ++sl_iter ) { // EQUIV_IS_ONE not supported
1190  if ( !first ) {
1191  first = sl_iter;
1192  }
1193  last = sl_iter;
1194  }
1195 
1196  if ( first.GetStrand() != last.GetStrand() ) {
1197  return false;
1198  }
1200 
1201  if (!scope) {
1202  return false;
1203  }
1204 
1206  if (!slp) {
1207  return false;
1208  }
1209  const CSeq_id* id = slp->GetId();
1210  if (!id) {
1211  return false;
1212  }
1213  CBioseq_Handle bsh = scope->GetBioseqHandle(*id);
1214  if (!bsh) {
1215  return false;
1216  }
1217 
1218  TSeqPos acceptor = temp.GetRange().GetFrom();
1219  TSeqPos donor = temp.GetRange().GetTo();
1220  TSeqPos start = acceptor;
1221  TSeqPos stop = donor;
1222 
1224  temp.GetStrand());
1225  TSeqPos len = vec.size();
1226 
1227  if ( temp.GetStrand() == eNa_strand_minus ) {
1228  swap(acceptor, donor);
1229  stop = len - donor - 1;
1230  start = len - acceptor - 1;
1231  }
1232 
1233  bool result = false;
1234 
1235  try {
1236  if (tag == sequence::eSeqlocPartial_Nostop && stop < len - 1 && vec.IsInGap(stop + 1)) {
1237  return true;
1238  } else if (tag == sequence::eSeqlocPartial_Nostart && start > 0 && start < len && vec.IsInGap(start - 1)) {
1239  return true;
1240  }
1241  } catch ( exception& ) {
1242  return false;
1243  }
1244  if (only_gap) {
1245  return false;
1246  }
1247 
1248  if ( (tag == sequence::eSeqlocPartial_Nostop) && (stop < len - 2) ) {
1249  try {
1250  CSeqVector::TResidue res = vec[stop + 1];
1251 
1252  if ( IsResidue(res) && isalpha (res)) {
1253  if ( res == 'N' ) {
1254  result = true;
1255  }
1256  }
1257  } catch ( exception& ) {
1258  return false;
1259  }
1260  } else if ( (tag == sequence::eSeqlocPartial_Nostart) && (start > 1) ) {
1261  try {
1262  CSeqVector::TResidue res = vec[start - 1];
1263  if ( IsResidue(res) && isalpha (res)) {
1264  if ( res == 'N' ) {
1265  result = true;
1266  }
1267  }
1268  } catch ( exception& ) {
1269  return false;
1270  }
1271  }
1272 
1273  return result;
1274 }
1275 
1276 
1278 
1279 {
1280  CBioseq_Handle bsh;
1281  for ( CSeq_loc_CI citer (loc); citer; ++citer) {
1282  const CSeq_id& id = citer.GetSeq_id();
1285  if (bsh) {
1286  return bsh;
1287  }
1288  }
1289  return bsh;
1290 }
1291 
1292 static
1293 bool s_PosIsNNotGap(const CSeqVector& vec, unsigned int pos)
1294 {
1295  if (pos >= vec.size()) {
1296  return false;
1297  } else if (vec[pos] != 'N' && vec[pos] != 'n') {
1298  return false;
1299  } else if (vec.IsInGap(pos)) {
1300  return false;
1301  } else {
1302  return true;
1303  }
1304 }
1305 
1306 
1308 {
1309  if (!bsh || bsh.GetInst_Length() < 10 || (bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular)) {
1310  return false;
1311  } else {
1312  return true;
1313  }
1314 }
1315 
1316 
1318  const CSeqVector& vec,
1319  EBioseqEndIsType& begin_n,
1320  EBioseqEndIsType& begin_gap,
1321  EBioseqEndIsType& end_n,
1322  EBioseqEndIsType& end_gap,
1323  bool& begin_ambig,
1324  bool& end_ambig)
1325 {
1326  begin_n = eBioseqEndIsType_None;
1327  begin_gap = eBioseqEndIsType_None;
1328  end_n = eBioseqEndIsType_None;
1329  end_gap = eBioseqEndIsType_None;
1330  begin_ambig = false;
1331  end_ambig = false;
1332 
1333  if (vec.size() < 10) {
1334  return;
1335  }
1336 
1337  try {
1338 
1339  // check for gap at beginning of sequence
1340  if (vec.IsInGap(0) /* || vec.IsInGap(1) */) {
1341  begin_gap = eBioseqEndIsType_All;
1342  for (int i = 0; i < 10; i++) {
1343  if (!vec.IsInGap(i)) {
1344  begin_gap = eBioseqEndIsType_Last;
1345  break;
1346  }
1347  }
1348  }
1349 
1350  // check for gap at end of sequence
1351  if ( /* vec.IsInGap (vec.size() - 2) || */ vec.IsInGap(vec.size() - 1)) {
1352  end_gap = eBioseqEndIsType_All;
1353  for (unsigned int i = vec.size() - 11; i < vec.size(); i++) {
1354  if (!vec.IsInGap(i)) {
1355  end_gap = eBioseqEndIsType_Last;
1356  break;
1357  }
1358  }
1359  }
1360 
1361  if (vec.IsNucleotide()) {
1362  // check for N bases at beginning of sequence
1363  if (s_PosIsNNotGap(vec, 0) /* || s_PosIsNNotGap(vec, 1) */) {
1364  begin_n = eBioseqEndIsType_All;
1365  for (unsigned int i = 0; i < 10; i++) {
1366  if (!s_PosIsNNotGap(vec, i)) {
1367  begin_n = eBioseqEndIsType_Last;
1368  break;
1369  }
1370  }
1371  }
1372 
1373  // check for N bases at end of sequence
1374  if ( /* s_PosIsNNotGap(vec, vec.size() - 2) || */ s_PosIsNNotGap(vec, vec.size() - 1)) {
1375  end_n = eBioseqEndIsType_All;
1376  for (unsigned int i = vec.size() - 10; i < vec.size(); i++) {
1377  if (!s_PosIsNNotGap(vec, i)) {
1378  end_n = eBioseqEndIsType_Last;
1379  break;
1380  }
1381  }
1382  }
1383 
1384  // check for ambiguous concentration
1385  size_t check_len = 50;
1386  if (vec.size() < 50) {
1387  check_len = vec.size();
1388  }
1389  size_t num_ns = 0;
1390  for (TSeqPos i = 0; i < check_len; i++) {
1391  if (vec[i] == 'N') {
1392  num_ns++;
1393  if (num_ns >= 5 && i < 10) {
1394  begin_ambig = true;
1395  break;
1396  } else if (num_ns >= 15) {
1397  begin_ambig = true;
1398  break;
1399  }
1400  }
1401  }
1402  num_ns = 0;
1403  for (TSeqPos i = 0; i < check_len; i++) {
1404  if (vec[vec.size() - i - 1] == 'N') {
1405  num_ns++;
1406  if (num_ns >= 5 && i < 10) {
1407  end_ambig = true;
1408  break;
1409  } else if (num_ns >= 15) {
1410  end_ambig = true;
1411  break;
1412  }
1413  }
1414  }
1415  }
1416  } catch (exception&) {
1417  // if there are exceptions, cannot perform this calculation
1418  }
1419 }
1420 
1421 
1423  const CBioseq_Handle& bsh,
1424  EBioseqEndIsType& begin_n,
1425  EBioseqEndIsType& begin_gap,
1426  EBioseqEndIsType& end_n,
1427  EBioseqEndIsType& end_gap,
1428  bool& begin_ambig,
1429  bool& end_ambig)
1430 {
1431  begin_n = eBioseqEndIsType_None;
1432  begin_gap = eBioseqEndIsType_None;
1433  end_n = eBioseqEndIsType_None;
1434  end_gap = eBioseqEndIsType_None;
1435  begin_ambig = false;
1436  end_ambig = false;
1437  if (!ShouldCheckForNsAndGap(bsh)) {
1438  return;
1439  }
1440 
1441  try {
1442  // check for gap at begining of sequence
1444  CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
1445  } catch ( exception& ) {
1446  // if there are exceptions, cannot perform this calculation
1447  }
1448 }
1449 
1450 
1451 bool IsLocFullLength (const CSeq_loc& loc, const CBioseq_Handle& bsh)
1452 {
1453  if (loc.IsInt()
1454  && loc.GetInt().GetFrom() == 0
1455  && loc.GetInt().GetTo() == bsh.GetInst_Length() - 1) {
1456  return true;
1457  } else {
1458  return false;
1459  }
1460 }
1461 
1462 
1463 bool PartialsSame (const CSeq_loc& loc1, const CSeq_loc& loc2)
1464 {
1465  bool loc1_partial_start =
1467  bool loc1_partial_stop =
1469  bool loc2_partial_start =
1471  bool loc2_partial_stop =
1473  if (loc1_partial_start == loc2_partial_start &&
1474  loc1_partial_stop == loc2_partial_stop) {
1475  return true;
1476  } else {
1477  return false;
1478  }
1479 }
1480 
1481 
1482 
1483 
1484 // Code for finding duplicate features
1485 bool s_IsSameStrand(const CSeq_loc& l1, const CSeq_loc& l2, CScope& scope)
1486 {
1487  ENa_strand s1 = sequence::GetStrand(l1, &scope);
1488  ENa_strand s2 = sequence::GetStrand(l2, &scope);
1489  if ((s1 == eNa_strand_minus && s2 == eNa_strand_minus)
1490  || (s1 != eNa_strand_minus && s2 != eNa_strand_minus)) {
1491  return true;
1492  } else {
1493  return false;
1494  }
1495 }
1496 
1497 
1498 inline
1499 bool s_IsSameSeqAnnot(const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2, bool& diff_descriptions)
1500 {
1501  const auto& annot1 = f1.GetAnnot();
1502  const auto& annot2 = f2.GetAnnot();
1503  bool rval = annot1 == annot2;
1504  diff_descriptions = false;
1505  if (!rval) {
1506  if ((!annot1.Seq_annot_IsSetDesc() || annot1.Seq_annot_GetDesc().Get().empty()) &&
1507  (!annot2.Seq_annot_IsSetDesc() || annot2.Seq_annot_GetDesc().Get().empty())) {
1508  // neither is set
1509  diff_descriptions = false;
1510  } else if (annot1.Seq_annot_IsSetDesc() && annot2.Seq_annot_IsSetDesc()) {
1511  // both are set - are they different?
1512  const auto d1 = annot1.Seq_annot_GetDesc().Get().front();
1513  const auto d2 = annot2.Seq_annot_GetDesc().Get().front();
1514  if (d1->Which() != d2->Which()) {
1515  diff_descriptions = true;
1516  } else {
1517  if (d1->IsName()
1518  && NStr::EqualNocase(d1->GetName(), d2->GetName())) {
1519  diff_descriptions = false;
1520  } else if (d1->IsTitle()
1521  && NStr::EqualNocase(d1->GetTitle(), d2->GetTitle())) {
1522  diff_descriptions = false;
1523  } else {
1524  diff_descriptions = true;
1525  }
1526  }
1527  } else {
1528  diff_descriptions = true;
1529  }
1530  }
1531  return rval;
1532 }
1533 
1534 
1535 bool s_AreGBQualsIdentical(const CSeq_feat_Handle& feat1, const CSeq_feat_Handle& feat2, bool case_sensitive)
1536 {
1537  if (!feat1.IsSetQual() || !feat2.IsSetQual()) {
1538  return true;
1539  }
1540 
1541  bool rval = true;
1542 
1543  CSeq_feat::TQual::const_iterator gb1 = feat1.GetQual().begin();
1544  CSeq_feat::TQual::const_iterator gb1_end = feat1.GetQual().end();
1545  CSeq_feat::TQual::const_iterator gb2 = feat2.GetQual().begin();
1546  CSeq_feat::TQual::const_iterator gb2_end = feat2.GetQual().end();
1547 
1548  while ((gb1 != gb1_end) && (gb2 != gb2_end) && rval) {
1549  if (!(*gb1)->IsSetQual()) {
1550  if ((*gb2)->IsSetQual()) {
1551  rval = false;
1552  }
1553  } else if (!(*gb2)->IsSetQual()) {
1554  rval = false;
1555  } else if (!NStr::Equal ((*gb1)->GetQual(), (*gb2)->GetQual())) {
1556  rval = false;
1557  }
1558  if (rval) {
1559  string v1 = (*gb1)->IsSetVal() ? (*gb1)->GetVal() : "";
1560  string v2 = (*gb2)->IsSetVal() ? (*gb2)->GetVal() : "";
1563  rval = NStr::Equal(v1, v2, case_sensitive ? NStr::eCase : NStr::eNocase);
1564  }
1565  ++gb1;
1566  ++gb2;
1567  }
1568  if (gb1 != gb1_end || gb2 != gb2_end) {
1569  rval = false;
1570  }
1571 
1572  return rval;
1573 }
1574 
1575 
1576 bool s_AreFeatureLabelsSame(const CSeq_feat_Handle& feat, const CSeq_feat_Handle& prev, bool case_sensitive)
1577 {
1578  if (!feat.GetData().Equals(prev.GetData())) {
1579  return false;
1580  }
1581 
1582  // compare labels and comments
1583  bool same_label = true;
1584  const string& curr_comment =
1585  feat.IsSetComment() ? feat.GetComment() : kEmptyStr;
1586  const string& prev_comment =
1587  prev.IsSetComment() ? prev.GetComment() : kEmptyStr;
1588  string curr_label;
1589  string prev_label;
1590 
1591  feature::GetLabel(*(feat.GetSeq_feat()),
1592  &curr_label, feature::fFGL_Content, &(feat.GetScope()));
1593  feature::GetLabel(*(prev.GetSeq_feat()),
1594  &prev_label, feature::fFGL_Content, &(prev.GetScope()));
1595 
1596  bool comments_same = NStr::Equal(curr_comment, prev_comment, case_sensitive ? NStr::eCase : NStr::eNocase);
1597  bool labels_same = NStr::Equal(curr_label, prev_label, case_sensitive ? NStr::eCase : NStr::eNocase);
1598 
1599  if (!comments_same || !labels_same) {
1600  same_label = false;
1601  } else if (!s_AreGBQualsIdentical(feat, prev, case_sensitive)) {
1602  same_label = false;
1603  }
1604  return same_label;
1605 }
1606 
1607 
1608 bool s_IsDifferentDbxrefs(const TDbtags& list1, const TDbtags& list2)
1609 {
1610  if (list1.empty() || list2.empty()) {
1611  return false;
1612  } else if (list1.size() != list2.size()) {
1613  return true;
1614  }
1615 
1616  TDbtags::const_iterator it1 = list1.begin();
1617  TDbtags::const_iterator it2 = list2.begin();
1618  for (; it1 != list1.end(); ++it1, ++it2) {
1619  if (!NStr::EqualNocase((*it1)->GetDb(), (*it2)->GetDb())) {
1620  return true;
1621  }
1622  string str1 =
1623  (*it1)->GetTag().IsStr() ? (*it1)->GetTag().GetStr() : "";
1624  string str2 =
1625  (*it2)->GetTag().IsStr() ? (*it2)->GetTag().GetStr() : "";
1626  if ( str1.empty() && str2.empty() ) {
1627  if (!(*it1)->GetTag().IsId() && !(*it2)->GetTag().IsId()) {
1628  continue;
1629  } else if ((*it1)->GetTag().IsId() && (*it2)->GetTag().IsId()) {
1630  if ((*it1)->GetTag().GetId() != (*it2)->GetTag().GetId()) {
1631  return true;
1632  }
1633  } else {
1634  return true;
1635  }
1636  } else if (!str1.empty() && !str2.empty() && !NStr::EqualNocase(str1, str2)) {
1637  return true;
1638  }
1639  }
1640  return false;
1641 }
1642 
1643 
1645 {
1646  const auto & f1data = f1.GetData();
1647  const auto & f2data = f2.GetData();
1648  if (!f1data.IsCdregion() || !f2data.IsCdregion()) {
1649  return false;
1650  }
1651  const auto & cd1 = f1data.GetCdregion();
1652  const auto & cd2 = f2data.GetCdregion();
1653 
1654  int frame1 = 1, frame2 = 1;
1655  if (cd1.IsSetFrame()) {
1656  frame1 = cd1.GetFrame();
1657  if (frame1 == 0) {
1658  frame1 = 1;
1659  }
1660  }
1661  if (cd2.IsSetFrame()) {
1662  frame2 = cd2.GetFrame();
1663  if (frame2 == 0) {
1664  frame2 = 1;
1665  }
1666  }
1667  if (frame1 == frame2) {
1668  return false;
1669  }
1670 
1672  if (!IsLocFullLength (f1.GetLocation(), bsh1)) {
1673  return false;
1674  }
1676  if (!IsLocFullLength (f2.GetLocation(), bsh2)) {
1677  return false;
1678  }
1679 
1680  return true;
1681 }
1682 
1683 
1684 //LCOV_EXCL_START
1685 // never used, because different variations generate different labels
1687 {
1688  string replace;
1689  ITERATE(CSeq_feat::TQual, q, quals) {
1690  if ((*q)->IsSetQual() && NStr::Equal((*q)->GetQual(), "replace") && (*q)->IsSetVal()) {
1691  if (NStr::IsBlank((*q)->GetVal())) {
1692  replace += " ";
1693  } else {
1694  replace += (*q)->GetVal();
1695  }
1696  replace += ".";
1697  }
1698  }
1699  return replace;
1700 }
1701 
1702 
1704 {
1707  return false;
1708  }
1709  if (!f1.IsSetQual() || !f2.IsSetQual()) {
1710  return false;
1711  }
1712  string replace1 = s_ReplaceListFromQuals(f1.GetQual());
1713  string replace2 = s_ReplaceListFromQuals(f2.GetQual());
1714 
1715  if (!NStr::Equal(replace1, replace2)) {
1716  return true;
1717  } else {
1718  return false;
1719  }
1720 }
1721 //LCOV_EXCL_STOP
1722 
1723 
1724 typedef vector<CConstRef<CObject_id> > TFeatIdVec;
1726 {
1727  bool rval = false;
1728 
1729  if (f1.GetData().GetSubtype() == s1 && f2.GetData().GetSubtype() == s1) {
1730  CScope& scope = f1.GetScope();
1731  const CSeq_loc& loc = f1.GetLocation();
1732  CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, loc);
1733  if (bsh) {
1734  const CTSE_Handle& tse = bsh.GetTSE_Handle();
1735  TFeatIdVec mrna1_id;
1736  TFeatIdVec mrna2_id;
1737  list<CSeq_feat_Handle> mrna1;
1738  list<CSeq_feat_Handle> mrna2;
1739 
1741  if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1742  const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1743  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1744  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1745  if (feat_it->IsSetData()
1746  && feat_it->GetData().GetSubtype() == s2) {
1747  mrna1.push_back(*feat_it);
1748  CConstRef<CObject_id> f(&feat_id);
1749  mrna1_id.push_back (f);
1750  break;
1751  }
1752  }
1753  }
1754  }
1756  if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1757  const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1758  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1759  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1760  if (feat_it->IsSetData()
1761  && feat_it->GetData().GetSubtype() == s2) {
1762  mrna2.push_back(*feat_it);
1763  CConstRef<CObject_id> f(&feat_id);
1764  mrna2_id.push_back (f);
1765  }
1766  }
1767  }
1768  }
1769 
1770  if (mrna1_id.size() > 0 && mrna2_id.size() > 0) {
1771  rval = true;
1772  for (auto i1 = mrna1_id.begin(); i1 != mrna1_id.end(); ++i1) {
1773  for (auto i2 = mrna2_id.begin(); i2 != mrna2_id.end(); ++i2) {
1774  if ((*i1)->Equals(**i2)) {
1775  rval = false;
1776  break;
1777  }
1778  }
1779  if (!rval) {
1780  break;
1781  }
1782  }
1783 
1784  if (rval) { // Check that locations aren't the same
1785  const CSeq_feat_Handle fh1 = mrna1.front();
1786  const CSeq_feat_Handle fh2 = mrna2.front();
1787 
1788 
1789  if (s_IsSameStrand(fh1.GetLocation(),
1790  fh2.GetLocation(),
1791  fh1.GetScope())
1792  && (sequence::Compare(fh1.GetLocation(),
1793  fh2.GetLocation(),
1794  &(fh1.GetScope()),
1796  rval = false;
1797  }
1798  }
1799  }
1800  }
1801  }
1802  return rval;
1803 }
1804 
1805 
1807 {
1809 }
1810 
1811 
1813 {
1815 }
1816 
1817 
1819 {
1820  if ( f.GetData().GetSubtype() != CSeqFeatData::eSubtype_gene ) return false;
1821  return IsDicistronic(f);
1822 }
1823 
1824 
1826 {
1827  if (!f.IsSetExcept()) return false;
1828  if (!f.IsSetExcept_text()) return false;
1829 
1830  const string& except_text = f.GetExcept_text();
1831  if (NStr::FindNoCase(except_text, "dicistronic gene") == NPOS) return false;
1832 
1833  return true;
1834 }
1835 
1836 
1839  const CSeq_feat_Handle& f1,
1840  const CSeq_feat_Handle& f2,
1841  bool check_partials,
1842  bool case_sensitive)
1843 {
1844 
1846 
1847  // subtypes
1848  CSeqFeatData::ESubtype feat1_subtype = f1.GetData().GetSubtype();
1849  CSeqFeatData::ESubtype feat2_subtype = f2.GetData().GetSubtype();
1850 
1851  // not duplicates if not the same subtype
1852  if (feat1_subtype != feat2_subtype) {
1853  return eDuplicate_Not;
1854  }
1855 
1856  // locations
1857  const CSeq_loc& feat1_loc = f1.GetLocation();
1858  const CSeq_loc& feat2_loc = f2.GetLocation();
1859 
1860  // not duplicates if not the same location and strand
1861  if (!s_IsSameStrand(feat1_loc, feat2_loc, f1.GetScope()) ||
1862  sequence::Compare(feat1_loc, feat2_loc, &(f1.GetScope()),
1864  return eDuplicate_Not;
1865  }
1866 
1867  // same annot?
1868  bool diff_annot_desc = false;
1869  bool same_annot = s_IsSameSeqAnnot(f1, f2, diff_annot_desc);
1870 
1871  if (diff_annot_desc) {
1872  // don't report if features on different annots with different titles or names
1873  return eDuplicate_Not;
1874  }
1875 
1876  // compare labels and comments
1877  bool same_label = s_AreFeatureLabelsSame (f1, f2, case_sensitive);
1878 
1879  // compare dbxrefs
1880  bool different_dbxrefs = (f1.IsSetDbxref() && f2.IsSetDbxref() &&
1882 
1883  if ( feat1_subtype == CSeqFeatData::eSubtype_region && different_dbxrefs) {
1884  return eDuplicate_Not;
1885  }
1886 
1887  // check for frame difference
1888  bool full_length_coding_regions_with_different_frames =
1890  if (!same_label && full_length_coding_regions_with_different_frames) {
1891  // do not report if both coding regions are full length, have different products,
1892  // and have different frames
1893  return eDuplicate_Not;
1894  }
1895 
1896  if ((feat1_subtype == CSeqFeatData::eSubtype_variation && !same_label) || s_AreDifferentVariations(f1, f2)) {
1897  // don't report variations if replace quals are different or labels are different
1898  return eDuplicate_Not;
1899  }
1900 
1901 
1903  // do not report if features are coding regions linked to different mRNAs
1904  return eDuplicate_Not;
1905  }
1906 
1907 
1909  // do not report if features are mRNAs linked to different coding regions
1910  return eDuplicate_Not;
1911  }
1912 
1913 
1914  // only report pubs if they have the same label
1915  if (feat1_subtype == CSeqFeatData::eSubtype_pub && !same_label) {
1916  return eDuplicate_Not;
1917  }
1918 
1919  bool partials_ok = (!check_partials || PartialsSame(feat1_loc, feat2_loc));
1920 
1921  if (!partials_ok) {
1922  return eDuplicate_Not;
1923  }
1924 
1925  if ( same_annot ) {
1926  if (same_label) {
1927  dup_type = eDuplicate_Duplicate;
1928  } else {
1930  }
1931  } else {
1932  if (same_label) {
1934  } else if ( feat2_subtype != CSeqFeatData::eSubtype_pub ) {
1936  }
1937  }
1938 
1939  return dup_type;
1940 }
1941 
1942 // specific-host functions
1943 
1945 {
1946  bool is_common = false;
1947 
1948  if (data.IsSetStatus()) {
1949  ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1950  if ((*status_it)->IsSetProperty()
1951  && NStr::Equal((*status_it)->GetProperty(), "old_name_class", NStr::eNocase)) {
1952  if ((*status_it)->IsSetValue() && (*status_it)->GetValue().IsStr()) {
1953  string value_str = (*status_it)->GetValue().GetStr();
1954  if (NStr::Equal(value_str, "common name", NStr::eCase)
1955  || NStr::Equal(value_str, "genbank common name", NStr::eCase)) {
1956  is_common = true;
1957  break;
1958  }
1959  }
1960  }
1961  }
1962  }
1963  return is_common;
1964 }
1965 
1967 {
1968  bool has_misspell_flag = false;
1969 
1970  if (data.IsSetStatus()) {
1971  ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1972  if ((*status_it)->IsSetProperty()) {
1973  string prop = (*status_it)->GetProperty();
1974  if (NStr::EqualNocase(prop, "misspelled_name")) {
1975  has_misspell_flag = true;
1976  break;
1977  }
1978  }
1979  }
1980  }
1981  return has_misspell_flag;
1982 }
1983 
1984 
1985 bool FindMatchInOrgRef (const string& str, const COrg_ref& org)
1986 {
1987  string match;
1988 
1989  if (NStr::IsBlank(str)) {
1990  // do nothing;
1991  } else if (org.IsSetTaxname() && NStr::EqualNocase(str, org.GetTaxname())) {
1992  match = org.GetTaxname();
1993  } else if (org.IsSetCommon() && NStr::EqualNocase(str, org.GetCommon())) {
1994  match = org.GetCommon();
1995  } else {
1996  FOR_EACH_SYN_ON_ORGREF (syn_it, org) {
1997  if (NStr::EqualNocase(str, *syn_it)) {
1998  match = *syn_it;
1999  break;
2000  }
2001  }
2002  if (NStr::IsBlank(match) && org.IsSetOrgname()) {
2003  const COrgName& orgname = org.GetOrgname();
2004  if (orgname.IsSetMod()) {
2005  for (const auto& mod_it : orgname.GetMod()) {
2006  if (mod_it->IsSetSubtype()
2007  && (mod_it->GetSubtype() == COrgMod::eSubtype_gb_synonym
2008  || mod_it->GetSubtype() == COrgMod::eSubtype_old_name)
2009  && mod_it->IsSetSubname()
2010  && NStr::EqualNocase(str, mod_it->GetSubname())) {
2011  match = mod_it->GetSubname();
2012  break;
2013  }
2014  }
2015  }
2016  }
2017  }
2018  return NStr::EqualCase(str, match);
2019 }
2020 
2021 
2022 static const string sIgnoreHostWordList[] = {
2023  " cf.",
2024  " cf ",
2025  " aff ",
2026  " aff.",
2027  " near",
2028  " nr.",
2029  " nr ",
2030 };
2031 
2032 void AdjustSpecificHostForTaxServer (string& spec_host)
2033 {
2034  for (unsigned i = 0; i < ArraySize(sIgnoreHostWordList); ++i) {
2035  NStr::ReplaceInPlace(spec_host, sIgnoreHostWordList[i], " ");
2036  }
2037  NStr::ReplaceInPlace(spec_host, " ", " ");
2038  NStr::TruncateSpacesInPlace(spec_host);
2039 }
2040 
2041 
2042 string SpecificHostValueToCheck(const string& val)
2043 {
2044  if (NStr::IsBlank(val)) {
2045  return val;
2046 #if 0
2047  } else if (! isupper (val.c_str()[0])) {
2048  return kEmptyStr;
2049 #endif
2050  }
2051 
2052  string host = val;
2053  // ignore portion after semicolon
2054  size_t pos = NStr::Find(host, ";");
2055  if (pos != string::npos) {
2056  host = host.substr(0, pos);
2057  }
2059  // must have at least two words to check
2060  pos = NStr::Find(host, " "); // combine with next line
2061  if (pos == string::npos) {
2062  return kEmptyStr;
2063  }
2064 
2066  pos = NStr::Find(host, " ");
2067  if (NStr::StartsWith(host.substr(pos + 1), "hybrid ")) {
2068  pos += 7;
2069  } else if (NStr::StartsWith(host.substr(pos + 1), "x ")) {
2070  pos += 2;
2071  }
2072  if (! NStr::StartsWith(host.substr(pos + 1), "sp.")
2073  && ! NStr::StartsWith(host.substr(pos + 1), "(")) {
2074  pos = NStr::Find(host, " ", pos + 1);
2075  if (pos != string::npos) {
2076  host = host.substr(0, pos);
2077  }
2078  } else {
2079  host = host.substr(0, pos);
2080  }
2081  return host;
2082 }
2083 
2084 
2085 string InterpretSpecificHostResult(const string& host, const CT3Reply& reply, const string& orig_host)
2086 {
2087  string err_str;
2088  if (reply.IsError()) {
2089  err_str = "?";
2090  if (reply.GetError().IsSetMessage()) {
2091  err_str = reply.GetError().GetMessage();
2092  }
2093  if(NStr::FindNoCase(err_str, "ambiguous") != string::npos) {
2094  err_str = "Specific host value is ambiguous: " +
2095  (NStr::IsBlank(orig_host) ? host : orig_host);
2096  } else {
2097  err_str = "Invalid value for specific host: " +
2098  (NStr::IsBlank(orig_host) ? host : orig_host);
2099  }
2100  } else if (reply.IsData()) {
2101  const auto& rdata = reply.GetData();
2102  if (HasMisSpellFlag(rdata)) {
2103  err_str = "Specific host value is misspelled: " +
2104  (NStr::IsBlank(orig_host) ? host : orig_host);
2105  } else if (rdata.IsSetOrg()) {
2106  const auto& org = rdata.GetOrg();
2107  if (NStr::StartsWith(org.GetTaxname(), host)) {
2108  // do nothing, all good
2109  } else if (IsCommonName(rdata)) {
2110  // not actionable
2111  } else if (FindMatchInOrgRef(host, org)) {
2112  // replace with synonym
2113  err_str = "Specific host value is alternate name: " +
2114  orig_host + " should be " +
2115  org.GetTaxname();
2116  } else {
2117  err_str = "Specific host value is incorrectly capitalized: " +
2118  (NStr::IsBlank(orig_host) ? host : orig_host);
2119  }
2120  } else {
2121  err_str = "Invalid value for specific host: " +
2122  (NStr::IsBlank(orig_host) ? host : orig_host);
2123  }
2124  }
2125  return err_str;
2126 }
2127 
2128 
2129 bool IsCommon(const COrg_ref& org, const string& val)
2130 {
2131  bool is_common = false;
2132  if (org.IsSetCommon() && NStr::EqualNocase(val, org.GetCommon())) {
2133  // common name, not genus
2134  is_common = true;
2135  } else if (org.IsSetOrgMod()) {
2136  for (auto& it : org.GetOrgname().GetMod()) {
2137  if (it->IsSetSubtype() &&
2138  it->GetSubtype() == COrgMod::eSubtype_common &&
2139  it->IsSetSubname() &&
2140  NStr::EqualNocase(it->GetSubname(), val)) {
2141  is_common = true;
2142  break;
2143  }
2144  }
2145  }
2146  return is_common;
2147 }
2148 
2149 
2150 bool IsLikelyTaxname(const string& val)
2151 {
2152  if (val.empty() || !isalpha(val.front())) {
2153  return false;
2154  }
2155  size_t pos = NStr::Find(val, " ");
2156  if (pos == NPOS) {
2157  return false;
2158  }
2159 
2160  CTaxon1 taxon1;
2161  taxon1.Init();
2162  TTaxId taxid = taxon1.GetTaxIdByName(val.substr(0, pos));
2163  if (taxid == ZERO_TAX_ID || taxid == INVALID_TAX_ID) {
2164  return false;
2165  }
2166 
2167  bool is_species = false;
2168  bool is_uncultured = false;
2169  string blast_name;
2170 
2171  CConstRef<COrg_ref> org = taxon1.GetOrgRef(taxid, is_species, is_uncultured, blast_name);
2172  if (org && IsCommon(*org, val.substr(0, pos))) {
2173  return false;
2174  } else {
2175  return true;
2176  }
2177 }
2178 
2179 
2180 //LCOV_EXCL_START
2181 //not used by asnvalidate but used by other applications
2182 bool IsSpecificHostValid(const string& val, string& error_msg)
2183 {
2185  return tval.IsOneSpecificHostValid(val, error_msg);
2186 }
2187 
2188 
2189 string FixSpecificHost(const string& val)
2190 {
2191  string hostfix = val;
2192  validator::CTaxValidationAndCleanup tval;
2193  tval.FixOneSpecificHost(hostfix);
2194 
2195  return hostfix;
2196 }
2197 
2198 
// Map one character for ConvertToEntrezTerm:
//   - values below 0x02 or above 0x7F are returned unchanged
//   - letters are lower-cased
//   - digits and the characters ' / @ ` , are returned unchanged
//   - everything else becomes a space
static char s_ConvertChar(char ch)
{
    if (ch < 0x02 || ch > 0x7F) {
        return ch;
    }
    if (isalpha(ch)) {
        return static_cast<char>(tolower(ch));
    }
    if (isdigit(ch)) {
        return ch;
    }
    switch (ch) {
    case '\'':
    case '/':
    case '@':
    case '`':
    case ',':
        return ch;
    default:
        return 0x20;
    }
}
2218 
2219 
2220 void ConvertToEntrezTerm(string& title)
2221 {
2222  string::iterator s = title.begin();
2223  char p = ' ';
2224  while (s != title.end()) {
2225  *s = s_ConvertChar(*s);
2226  if (isspace(*s) && isspace(p)) {
2227  s = title.erase(s);
2228  }
2229  else {
2230  p = *s;
2231  ++s;
2232  }
2233  }
2235 }
2236 //LCOV_EXCL_STOP
2237 
2238 
2240 {
2241  if (!cdr.IsSetCode()) {
2242  return;
2243  }
2244  const auto& gcode = cdr.GetCode();
2245  CGenetic_code::C_E::TId genCode = 0;
2246  for (auto& it : gcode.Get()) {
2247  if (it->IsId()) {
2248  genCode = it->GetId();
2249  }
2250  }
2251 
2252  if (genCode == 7) {
2253  genCode = 4;
2254  } else if (genCode == 8) {
2255  genCode = 1;
2256  } else if (genCode == 0) {
2257  genCode = 1;
2258  }
2259  cdr.ResetCode();
2261  new_code->SetId(genCode);
2262  cdr.SetCode().Set().push_back(new_code);
2263 }
2264 
2265 
2266 string TranslateCodingRegionForValidation(const CSeq_feat& feat, CScope &scope, bool& alt_start)
2267 {
2268  string transl_prot;
2269  CRef<CSeq_feat> tmp_cds(new CSeq_feat());
2270  tmp_cds->Assign(feat);
2271  FixGeneticCode(tmp_cds->SetData().SetCdregion());
2272  const CCdregion& cdregion = tmp_cds->GetData().GetCdregion();
2273  const CSeq_loc& cds_loc = tmp_cds->GetLocation();
2274  if (cds_loc.IsWhole()) {
2275  CBioseq_Handle bsh = scope.GetBioseqHandle(cds_loc.GetWhole());
2276  if (!bsh) {
2277  return kEmptyStr;
2278  }
2279  CSeq_loc::TPoint start = 0;
2280  if (cdregion.IsSetFrame()) {
2281  if (cdregion.GetFrame() == 2) {
2282  start = 1;
2283  } else if (cdregion.GetFrame() == 3) {
2284  start = 2;
2285  }
2286  }
2287  const CGenetic_code* genetic_code = nullptr;
2288  if (cdregion.IsSetCode()) {
2289  genetic_code = &(cdregion.GetCode());
2290  }
2291  CRef<CSeq_id> id(new CSeq_id());
2292  id->Assign(cds_loc.GetWhole());
2293  CRef<CSeq_loc> tmp(new CSeq_loc(*id, start, bsh.GetInst_Length() - 1));
2294  CSeqTranslator::Translate(*tmp, scope, transl_prot, genetic_code, true, false, &alt_start);
2295  } else {
2297  *tmp_cds, scope, transl_prot,
2298  true, // include stop codons
2299  false, // do not remove trailing X/B/Z
2300  &alt_start);
2301  }
2302 
2303  return transl_prot;
2304 }
2305 
2306 
2307 bool HasBadStartCodon(const CSeq_loc& loc, const string& transl_prot)
2308 {
2309  bool got_dash = (transl_prot[0] == '-');
2310  bool got_x = (transl_prot[0] == 'X'
2312 
2313  if (!got_dash && !got_x) {
2314  return false;
2315  }
2316  return true;
2317 }
2318 
2319 
2320 static const char * kUnclassifiedTranslationDiscrepancy = "unclassified translation discrepancy";
2321 
2322 static const char* const sc_BypassCdsTransCheckText[] = {
2323  "RNA editing",
2324  "adjusted for low-quality genome",
2325  "annotated by transcript or proteomic data",
2326  "rearrangement required for product",
2327  "reasons given in citation",
2328  "translated product replaced",
2330 };
2333 
2334 static const char* const sc_ForceCdsTransCheckText[] = {
2335  "artificial frameshift",
2336  "mismatches in translation"
2337 };
2340 
2341 bool ReportTranslationErrors(const string& except_text)
2342 {
2343  bool report = true;
2344  ITERATE(TBypassCdsTransCheckSet, it, sc_BypassCdsTransCheck) {
2345  if (NStr::FindNoCase(except_text, *it) != NPOS) {
2346  report = false;
2347  }
2348  }
2349  if (!report) {
2350  ITERATE(TForceCdsTransCheckSet, it, sc_ForceCdsTransCheck) {
2351  if (NStr::FindNoCase(except_text, *it) != NPOS) {
2352  report = true;
2353  }
2354  }
2355  }
2356  return report;
2357 }
2358 
2359 
2360 //LCOV_EXCL_START
2361 //not used by asnvalidate but used by other applications
2362 bool HasBadStartCodon(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2363 {
2364  if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2365  return false;
2366  }
2367  // do not validate for pseudo gene
2368  FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2369  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2370  return false;
2371  }
2372  }
2373 
2374  if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2375  feat.CanGetExcept_text()) {
2376  if (!ReportTranslationErrors(feat.GetExcept_text())) {
2377  return false;
2378  }
2379  }
2380 
2381  bool alt_start = false;
2382  string transl_prot;
2383  try {
2384  transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2385  } catch (CException& ) {
2386  return false;
2387  }
2388  return HasBadStartCodon(feat.GetLocation(), transl_prot);
2389 }
2390 //LCOV_EXCL_STOP
2391 
2392 
2393 size_t CountInternalStopCodons(const string& transl_prot)
2394 {
2395  if (NStr::IsBlank(transl_prot)) {
2396  return 0;
2397  }
2398  // count internal stops and Xs
2399  size_t internal_stop_count = 0;
2400 
2401  ITERATE(string, it, transl_prot) {
2402  if (*it == '*') {
2403  ++internal_stop_count;
2404  }
2405  }
2406  // if stop at end, reduce count by one (since one of the stops counted isn't internal)
2407  if (transl_prot[transl_prot.length() - 1] == '*') {
2408  --internal_stop_count;
2409  }
2410  return internal_stop_count;
2411 }
2412 
2413 
2414 //LCOV_EXCL_START
2415 //not used by asnvalidate but used by other applications
2416 bool HasInternalStop(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2417 {
2418  if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2419  return false;
2420  }
2421  // do not validate for pseudo gene
2422  FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2423  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2424  return false;
2425  }
2426  }
2427 
2428  if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2429  feat.CanGetExcept_text()) {
2430  const string& except_text = feat.GetExcept_text();
2431  if (NStr::Find(except_text, kUnclassifiedTranslationDiscrepancy) == string::npos
2433  return false;
2434  }
2435  }
2436 
2437  bool alt_start = false;
2438  string transl_prot;
2439  try {
2440  transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2441  } catch (CException& ) {
2442  return false;
2443  }
2444 
2445  size_t internal_stop_codons = CountInternalStopCodons(transl_prot);
2446  if (internal_stop_codons > 0) {
2447  return true;
2448  } else {
2449  return false;
2450  }
2451 }
2452 //LCOV_EXCL_STOP
2453 
2454 
2456 {
2458  CSeq_data::E_Choice seqtyp = bsh.GetInst().IsSetSeq_data() ?
2460  if (seqtyp == CSeq_data::e_Ncbieaa || seqtyp == CSeq_data::e_Ncbistdaa) {
2462  }
2463  return sv;
2464 }
2465 
2466 
2468 {
2469  if (sv.size() < 1) {
2470  return false;
2471  } else if (sv.IsInGap(0) || sv[0] == '-') {
2472  return true;
2473  } else {
2474  return false;
2475  }
2476 }
2477 
2478 
2479 //LCOV_EXCL_START
2480 //not used by asnvalidate but used by other applications
2481 bool HasBadProteinStart(const CSeq_feat& feat, CScope& scope)
2482 {
2483  if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2484  !feat.IsSetProduct()) {
2485  return false;
2486  }
2487  // use try catch for those weird situations where the product is
2488  // not specified as a single product sequence (in which case we
2489  // should just skip this test)
2490  try {
2491  CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2492  if (!bsh.IsAa()) {
2493  return false;
2494  }
2496  return HasBadProteinStart(*sv);
2497  } catch (CException& ) {
2498  return false;
2499  }
2500 }
2501 //LCOV_EXCL_STOP
2502 
2503 
2505 {
2506  size_t terminations = 0;
2507 
2508  for (CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter) {
2509  if (*sv_iter == '*') {
2510  terminations++;
2511  }
2512  }
2513  return terminations;
2514 }
2515 
2516 
2517 //LCOV_EXCL_START
2518 //not used by asnvalidate but used by other applications
2519 bool HasStopInProtein(const CSeq_feat& feat, CScope& scope)
2520 {
2521  if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2522  !feat.IsSetProduct()) {
2523  return false;
2524  }
2525  // use try catch for those weird situations where the product is
2526  // not specified as a single product sequence (in which case we
2527  // should just skip this test)
2528  try {
2529  CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2530  if (!bsh.IsAa()) {
2531  return false;
2532  }
2534  if (CountProteinStops(*sv) > 0) {
2535  return true;
2536  } else {
2537  return false;
2538  }
2539  } catch (CException& ) {
2540  return false;
2541  }
2542 }
2543 //LCOV_EXCL_STOP
2544 
2545 
2546 void FeatureHasEnds(const CSeq_feat& feat, CScope* scope, bool& no_beg, bool& no_end)
2547 {
2548  unsigned int part_loc = sequence::SeqLocPartialCheck(feat.GetLocation(), scope);
2549  no_beg = false;
2550  no_end = false;
2551 
2552  if (part_loc & sequence::eSeqlocPartial_Start) {
2553  no_beg = true;
2554  }
2555  if (part_loc & sequence::eSeqlocPartial_Stop) {
2556  no_end = true;
2557  }
2558 
2559 
2560  if ((!no_beg || !no_end) && feat.IsSetProduct()) {
2561  unsigned int part_prod = sequence::SeqLocPartialCheck(feat.GetProduct(), scope);
2562  if (part_prod & sequence::eSeqlocPartial_Start) {
2563  no_beg = true;
2564  }
2565  if (part_prod & sequence::eSeqlocPartial_Stop) {
2566  no_end = true;
2567  }
2568  }
2569 }
2570 
2571 
2572 //LCOV_EXCL_START
2573 // not used by asnvalidate but needed for other applications
2574 CBioseq_Handle GetCDSProductSequence(const CSeq_feat& feat, CScope* scope, const CTSE_Handle & tse, bool far_fetch, bool& is_far)
2575 {
2576  CBioseq_Handle prot_handle;
2577  is_far = false;
2578  if (!feat.IsSetProduct()) {
2579  return prot_handle;
2580  }
2581  const CSeq_id* protid = nullptr;
2582  try {
2583  protid = &sequence::GetId(feat.GetProduct(), scope);
2584  } catch (CException&) {}
2585  if (protid) {
2586  prot_handle = scope->GetBioseqHandleFromTSE(*protid, tse);
2587  if (!prot_handle && far_fetch) {
2588  prot_handle = scope->GetBioseqHandle(*protid);
2589  is_far = true;
2590  }
2591  }
2592  return prot_handle;
2593 }
2594 //LCOV_EXCL_STOP
2595 
2596 
2597 void CalculateEffectiveTranslationLengths(const string& transl_prot, const CSeqVector& prot_vec, size_t &len, size_t& prot_len)
2598 {
2599  len = transl_prot.length();
2600  prot_len = prot_vec.size();
2601 
2602  if (NStr::EndsWith(transl_prot, "*") && (len == prot_len + 1)) { // ok, got stop
2603  --len;
2604  }
2605  while (len > 0) {
2606  if (transl_prot[len - 1] == 'X') { //remove terminal X
2607  --len;
2608  } else {
2609  break;
2610  }
2611  }
2612 
2613  // ignore terminal 'X' from partial last codon if present
2614  while (prot_len > 0) {
2615  if (prot_vec[(TSeqPos)prot_len - 1] == 'X') { //remove terminal X
2616  --prot_len;
2617  } else {
2618  break;
2619  }
2620  }
2621 }
2622 
2623 
2624 //LCOV_EXCL_START
2625 // not used by asnvalidate but needed for other applications
2626 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot)
2627 {
2628  vector<TSeqPos> mismatches;
2629  size_t prot_len;
2630  size_t len;
2631 
2632  CalculateEffectiveTranslationLengths(transl_prot, prot_vec, len, prot_len);
2633 
2634  if (len == prot_len) { // could be identical
2635  for (TSeqPos i = 0; i < len; ++i) {
2636  CSeqVectorTypes::TResidue p_res = prot_vec[i];
2637  CSeqVectorTypes::TResidue t_res = transl_prot[i];
2638 
2639  if (t_res != p_res) {
2640  if (i == 0) {
2641  bool no_beg, no_end;
2642  FeatureHasEnds(feat, &(prot_vec.GetScope()), no_beg, no_end);
2643  if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
2644  } else if (t_res == '-') {
2645  } else {
2646  mismatches.push_back(i);
2647  }
2648  } else {
2649  mismatches.push_back(i);
2650  }
2651  }
2652  }
2653  }
2654  return mismatches;
2655 }
2656 
2657 
2658 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CBioseq_Handle& prot_handle, const string& transl_prot)
2659 {
2660  vector<TSeqPos> mismatches;
2661  // can't check for mismatches unless there is a product
2662  if (!prot_handle || !prot_handle.IsAa()) {
2663  return mismatches;
2664  }
2665 
2666  CSeqVector prot_vec = prot_handle.GetSeqVector();
2667  prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
2668 
2669  return GetMismatches(feat, prot_vec, transl_prot);
2670 }
2671 
2672 
2673 bool HasNoStop(const CSeq_feat& feat, CScope* scope)
2674 {
2675  bool no_beg, no_end;
2676  FeatureHasEnds(feat, scope, no_beg, no_end);
2677  if (no_end) {
2678  return false;
2679  }
2680 
2681  string transl_prot;
2682  bool alt_start;
2683  try {
2684  transl_prot = TranslateCodingRegionForValidation(feat, *scope, alt_start);
2685  } catch (CException& ) {
2686  }
2687  if (NStr::EndsWith(transl_prot, "*")) {
2688  return false;
2689  }
2690 
2691  bool show_stop = true;
2692  if (!no_beg && feat.IsSetPartial() && feat.GetPartial()) {
2693  CBioseq_Handle prot_handle;
2694  try {
2695  CBioseq_Handle bsh = scope->GetBioseqHandle(feat.GetLocation());
2696  const CTSE_Handle tse = bsh.GetTSE_Handle();
2697  bool is_far = false;
2698  prot_handle = GetCDSProductSequence(feat, scope, tse, true, is_far);
2699  if (prot_handle) {
2700  vector<TSeqPos> mismatches = GetMismatches(feat, prot_handle, transl_prot);
2701  if (mismatches.size() == 0) {
2702  show_stop = false;
2703  }
2704  }
2705  } catch (CException& ) {
2706  }
2707  }
2708 
2709  return show_stop;
2710 }
2711 //LCOV_EXCL_STOP
2712 
2713 
// Reports whether the sequence identified by id can be found.
// With a non-null scope the answer is true when scope->GetIds(idh) returns
// at least one id. With a null scope a separate scope object (scopex) is
// used after AddDefaults() has been called on it, and the answer is true
// when that scope yields a valid Bioseq handle.
// Any CException or std::exception is swallowed; the function then returns
// whatever `fetchable` holds at that point (false unless already set).
2714 bool IsSequenceFetchable(const CSeq_id& id, CScope* scope)
2715 {
2716  bool fetchable = false;
2717  try {
2718  if (scope) {
 // NOTE(review): one line is skipped in this listing here; presumably it
 // declares `idh` from `id` -- confirm against the full source.
2720  CScope::TIds ids = scope->GetIds(idh);
2721  if (ids.size() > 0) {
2722  fetchable = true;
2723  }
2724  } else {
 // NOTE(review): two lines are skipped in this listing here; presumably
 // they set up `idh` and create `scopex` -- confirm against the full source.
2727  scopex->AddDefaults();
2728  CBioseq_Handle bsh = scopex->GetBioseqHandle(idh);
2729  if (bsh) {
2730  fetchable = true;
2731  }
2732  }
2733  } catch (CException& ) {
2734  } catch (std::exception &) {
2735  }
2736  return fetchable;
2737 }
2738 
2739 
2740 bool IsSequenceFetchable(const string& seq_id, CScope* scope)
2741 {
2742  bool fetchable = false;
2743  try {
2744  CRef<CSeq_id> id(new CSeq_id(seq_id));
2745  if (id) {
2746  fetchable = IsSequenceFetchable(*id, scope);
2747  }
2748  } catch (CException& ) {
2749  } catch (std::exception &) {
2750  }
2751  return fetchable;
2752 }
2753 
2754 
2755 bool IsNTNCNWACAccession(const string& acc)
2756 {
2757  if (NStr::StartsWith(acc, "NT_") || NStr::StartsWith(acc, "NC_") ||
2758  NStr::StartsWith(acc, "AC_") || NStr::StartsWith(acc, "NW_")) {
2759  return true;
2760  } else {
2761  return false;
2762  }
2763 }
2764 
2765 
2767 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads a parameter named `id`.
 //
 // True only when id.IsOther() holds, the accession of id.GetOther() is set,
 // and that accession satisfies the string overload of IsNTNCNWACAccession.
2768  if (id.IsOther() && id.GetOther().IsSetAccession() &&
2769  IsNTNCNWACAccession(id.GetOther().GetAccession())) {
2770  return true;
2771  } else {
2772  return false;
2773  }
2774 }
2775 
2776 
2778 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads a parameter named `seq`.
 //
 // Iterates over the Seq-ids of seq and returns true as soon as one of them
 // satisfies IsNTNCNWACAccession; returns false when none does.
2779  bool is_it = false;
2780  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2781  if (IsNTNCNWACAccession(**id_it)) {
2782  is_it = true;
2783  break;
2784  }
2785  }
2786  return is_it;
2787 }
2788 
2789 
2790 bool IsNG(const CSeq_id& id)
2791 {
2792  if (id.IsOther() && id.GetOther().IsSetAccession() &&
2793  NStr::StartsWith(id.GetOther().GetAccession(), "NG_")) {
2794  return true;
2795  } else {
2796  return false;
2797  }
2798 }
2799 
2800 
2801 bool IsNG(const CBioseq& seq)
2802 {
2803  bool is_it = false;
2804  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2805  if (IsNG(**id_it)) {
2806  is_it = true;
2807  break;
2808  }
2809  }
2810  return is_it;
2811 }
2812 
2813 
2814 // See VR-728. These Seq-ids are temporary and will be stripped
2815 // by the ID Load process, so they should not be the only Seq-id
2816 // on a Bioseq, and feature locations should not use these.
2817 bool IsTemporary(const CSeq_id& id)
2818 {
2819  if (id.IsGeneral() && id.GetGeneral().IsSetDb()) {
2820  const string& db = id.GetGeneral().GetDb();
2821  if (NStr::EqualNocase(db, "TMSMART") ||
2822  NStr::EqualNocase(db, "NCBIFILE") ||
2823  NStr::EqualNocase(db, "BankIt")) {
2824  return true;
2825  }
2826  }
2827  return false;
2828 }
2829 
2830 
// Maps a genome value to "is an organelle": true for the values listed as
// case labels of the switch, false for everything else.
2831 bool IsOrganelle(int genome)
2832 {
2833  bool rval = false;
2834  switch (genome) {
 // NOTE(review): the case labels that select this branch are not visible in
 // this listing (several lines are skipped here) -- see the full source for
 // the exact set of genome values treated as organelles.
2847  rval = true;
2848  break;
2849  default:
2850  rval = false;
2851  break;
2852  }
2853  return rval;
2854 }
2855 
2856 
// Reports whether a Bioseq belongs to an organelle: true when `sd` is valid,
// its source has a genome value set, and IsOrganelle(int) accepts that value.
// Returns false for an invalid handle.
2857 bool IsOrganelle(const CBioseq_Handle& seq)
2858 {
2859  if (!seq) {
2860  return false;
2861  }
2862  bool rval = false;
 // NOTE(review): the statement that declares `sd` is not visible in this
 // listing (one line is skipped here); presumably it looks up a source
 // descriptor for seq -- confirm against the full source.
2864  if (sd && sd->GetSource().IsSetGenome() && IsOrganelle(sd->GetSource().GetGenome())) {
2865  rval = true;
2866  }
2867  return rval;
2868 }
2869 
2870 
2872 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads a parameter named `ch`.
 //
 // True when ch occurs in "ANRMWHVD" (case-sensitive). These look like the
 // IUPAC nucleotide codes whose base set includes A -- presumed, confirm.
 // strchr also matches the string's terminating NUL, so ch == '\0' yields true.
2873  return (bool)(strchr("ANRMWHVD", ch) != NULL);
2874 }
2875 
2877 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads a parameter named `ch`.
 //
 // True when ch occurs in "CNYMSHBV" (case-sensitive). These look like the
 // IUPAC nucleotide codes whose base set includes C -- presumed, confirm.
 // strchr also matches the string's terminating NUL, so ch == '\0' yields true.
2878  return (bool)(strchr("CNYMSHBV", ch) != NULL);
2879 }
2880 
2882 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads a parameter named `ch`.
 //
 // True when ch occurs in "GNRKSBVD" (case-sensitive). These look like the
 // IUPAC nucleotide codes whose base set includes G -- presumed, confirm.
 // strchr also matches the string's terminating NUL, so ch == '\0' yields true.
2883  return (bool)(strchr("GNRKSBVD", ch) != NULL);
2884 }
2885 
2887 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads a parameter named `ch`.
 //
 // True when ch occurs in "TNYKWHBD" (case-sensitive). These look like the
 // IUPAC nucleotide codes whose base set includes T -- presumed, confirm.
 // strchr also matches the string's terminating NUL, so ch == '\0' yields true.
2888  return (bool)(strchr("TNYKWHBD", ch) != NULL);
2889 }
2890 
2891 
2892 //LCOV_EXCL_START
2893 //not used by validator, but used by Genome Workbench menu item for
2894 //removing unnecessary exceptions
// Evaluates a coding region with a CCDSTranslationProblems object, passing
// false for every boolean option. The product handle is looked up in scope
// when the feature has a product and is left empty otherwise.
2895 bool DoesCodingRegionHaveUnnecessaryException(const CSeq_feat& feat, const CBioseq_Handle& loc_handle, CScope& scope)
2896 {
2897  CCDSTranslationProblems problems;
2898  CBioseq_Handle prot_handle;
2899  if (feat.IsSetProduct()) {
2900  prot_handle = scope.GetBioseqHandle(feat.GetProduct());
2901  }
2902 
 // NOTE(review): the line that opens this call is not visible in this
 // listing; presumably it is problems.CalculateTranslationProblems( -- the
 // argument roles noted below are taken from that function's declaration and
 // should be confirmed against the full source.
2904  feat,
2905  loc_handle,
2906  prot_handle,
2907  false, // ignore_exceptions
2908  false, // far_fetch_cds
2909  false, // standalone_annot
2910  false, // single_seq
2911  false, // is_gpipe
2912  false, // is_genomic
2913  false, // is_refseq
2914  false, // is_nt_or_ng_or_nw
2915  false, // is_nc
2916  false, // has_accession
2917  &scope);
2918 
 // NOTE(review): the return statement is not visible in this listing (one
 // line is skipped here); presumably it tests the computed problem flags for
 // the "unnecessary exception" bit -- confirm against the full source.
2920 }
2921 
2922 
2924 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing. DoesFeatureHaveUnnecessaryException calls
 // DoesmRNAHaveUnnecessaryException(feat, bsh, scope), which presumably is
 // this function -- confirm against the full source.
 //
 // Runs GetMRNATranslationProblems on feat and returns whether the
 // eMRNAProblem_UnnecessaryException bit is set in the result. The value
 // written to `mismatches` is not used afterwards.
2925  size_t mismatches = 0;
 // NOTE(review): the declaration of `rna` is not visible in this listing
 // (one line is skipped here); it is assigned only when feat has a product.
2927  if (feat.IsSetProduct()) {
2928  rna = scope.GetBioseqHandle(feat.GetProduct());
2929  }
2930 
2931  size_t problems = GetMRNATranslationProblems
2932  (feat, mismatches, false,
2933  nuc, rna, false, false, false, &scope);
2934 
2935  return (problems & eMRNAProblem_UnnecessaryException);
2936 }
2938 
2940 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads parameters named `feat` and `scope`.
 //
 // Reports whether the exception on a feature is unnecessary.
 // Returns false when the feature has no exception text, no data or no
 // location, when its location does not resolve to a Bioseq, or when a
 // CException is thrown. Otherwise the splice check is consulted first, then
 // the coding-region or mRNA specific check depending on the feature type;
 // every other feature type yields false.
2941  if (!feat.IsSetExcept_text()) {
2942  return false;
2943  }
2944  if (!feat.IsSetData()) {
2945  return false;
2946  }
2947  if (!feat.IsSetLocation()) {
2948  return false;
2949  }
2950  try {
2951  CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetLocation());
2952  if (!bsh) {
2953  return false;
2954  }
2955  CSpliceProblems splice_problems;
 // second argument is check_all, third is the pseudo flag
2956  splice_problems.CalculateSpliceProblems(feat, true, sequence::IsPseudo(feat, scope), bsh);
2957  if (splice_problems.IsExceptionUnnecessary()) {
2958  return true;
2959  }
2960  if (feat.GetData().IsCdregion()) {
2961  return DoesCodingRegionHaveUnnecessaryException(feat, bsh, scope);
2962  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
2963  return DoesmRNAHaveUnnecessaryException(feat, bsh, scope);
2964  } else {
2965  return false;
2966  }
2967  } catch (CException&) {
 // swallowed on purpose: falls through to the final return false
2968  }
2969  return false;
2970 }
2971 //LCOV_EXCL_STOP
2972 
2973 static bool s_IsGenbankMasterAccession(const string& acc)
2974 {
2975  bool rval = false;
2976  switch (acc.length()) {
2977  case 12:
2978  if (NStr::EndsWith(acc, "000000")) {
2979  rval = true;
2980  }
2981  break;
2982  case 13:
2983  if (NStr::EndsWith(acc, "0000000")) {
2984  rval = true;
2985  }
2986  break;
2987  case 14:
2988  if (NStr::EndsWith(acc, "00000000")) {
2989  rval = true;
2990  }
2991  break;
2992  default:
2993  break;
2994  }
2995  return rval;
2996 }
2997 
2998 
3000 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads a parameter named `id`.
 //
 // Reports whether a Seq-id carries an accession with the trailing-zero
 // pattern checked below. "Other" ids use their own length/suffix table;
 // Genbank, Ddbj, Embl and Tpg ids defer to s_IsGenbankMasterAccession.
 // Ids of any other type, or without an accession, yield false.
3001  bool rval = false;
3002  switch (id.Which()) {
3003  case CSeq_id::e_Other:
3004  if (id.GetOther().IsSetAccession()) {
3005  const string& acc = id.GetOther().GetAccession();
3006  switch (acc.length()) {
3007  case 15:
3008  if (NStr::EndsWith(acc, "000000")) {
3009  rval = true;
3010  }
3011  break;
 // lengths 16 and 17 share the same seven-zero suffix test
3012  case 16:
3013  case 17:
3014  if (NStr::EndsWith(acc, "0000000")) {
3015  rval = true;
3016  }
3017  break;
3018  default:
3019  break;
3020  }
3021  }
3022  break;
3023  case CSeq_id::e_Genbank:
3024  if (id.GetGenbank().IsSetAccession()) {
3025  rval = s_IsGenbankMasterAccession(id.GetGenbank().GetAccession());
3026  }
3027  break;
3028  case CSeq_id::e_Ddbj:
3029  if (id.GetDdbj().IsSetAccession()) {
3030  rval = s_IsGenbankMasterAccession(id.GetDdbj().GetAccession());
3031  }
3032  break;
3033  case CSeq_id::e_Embl:
3034  if (id.GetEmbl().IsSetAccession()) {
3035  rval = s_IsGenbankMasterAccession(id.GetEmbl().GetAccession());
3036  }
3037  break;
3038  case CSeq_id::e_Tpg:
3039  if (id.GetTpg().IsSetAccession()) {
3040  rval = s_IsGenbankMasterAccession(id.GetTpg().GetAccession());
3041  }
3042  break;
3043  default:
3044  break;
3045  }
3046 
3047  return rval;
3048 }
3049 
3051 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing. BadMultipleSequenceLocation's body calls
 // IsInOrganelleSmallGenomeSet(id, scope), which presumably is this
 // function -- confirm against the full source.
 //
 // True when the sequence for id has a source descriptor with an organelle
 // genome value and sits inside a small-genome set, reached by walking up
 // through enclosing nuc-prot sets only.
3052  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
3053  if (!bsh) {
3054  // can't fetch bioseq, can't tell, assume not
3055  return false;
3056  }
3057  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
3058  if (!src || !src->GetSource().IsSetGenome() || !IsOrganelle(src->GetSource().GetGenome())) {
3059  // not an organelle location
3060  return false;
3061  }
 // NOTE(review): the statement that declares `set` is not visible in this
 // listing (one line is skipped here); presumably it starts from the
 // Bioseq-set that contains bsh -- confirm against the full source.
3063  while (set) {
3064  if (!set.IsSetClass()) {
3065  // class not set - quit
3066  break;
3067  } else if (set.GetClass() == CBioseq_set::eClass_small_genome_set) {
3068  return true;
3069  } else if (set.GetClass() == CBioseq_set::eClass_nuc_prot) {
3070  // look at parent
3071  set = set.GetParentBioseq_set();
3072  } else {
 // any other set class ends the search
3073  break;
3074  }
3075  }
3076  return false;
3077 }
3078 
3079 
3081 {
 // NOTE(review): the signature line of this function is not visible in this
 // listing; the body reads parameters named `loc` and `scope`.
 //
 // Walks the intervals of loc and compares the Seq-id of every later interval
 // with the Seq-id of the first one. Returns true when
 //  - the first sequence is in an organelle small-genome set but a later one
 //    is not, or
 //  - the first sequence is not in such a set and a later id neither matches
 //    the first id nor refers to the same Bioseq.
 // Returns false otherwise.
3082  CSeq_loc_CI lit(loc);
 // NOTE(review): the first id is read without first checking that `lit` is
 // valid -- confirm that callers never pass a location with no intervals.
3083  const CSeq_id& id1 = lit.GetSeq_id();
3084 
3085  bool in_organelle_small_genome_set = IsInOrganelleSmallGenomeSet(id1, scope);
3086 
3087  ++lit;
3088  while (lit) {
3089  const CSeq_id& id2 = lit.GetSeq_id();
3090  if (in_organelle_small_genome_set && !IsInOrganelleSmallGenomeSet(id2, scope)) {
3091  // if one sequence in small genome set and other not, this is bad
3092  return true;
3093  }
3094  if (!id2.Match(id1) && !sequence::IsSameBioseq(id1, id2, &scope) && !in_organelle_small_genome_set) {
3095  return true;
3096  }
3097  ++lit;
3098  }
3099  return false;
3100 }
3101 
3102 bool IsBadSubmissionFirstName(const string& first)
3103 {
3104  if (NStr::EqualNocase(first, "Firstname") ||
3105  NStr::EqualNocase(first, "First") ||
3106  NStr::EqualNocase(first, "name") ||
3107  NStr::EqualNocase(first, "Please select") ||
3108  NStr::EqualNocase(first, "Please") ||
3109  NStr::EqualNocase(first, "Select")) {
3110  return true;
3111  }
3112  return false;
3113 }
3114 
3115 
3116 bool IsBadSubmissionLastName(const string& last)
3117 {
3118  if (NStr::EqualNocase(last, "Lastname") ||
3119  NStr::EqualNocase(last, "name") ||
3120  NStr::EqualNocase(last, "Please select") ||
3121  NStr::EqualNocase(last, "Please") ||
3122  NStr::EqualNocase(last, "Select")) {
3123  return true;
3124  }
3125  return false;
3126 }
3127 
3128 
3129 END_SCOPE(validator)
static CRef< CScope > m_Scope
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAlign_CI –.
Definition: align_ci.hpp:63
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CBioseq_set_Handle –.
void CalculateTranslationProblems(const CSeq_feat &feat, CBioseq_Handle loc_handle, CBioseq_Handle prot_handle, bool ignore_exceptions, bool far_fetch_cds, bool standalone_annot, bool single_seq, bool is_gpipe, bool is_genomic, bool is_refseq, bool is_nt_or_ng_or_nw, bool is_nc, bool has_accession, CScope *scope)
size_t GetTranslationProblemFlags() const
CCdregion –.
Definition: Cdregion.hpp:66
Definition: Date.hpp:53
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Definition: sequence.hpp:770
CFeat_id –.
Definition: Feat_id.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
int Compare(const CObject_id &oid2) const
Definition: Object_id.cpp:145
bool IsSetOrgMod(void) const
Definition: Org_ref.cpp:169
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
CSeqVector –.
Definition: seq_vector.hpp:65
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:593
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
void CalculateSpliceProblems(const CSeq_feat &feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle)
bool IsExceptionUnnecessary() const
CT3Reply –.
Definition: T3Reply.hpp:66
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:604
bool IsOneSpecificHostValid(const string &val, string &err_msg)
CConstRef< COrg_ref > GetOrgRef(TTaxId tax_id, bool &is_species, bool &is_uncultured, string &blast_name, bool *is_specified=NULL)
Definition: taxon1.cpp:704
TTaxId GetTaxIdByName(const string &orgname)
Definition: taxon1.cpp:523
bool Init(void)
Definition: taxon1.cpp:101
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
Definition: map.hpp:338
Definition: set.hpp:45
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static uch flags
static const char si[8][64]
Definition: des.c:146
CS_CONTEXT * ctx
Definition: t0006.c:12
#define bool
Definition: bool.h:34
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
static FILE * f
Definition: readconf.c:23
char data[12]
Definition: iconv.c:80
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define ZERO_GI
Definition: ncbimisc.hpp:1088
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
const string & FindName(TEnumValueType value, bool allowBadValue) const
Find name of the enum by its numeric value.
Definition: enumerated.cpp:146
const CVect2< U > & v2
Definition: globals.hpp:440
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1065
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string GetLabel(const CSeq_id &id)
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
Definition: Seq_id.hpp:607
CRef< CSeq_loc > MakeSeq_loc(EMakeType make_type=eMake_CompactType) const
return constructed CSeq_loc with all changes
Definition: Seq_loc.cpp:2946
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
Definition: Seq_loc.cpp:2585
void SetSeq_id(const CSeq_id &id)
Set seq_id of the current location.
Definition: Seq_loc.hpp:713
TSeqPos TPoint
Definition: Seq_loc.hpp:102
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
TRange GetRange(void) const
Get the range.
Definition: Seq_loc.hpp:1042
ENa_strand GetStrand(void) const
Definition: Seq_loc.hpp:1056
const CSeq_id & GetSeq_id(void) const
Get seq_id of the current location.
Definition: Seq_loc.hpp:1028
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
Definition: Seq_loc.cpp:3467
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
int SeqLocPartialCheck(const CSeq_loc &loc, CScope *scope)
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
@ eSeqlocPartial_Nostart
@ eSeqlocPartial_Nostop
@ eSeqlocPartial_Stop
@ eSeqlocPartial_Start
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary Checks to...
Definition: sequence.cpp:1428
virtual void WriteSequence(const CBioseq_Handle &handle, const CSeq_loc *location=0, CSeq_loc::EOpFlags merge_flags=CSeq_loc::fMerge_AbuttingOnly)
Definition: sequence.cpp:3322
void SetFlag(EFlags flag)
Definition: sequence.hpp:859
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
Definition: sequence.hpp:101
@ fInstantiateGaps
honor specified gap mode; on by default
Definition: sequence.hpp:774
@ fAssembleParts
assemble FAR delta sequences; on by dflt
Definition: sequence.hpp:773
TIds GetIds(const CSeq_id &id, TGetFlags flags=0)
Get "native" bioseq ids without filtering and matching.
Definition: scope.cpp:401
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id, const CTSE_Handle &tse)
Get bioseq handle for sequence within one TSE.
Definition: scope.cpp:253
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle GetSeq_entryHandle(CDataLoader *loader, const TBlobId &blob_id, EMissing action=eMissing_Default)
Get Seq-entry handle by its blob-id, with possible loading.
Definition: scope.cpp:113
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeq_annot_Handle GetSeq_annotHandle(const CSeq_annot &annot, EMissing action=eMissing_Default)
Definition: scope.cpp:192
TGi GetGi(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get GI of a sequence Returns ZERO_GI if the sequence is not found or if it doesn't have GI.
Definition: scope.cpp:419
vector< CSeq_id_Handle > TIds
Definition: scope.hpp:143
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
Definition: scope.hpp:128
bool IsSetComment(void) const
bool IsNucleotide(void) const
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
TClass GetClass(void) const
const CSeq_annot_Handle & GetAnnot(void) const
Get handle to seq-annot for this feature.
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
bool IsSetDbxref(void) const
virtual CConstRef< CSeq_feat > GetSeq_feat(void) const
const CSeqFeatData & GetData(void) const
TSet GetSet(void) const
bool IsAa(void) const
CConstRef< CBioseq_set > GetCompleteBioseq_set(void) const
Return the complete bioseq-set object.
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
virtual const CSeq_loc & GetLocation(void) const
TSeq GetSeq(void) const
EVectorCoding
CSeqVector constructor flags.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
TInst_Topology GetInst_Topology(void) const
const string & GetComment(void) const
const CSeq_annot::TDesc & Seq_annot_GetDesc(void) const
CSeq_entry_Handle GetParentEntry(void) const
Return a handle for the parent seq-entry of the bioseq.
TInst_Length GetInst_Length(void) const
bool IsSetInst_Repr(void) const
bool IsSetClass(void) const
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
CScope & GetScope(void) const
Get scope this handle belongs to.
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsSet(void) const
const CSeq_feat::TDbxref & GetDbxref(void) const
bool IsSetQual(void) const
bool IsSetInst_Topology(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const CSeq_feat::TQual & GetQual(void) const
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
const TInst & GetInst(void) const
bool IsSeq(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
unsigned char TResidue
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
static CRef< CSeqMap > CreateSeqMapForSeq_loc(const CSeq_loc &loc, CScope *scope)
Definition: seq_map.cpp:1134
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetCoding(TCoding coding)
bool IsNucleotide(void) const
Definition: seq_vector.hpp:357
CScope & GetScope(void) const
Definition: seq_vector.hpp:330
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2984
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5424
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5319
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5378
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3396
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
@ eCase
Case sensitive compare.
Definition: ncbistr.hpp:1205
static const char label[]
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool IsStd(void) const
Check if variant Std is selected.
Definition: Date_.hpp:320
const TStr & GetStr(void) const
Get the variant data.
Definition: Date_.hpp:306
const TStd & GetStd(void) const
Get the variant data.
Definition: Date_.cpp:109
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
bool IsStr(void) const
Check if variant Str is selected.
Definition: Date_.hpp:300
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool IsSetCommon(void) const
common name Check if a value has been assigned to Common data member.
Definition: Org_ref_.hpp:407
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
const TCommon & GetCommon(void) const
Get the Common member data.
Definition: Org_ref_.hpp:419
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
TDim GetDim(void) const
Get the Dim member data.
Definition: Seq_align_.hpp:856
bool IsSetSegs(void) const
Check if a value has been assigned to Segs data member.
Definition: Seq_align_.hpp:909
bool IsDendiag(void) const
Check if variant Dendiag is selected.
Definition: Seq_align_.hpp:720
const TDendiag & GetDendiag(void) const
Get the variant data.
Definition: Seq_align_.hpp:726
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:746
bool IsSetDim(void) const
dimensionality Check if a value has been assigned to Dim data member.
Definition: Seq_align_.hpp:837
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetCode(void) const
genetic code used Check if a value has been assigned to Code data member.
Definition: Cdregion_.hpp:700
bool IsCdregion(void) const
Check if variant Cdregion is selected.
void ResetCode(void)
Reset Code data member.
Definition: Cdregion_.cpp:63
bool IsSetPartial(void) const
incomplete in some way? Check if a value has been assigned to Partial data member.
Definition: Seq_feat_.hpp:943
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Feat_id_.cpp:134
void SetCode(TCode &value)
Assign a value to Code data member.
Definition: Cdregion_.cpp:68
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Feat_id_.hpp:353
TFrame GetFrame(void) const
Get the Frame member data.
Definition: Cdregion_.hpp:534
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
Explain if except=TRUE. Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
const TCode & GetCode(void) const
Get the Code member data.
Definition: Cdregion_.hpp:712
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
bool CanGetLocation(void) const
Check if it is safe to call GetLocation method.
Definition: Seq_feat_.hpp:1111
const TCdregion & GetCdregion(void) const
Get the variant data.
bool CanGetExcept_text(void) const
Check if it is safe to call GetExcept_text method.
Definition: Seq_feat_.hpp:1399
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
bool CanGetExcept(void) const
Check if it is safe to call GetExcept method.
Definition: Seq_feat_.hpp:996
TPartial GetPartial(void) const
Get the Partial member data.
Definition: Seq_feat_.hpp:962
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
vector< CRef< CGb_qual > > TQual
Definition: Seq_feat_.hpp:117
bool CanGetProduct(void) const
Check if it is safe to call GetProduct method.
Definition: Seq_feat_.hpp:1090
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
Definition: Cdregion_.hpp:509
bool IsSetLocation(void) const
Feature made from. Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
@ e_not_set
No variant selected.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const TWhole & GetWhole(void) const
Get the variant data.
Definition: Seq_loc_.cpp:172
TFrom GetFrom(void) const
Get the From member data.
TGi & SetGi(void)
Select the variant.
Definition: Seq_id_.hpp:896
TTo GetTo(void) const
Get the To member data.
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
const TLoc & GetLoc(void) const
Get the Loc member data.
Definition: Seq_graph_.hpp:869
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TClass GetClass(void) const
Get the Class member data.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
@ eClass_parts
parts for 2 or 3
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_segset
segmented sequence + parts
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
const Tdata & Get(void) const
Get the member data.
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
ERepr
representation class
Definition: Seq_inst_.hpp:91
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsSetLength(void) const
Length of sequence in residues. Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ e_not_set
No variant selected.
Definition: Seq_data_.hpp:103
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
bool IsData(void) const
Check if variant Data is selected.
Definition: T3Reply_.hpp:263
const TData & GetData(void) const
Get the variant data.
Definition: T3Reply_.cpp:124
list< CRef< CT3StatusFlags > > TStatus
Definition: T3Data_.hpp:94
bool IsError(void) const
Check if variant Error is selected.
Definition: T3Reply_.hpp:257
const TError & GetError(void) const
Get the variant data.
Definition: T3Reply_.cpp:102
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: T3Data_.hpp:285
const TMessage & GetMessage(void) const
Get the Message member data.
Definition: T3Error_.hpp:394
bool IsSetMessage(void) const
Check if a value has been assigned to Message data member.
Definition: T3Error_.hpp:382
int i
int len
const string version
version string
Definition: variables.hpp:66
EIPRangeType t
Definition: ncbi_localip.c:101
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int isupper(Uchar c)
Definition: ncbictype.hpp:70
const CConstRef< CSeq_id > GetAccession(const CSeq_id_Handle &id_handle)
The Object manager core.
static bool s_PosIsNNotGap(const CSeqVector &vec, unsigned int pos)
Definition: utilities.cpp:1293
bool IsBlankStringList(const list< string > &str_list)
Definition: utilities.cpp:114
bool s_IsSameSeqAnnot(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool &diff_descriptions)
Definition: utilities.cpp:1499
bool DoesCodingRegionHaveUnnecessaryException(const CSeq_feat &feat, const CBioseq_Handle &loc_handle, CScope &scope)
Definition: utilities.cpp:2895
CBioseq_Handle BioseqHandleFromLocation(CScope *m_Scope, const CSeq_loc &loc)
Definition: utilities.cpp:1277
bool IsDateInPast(const CDate &date)
Definition: utilities.cpp:826
void CheckBioseqEndsForNAndGap(const CSeqVector &vec, EBioseqEndIsType &begin_n, EBioseqEndIsType &begin_gap, EBioseqEndIsType &end_n, EBioseqEndIsType &end_gap, bool &begin_ambig, bool &end_ambig)
Definition: utilities.cpp:1317
string s_ReplaceListFromQuals(const CSeq_feat::TQual &quals)
Definition: utilities.cpp:1686
bool IsCommon(const COrg_ref &org, const string &val)
Definition: utilities.cpp:2129
bool ConsistentWithT(Char ch)
Definition: utilities.cpp:2886
DEFINE_STATIC_ARRAY_MAP(TBypassCdsTransCheckSet, sc_BypassCdsTransCheck, sc_BypassCdsTransCheckText)
vector< CConstRef< CObject_id > > TFeatIdVec
Definition: utilities.cpp:1724
static const char *const sc_ForceCdsTransCheckText[]
Definition: utilities.cpp:2334
string GetDateErrorDescription(int flags)
Definition: utilities.cpp:855
void CalculateEffectiveTranslationLengths(const string &transl_prot, const CSeqVector &prot_vec, size_t &len, size_t &prot_len)
Definition: utilities.cpp:2597
CConstRef< CSeq_id > GetReportableSeqIdForAlignment(const CSeq_align &align, CScope &scope)
Definition: utilities.cpp:399
CSeqVector GetSequenceFromLoc(const CSeq_loc &loc, CScope &scope, CBioseq_Handle::EVectorCoding coding)
Definition: utilities.cpp:203
CBioseq_set_Handle GetNucProtSetParent(const CBioseq_Handle &bioseq)
Definition: utilities.cpp:581
bool HasNoStop(const CSeq_feat &feat, CScope *scope)
Definition: utilities.cpp:2673
bool IsCommonName(const CT3Data &data)
Definition: utilities.cpp:1944
bool s_IsDifferentDbxrefs(const TDbtags &list1, const TDbtags &list2)
Definition: utilities.cpp:1608
CScope::TIds GetSeqIdsForGI(TGi gi)
Definition: utilities.cpp:142
static char s_ConvertChar(char ch)
Definition: utilities.cpp:2199
static bool IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle &bsh, const CSeq_align &align, CScope &scope)
Definition: utilities.cpp:387
string FixSpecificHost(const string &val)
Returns the corrected specific host if the specific host is invalid and can be corrected; returns an ...
Definition: utilities.cpp:2189
void FixGeneticCode(CCdregion &cdr)
Definition: utilities.cpp:2239
bool ShouldCheckForNsAndGap(const CBioseq_Handle &bsh)
Definition: utilities.cpp:1307
bool IsDicistronic(const CSeq_feat_Handle &f)
Definition: utilities.cpp:1825
bool HasECnumberPattern(const string &str)
Definition: utilities.cpp:1088
vector< TSeqPos > GetMismatches(const CSeq_feat &feat, const CSeqVector &prot_vec, const string &transl_prot)
Definition: utilities.cpp:2626
bool IsNTNCNWACAccession(const string &acc)
Definition: utilities.cpp:2755
static const CBioseq * s_GetSeqFromSet(const CBioseq_set &bsst)
Definition: utilities.cpp:295
CBioseq_Handle GetNucBioseq(const CBioseq_set_Handle &bioseq_set)
Definition: utilities.cpp:587
string GetBioseqIdLabel(const CBioseq &sq)
Definition: utilities.cpp:981
void AppendBioseqLabel(string &str, const CBioseq &sq, bool supress_context)
Definition: utilities.cpp:1064
CBioseq_Handle GetCDSProductSequence(const CSeq_feat &feat, CScope *scope, const CTSE_Handle &tse, bool far_fetch, bool &is_far)
Definition: utilities.cpp:2574
bool HasBadCharacter(const string &str)
Definition: utilities.cpp:755
bool IsSpecificHostValid(const string &val, string &error_msg)
Returns true and error_msg will be empty if specific host is valid; returns true and error_msg will b...
Definition: utilities.cpp:2182
CBioseq_set_Handle GetGenProdSetParent(const CBioseq_set_Handle &set)
Definition: utilities.cpp:570
static string s_GetBioseqAcc(const CSeq_id &id, int *version)
Definition: utilities.cpp:234
static const string sIgnoreHostWordList[]
Definition: utilities.cpp:2022
bool IsInOrganelleSmallGenomeSet(const CSeq_id &id, CScope &scope)
Definition: utilities.cpp:3050
bool s_IsSameStrand(const CSeq_loc &l1, const CSeq_loc &l2, CScope &scope)
Definition: utilities.cpp:1485
bool HasMisSpellFlag(const CT3Data &data)
Definition: utilities.cpp:1966
int CheckDate(const CDate &date, bool require_full_date)
Definition: utilities.cpp:780
string SpecificHostValueToCheck(const string &val)
Definition: utilities.cpp:2042
static bool s_IsGenbankMasterAccession(const string &acc)
Definition: utilities.cpp:2973
string GetAccessionFromBioseqSet(const CBioseq_set &bsst, int *version)
Definition: utilities.cpp:433
bool BadMultipleSequenceLocation(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:3080
bool IsClassInEntry(const CSeq_entry &se, CBioseq_set::EClass clss)
Definition: utilities.cpp:79
static bool s_AreLinkedToDifferentFeats(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, CSeqFeatData::ESubtype s1, CSeqFeatData::ESubtype s2)
Definition: utilities.cpp:1725
string InterpretSpecificHostResult(const string &host, const CT3Reply &reply, const string &orig_host)
Definition: utilities.cpp:2085
bool EndsWithBadCharacter(const string &str)
Definition: utilities.cpp:768
bool s_FeatureIdsMatch(const CFeat_id &f1, const CFeat_id &f2)
Definition: utilities.cpp:717
bool PartialsSame(const CSeq_loc &loc1, const CSeq_loc &loc2)
Definition: utilities.cpp:1463
bool IsLocFullLength(const CSeq_loc &loc, const CBioseq_Handle &bsh)
Definition: utilities.cpp:1451
bool s_AreDifferentVariations(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1703
string GetSequenceStringFromLoc(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:175
string GetValidatorLocationLabel(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:958
bool IsOrganelle(int genome)
Definition: utilities.cpp:2831
bool IsBadSubmissionLastName(const string &last)
Definition: utilities.cpp:3116
static string s_GetAccessionForSeqdesc(const CSeq_entry_Handle &seh, const CSeqdesc &desc, int *version)
Definition: utilities.cpp:365
static const char * kUnclassifiedTranslationDiscrepancy
Definition: utilities.cpp:2320
bool IsDicistronicGene(const CSeq_feat_Handle &f)
Indicates whether feature is a dicistronic gene.
Definition: utilities.cpp:1818
bool HasBadProteinStart(const CSeqVector &sv)
Definition: utilities.cpp:2467
bool ConsistentWithA(Char ch)
Definition: utilities.cpp:2871
CStaticArraySet< const char *, PCase_CStr > TBypassCdsTransCheckSet
Definition: utilities.cpp:2331
bool s_StringHasPMID(const string &str)
Definition: utilities.cpp:727
bool DoesmRNAHaveUnnecessaryException(const CSeq_feat &feat, const CBioseq_Handle &nuc, CScope &scope)
Definition: utilities.cpp:2923
bool HasBadStartCodon(const CSeq_loc &loc, const string &transl_prot)
Definition: utilities.cpp:2307
bool s_AreGBQualsIdentical(const CSeq_feat_Handle &feat1, const CSeq_feat_Handle &feat2, bool case_sensitive)
Definition: utilities.cpp:1535
bool DoesFeatureHaveUnnecessaryException(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2939
bool IsDeltaOrFarSeg(const CSeq_loc &loc, CScope *scope)
Definition: utilities.cpp:90
bool IsNG(const CSeq_id &id)
Definition: utilities.cpp:2790
bool ReportTranslationErrors(const string &except_text)
Definition: utilities.cpp:2341
bool HasInternalStop(const CSeq_feat &feat, CScope &scope, bool ignore_exceptions)
Definition: utilities.cpp:2416
bool s_AreFullLengthCodingRegionsWithDifferentFrames(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1644
bool ConsistentWithC(Char ch)
Definition: utilities.cpp:2876
static const char *const sc_BypassCdsTransCheckText[]
Definition: utilities.cpp:2322
bool IsLikelyTaxname(const string &val)
Definition: utilities.cpp:2150
string GetAccessionFromBioseq(const CBioseq &bioseq, int *version)
Definition: utilities.cpp:427
EDuplicateFeatureType IsDuplicate(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool check_partials, bool case_sensitive)
Reports how two features duplicate each other.
Definition: utilities.cpp:1838
bool SeqIsPatent(const CBioseq &seq)
Definition: utilities.cpp:1155
bool s_AreFeatureLabelsSame(const CSeq_feat_Handle &feat, const CSeq_feat_Handle &prev, bool case_sensitive)
Definition: utilities.cpp:1576
bool IsFarLocation(const CSeq_loc &loc, const CSeq_entry_Handle &seh)
Definition: utilities.cpp:159
static bool s_IsDescOnSeqEntry(const CSeq_entry &entry, const CSeqdesc &desc)
Definition: utilities.cpp:352
static bool s_AremRNAsLinkedToDifferentCodingRegions(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1812
TGi GetGIForSeqId(const CSeq_id &id)
Definition: utilities.cpp:125
CStaticArraySet< const char *, PCase_CStr > TForceCdsTransCheckSet
Definition: utilities.cpp:2338
static string s_GetSeq_featAcc(const CSeq_feat &feat, CScope &scope, int *version)
Definition: utilities.cpp:258
size_t CountInternalStopCodons(const string &transl_prot)
Definition: utilities.cpp:2393
CRef< CSeqVector > MakeSeqVectorForResidueCounting(const CBioseq_Handle &bsh)
Definition: utilities.cpp:2455
EAccessionFormatError ValidateAccessionString(const string &accession, bool require_version)
Definition: utilities.cpp:624
CBioseq_set_Handle GetSetParent(const CBioseq_set_Handle &set, CBioseq_set::TClass set_class)
Definition: utilities.cpp:532
void ConvertToEntrezTerm(string &title)
Definition: utilities.cpp:2220
static void UpdateToBestId(CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:928
bool IsTemporary(const CSeq_id &id)
Definition: utilities.cpp:2817
bool ConsistentWithG(Char ch)
Definition: utilities.cpp:2881
bool IsBioseqTSA(const CBioseq &seq, CScope *scope)
Definition: utilities.cpp:884
bool IsBadSubmissionFirstName(const string &first)
Definition: utilities.cpp:3102
bool FindMatchInOrgRef(const string &str, const COrg_ref &org)
Definition: utilities.cpp:1985
bool g_IsMasterAccession(const CSeq_id &id)
Definition: utilities.cpp:2999
void FeatureHasEnds(const CSeq_feat &feat, CScope *scope, bool &no_beg, bool &no_end)
Definition: utilities.cpp:2546
bool s_PartialAtGapOrNs(CScope *scope, const CSeq_loc &loc, unsigned int tag, bool only_gap)
Definition: utilities.cpp:1176
string GetAccessionFromObjects(const CSerialObject *obj, const CSeq_entry *ctx, CScope &scope, int *version)
Definition: utilities.cpp:443
bool HasStopInProtein(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2519
static bool s_AreCodingRegionsLinkedToDifferentmRNAs(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1806
size_t CountProteinStops(const CSeqVector &sv)
Definition: utilities.cpp:2504
bool IsSequenceFetchable(const CSeq_id &id, CScope *scope)
Definition: utilities.cpp:2714
void AdjustSpecificHostForTaxServer(string &spec_host)
Definition: utilities.cpp:2032
bool IsAccession(const CSeq_id &id)
Definition: utilities.cpp:918
string TranslateCodingRegionForValidation(const CSeq_feat &feat, CScope &scope, bool &alt_start)
Definition: utilities.cpp:2266
CSeqVector GetSequenceFromFeature(const CSeq_feat &feat, CScope &scope, CBioseq_Handle::EVectorCoding coding, bool product)
Definition: utilities.cpp:214
bool IsResidue(unsigned char residue)
Definition: utilities.hpp:88
EBioseqEndIsType
Definition: utilities.hpp:156
@ eBioseqEndIsType_Last
Definition: utilities.hpp:158
@ eBioseqEndIsType_None
Definition: utilities.hpp:157
@ eBioseqEndIsType_All
Definition: utilities.hpp:159
@ eDateValid_bad_str
Definition: utilities.hpp:124
@ eDateValid_empty_date
Definition: utilities.hpp:130
@ eDateValid_valid
Definition: utilities.hpp:123
@ eDateValid_bad_year
Definition: utilities.hpp:125
@ eDateValid_bad_day
Definition: utilities.hpp:127
@ eDateValid_bad_other
Definition: utilities.hpp:129
@ eDateValid_bad_month
Definition: utilities.hpp:126
@ eDateValid_bad_season
Definition: utilities.hpp:128
EDuplicateFeatureType
Definition: utilities.hpp:191
@ eDuplicate_Duplicate
Definition: utilities.hpp:193
@ eDuplicate_DuplicateDifferentTable
Definition: utilities.hpp:195
@ eDuplicate_SameIntervalDifferentLabel
Definition: utilities.hpp:194
@ eDuplicate_Not
Definition: utilities.hpp:192
@ eDuplicate_SameIntervalDifferentLabelDifferentTable
Definition: utilities.hpp:196
const CSeq_feat::TDbxref TDbtags
Definition: utilities.hpp:199
EAccessionFormatError
Definition: utilities.hpp:105
@ eAccessionFormat_too_long
Definition: utilities.hpp:110
@ eAccessionFormat_missing_version
Definition: utilities.hpp:111
@ eAccessionFormat_valid
Definition: utilities.hpp:106
@ eAccessionFormat_bad_version
Definition: utilities.hpp:112
@ eAccessionFormat_no_start_letters
Definition: utilities.hpp:107
@ eAccessionFormat_wrong_number_of_digits
Definition: utilities.hpp:108
@ eAccessionFormat_null
Definition: utilities.hpp:109
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
#define FOR_EACH_SEQID_ON_BIOSEQ(Itr, Var)
FOR_EACH_SEQID_ON_BIOSEQ / EDIT_EACH_SEQID_ON_BIOSEQ.
Definition: seq_macros.hpp:308
#define FOR_EACH_GBQUAL_ON_FEATURE
#define FOR_EACH_SYN_ON_ORGREF(Itr, Var)
FOR_EACH_SYN_ON_ORGREF / EDIT_EACH_SYN_ON_ORGREF.
#define FOR_EACH_SEQFEATXREF_ON_SEQFEAT(Itr, Var)
FOR_EACH_SEQFEATXREF_ON_SEQFEAT / EDIT_EACH_SEQFEATXREF_ON_SEQFEAT.
else result
Definition: token2.c:20
size_t GetMRNATranslationProblems(const CSeq_feat &feat, size_t &mismatches, bool ignore_exceptions, CBioseq_Handle nuc, CBioseq_Handle rna, bool far_fetch, bool is_gpipe, bool is_genomic, CScope *scope)
@ eMRNAProblem_UnnecessaryException
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:58:23 2024 by modify_doxy.py rev. 669887