NCBI C++ ToolKit
utilities.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: utilities.cpp 100058 2023-06-09 18:37:47Z gotvyans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mati Shomrat
27  *
28  * File Description:
29  * Implementation of utility classes and functions.
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <corelib/ncbistr.hpp>
35 
36 #include <serial/enumvalues.hpp>
37 #include <serial/serialimpl.hpp>
38 
44 #include <objects/seq/Bioseq.hpp>
48 #include <objmgr/bioseq_handle.hpp>
49 #include <objmgr/scope.hpp>
50 #include <objmgr/seq_vector.hpp>
51 #include <objmgr/util/sequence.hpp>
53 #include <objmgr/bioseq_ci.hpp>
54 #include <objmgr/seqdesc_ci.hpp>
55 #include <objmgr/align_ci.hpp>
63 
64 #include <vector>
65 #include <algorithm>
66 #include <list>
67 
68 
71 BEGIN_SCOPE(validator)
72 
73 
74 // =============================================================================
75 // Functions
76 // =============================================================================
77 
78 
// NOTE(review): the signature line of this function is missing from this
// listing; `se` and `clss` are presumably its parameters - confirm against
// the real source.
// Returns true as soon as one CBioseq_set visited while iterating over `se`
// has a class equal to `clss`; returns false when none does.
80 {
81  for ( CTypeConstIterator <CBioseq_set> si(se); si; ++si ) {
82  if ( si->GetClass() == clss ) {
83  return true;
84  }
85  }
86  return false;
87 }
88 
89 
// Resolves a Bioseq handle from `loc` and returns true from branches that
// test its representation against eRepr_delta and eRepr_seg; otherwise
// returns false.
// NOTE(review): several lines are missing from this listing (the declaration
// of `repr` and the openings of the nested conditions), so the exact extra
// conditions guarding each `return true` cannot be confirmed here.
90 bool IsDeltaOrFarSeg(const CSeq_loc& loc, CScope* scope)
91 {
92  CBioseq_Handle bsh = BioseqHandleFromLocation(scope, loc);
94 
95  if ( bsh.IsSetInst_Repr() ) {
97  if ( repr == CSeq_inst::eRepr_delta ) {
99  return true;
100  }
101  }
102  if ( repr == CSeq_inst::eRepr_seg ) {
104  return true;
105  }
106  }
107  }
108 
109  return false;
110 }
111 
112 
113 // Check if string is either empty or contains just white spaces
114 bool IsBlankStringList(const list< string >& str_list)
115 {
116  ITERATE( list< string >, str, str_list ) {
117  if ( !NStr::IsBlank(*str) ) {
118  return false;
119  }
120  }
121  return true;
122 }
123 
124 
// NOTE(review): the signature and two interior lines (the declarations of
// `scope` and `idh`) are missing from this listing.
// Looks up a GI through `scope` after AddDefaults(). Any CException or
// std::exception raised by the lookup is swallowed, in which case the
// initial ZERO_GI is returned.
126 {
127  TGi gi = ZERO_GI;
129  scope->AddDefaults();
130 
131  try {
133  gi = scope->GetGi (idh);
134  } catch (CException &) {
135  } catch (std::exception &) {
136  }
137  return gi;
138 }
139 
140 
141 
// NOTE(review): the signature and the line declaring `scope` are missing
// from this listing; `gi` is presumably a parameter.
// Wraps `gi` in a temporary CSeq_id and asks `scope` (after AddDefaults())
// for the ids associated with it. Exceptions from the lookup are swallowed,
// in which case the default-constructed (empty) id list is returned.
143 {
144  CScope::TIds id_list;
145  CSeq_id tmp_id;
146  tmp_id.SetGi(gi);
148  scope->AddDefaults();
149 
150  try {
151  id_list = scope->GetIds(tmp_id);
152 
153  } catch (CException &) {
154  } catch (std::exception &) {
155  }
156  return id_list;
157 }
158 
159 bool IsFarLocation(const CSeq_loc& loc, const CSeq_entry_Handle& seh)
160 {
161  CScope& scope = seh.GetScope();
162  for ( CSeq_loc_CI citer(loc); citer; ++citer ) {
163  CConstRef<CSeq_id> id(&citer.GetSeq_id());
164  if ( id ) {
165  CBioseq_Handle near_seq = scope.GetBioseqHandleFromTSE(*id, seh);
166  if ( !near_seq ) {
167  return true;
168  }
169  }
170  }
171 
172  return false;
173 }
174 
// NOTE(review): the line carrying the return type and function name, and
// two lines after the CFastaOstream declaration, are missing from this
// listing.
// Writes the sequence of every piece of `loc` whose Bioseq can be resolved
// through a CFastaOstream into an in-memory stream, then returns the
// collected text with all "\n" removed. If a CException is thrown the
// result is the empty string.
176 (const CSeq_loc& loc,
177  CScope& scope)
178 {
179  CNcbiOstrstream oss;
180  CFastaOstream fasta_ostr(oss);
183  string s;
184 
185  try {
186  for (CSeq_loc_CI citer (loc); citer; ++citer) {
187  const CSeq_loc& part = citer.GetEmbeddingSeq_loc();
188  CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, part);
// pieces whose Bioseq cannot be resolved are silently skipped
189  if (bsh) {
190  fasta_ostr.WriteSequence (bsh, &part);
191  }
192  }
193  s = CNcbiOstrstreamToString(oss);
194  NStr::ReplaceInPlace(s, "\n", "");
195  } catch (CException&) {
196  s = kEmptyStr;
197  }
198 
199  return s;
200 }
201 
202 
// NOTE(review): the return type / name line, the last parameter line
// (presumably `coding`) and the declaration of `map` are missing from this
// listing.
// Builds a CSeqVector over `loc` from `map`, `scope` and `coding`, always on
// the plus strand.
204 (const CSeq_loc& loc,
205  CScope& scope,
207 {
209  CSeqMap::CreateSeqMapForSeq_loc(loc, &scope);
210  return CSeqVector(*map, scope, coding, eNa_strand_plus);
211 }
212 
213 
// NOTE(review): the return type / name line and one parameter line
// (presumably `coding`) are missing from this listing.
// Returns the sequence for either the product (product == true) or the
// location (product == false) of `feat` via GetSequenceFromLoc(). When the
// requested member cannot be read, a default-constructed CSeqVector is
// returned instead.
215 (const CSeq_feat& feat,
216  CScope& scope,
218  bool product)
219 {
220 
221  if ( (product && !feat.CanGetProduct()) ||
222  (!product && !feat.CanGetLocation()) ) {
223  return CSeqVector();
224  }
225 
226  const CSeq_loc* loc = product ? &feat.GetProduct() : &feat.GetLocation();
227  return GetSequenceFromLoc(*loc, scope, coding);
228 }
229 
230 
231 /***** Calculate Accession for a given object *****/
232 
233 
234 static string s_GetBioseqAcc(const CSeq_id& id, int* version)
235 {
236  try {
237  string label;
238  id.GetLabel(&label, version, CSeq_id::eFasta);
239  return label;
240  } catch (CException&) {
241  return kEmptyStr;
242  }
243 }
244 
245 
246 static string s_GetBioseqAcc(const CBioseq_Handle& handle, int* version)
247 {
248  if (handle) {
249  CConstRef<CSeq_id> seqid = sequence::GetId(handle, sequence::eGetId_Best).GetSeqId();
250  if (seqid) {
251  return s_GetBioseqAcc(*seqid, version);
252  }
253  }
254  return kEmptyStr;
255 }
256 
257 
// Returns the accession to report for a feature: normally that of the
// Bioseq resolved from the feature location. When `parent` is a "parts" set
// whose own parent is a "segset", the accession of the first Bioseq
// iterated in that segset is used instead.
// NOTE(review): the line declaring `parent` is missing from this listing;
// it is presumably derived from `seq` - confirm against the real source.
258 static string s_GetSeq_featAcc(const CSeq_feat& feat, CScope& scope, int* version)
259 {
260  CBioseq_Handle seq = BioseqHandleFromLocation (&scope, feat.GetLocation());
261  if (seq) {
263  if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_parts) {
264  parent = parent.GetParentBioseq_set();
265  if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_segset) {
266  CBioseq_CI m(parent);
267  if (m) {
268  return s_GetBioseqAcc(*m, version);
269  }
270  }
271  }
272  }
273 
// fall back to the located Bioseq itself (empty string if `seq` is invalid)
274  return s_GetBioseqAcc(seq, version);
275 }
276 
277 
278 //static string s_GetBioseqAcc(const CBioseq& seq, CScope& scope, int* version)
279 //{
280 // CBioseq_Handle handle = scope.GetBioseqHandle(seq);
281 // return s_GetBioseqAcc(handle, version);
282 //}
283 
284 
285 static string s_GetBioseqAcc(const CBioseq& seq, int* version)
286 {
287  auto seqid = sequence::GetId(seq, sequence::eGetId_Best).GetSeqId();
288  if (seqid) {
289  return s_GetBioseqAcc(*seqid, version);
290  }
291  return kEmptyStr;
292 }
293 
294 
// Picks one representative Bioseq from a Bioseq-set, or nullptr if the set
// has no Seq-set member or nothing suitable is found.
// NOTE(review): the `case` labels of the switch and the declaration of
// `seqit` are missing from this listing, so which set classes map to which
// branch cannot be confirmed here.
295 static const CBioseq* s_GetSeqFromSet(const CBioseq_set& bsst)
296 {
297  if (!bsst.IsSetSeq_set()) {
298  return nullptr;
299  }
300 
301  switch (bsst.GetClass()) {
303  // find the genomic bioseq
// first direct Bioseq member whose molecule type is DNA
304  for (auto pSubEntry : bsst.GetSeq_set()) {
305  if (pSubEntry->IsSeq()) {
306  const auto& inst = pSubEntry->GetSeq().GetInst();
307  if (inst.IsSetMol() && inst.GetMol() == CSeq_inst::eMol_dna) {
308  return &(pSubEntry->GetSeq());
309  }
310  }
311  }
312  break;
314  // find the nucleotide bioseq
// a nested segset member is handled by recursing into it
315  for (auto pSubEntry : bsst.GetSeq_set()) {
316  if (pSubEntry->IsSeq() && pSubEntry->GetSeq().IsNa()) {
317  return &pSubEntry->GetSeq();
318  } else if (pSubEntry->IsSet() &&
319  pSubEntry->GetSet().IsSetClass() &&
320  pSubEntry->GetSet().GetClass() == CBioseq_set::eClass_segset) {
321  return s_GetSeqFromSet(pSubEntry->GetSet());
322  }
323  }
324 
// no nucleotide found: settle for the first direct Bioseq member
325  for (auto pSubEntry : bsst.GetSeq_set()) {
326  if (pSubEntry->IsSeq()) {
327  return &pSubEntry->GetSeq();
328  }
329  }
330  break;
// first direct Bioseq member of the set
332  for (auto pSubEntry : bsst.GetSeq_set()) {
333  if (pSubEntry->IsSeq()) {
334  return &pSubEntry->GetSeq();
335  }
336  }
337  break;
338 
339  default:
340  break;
341  }
342 
343  // In this case, return the first bioseq in the set
345  if (seqit) {
346  return &(*seqit);
347  }
348  return nullptr;
349 }
350 
351 
352 static bool s_IsDescOnSeqEntry (const CSeq_entry& entry, const CSeqdesc& desc)
353 {
354  if (entry.IsSetDescr()) {
355  const auto& descs = entry.GetDescr();
356  for (auto& it : descs.Get()) {
357  if (it->Equals(desc)) {
358  return true;
359  }
360  }
361  }
362  return false;
363 }
364 
365 
366 
367 static string s_GetAccessionForSeqdesc (const CSeq_entry_Handle& seh, const CSeqdesc& desc, int* version)
368 {
369  if (!seh) {
370  return kEmptyStr;\
371  } else if (seh.IsSeq()) {
372  return s_GetBioseqAcc(seh.GetSeq(), version);
373  //return s_GetBioseqAcc(*(seh.GetSeq().GetCompleteBioseq()), version);
374  } else if (s_IsDescOnSeqEntry (*(seh.GetCompleteSeq_entry()), desc)) {
375  const CBioseq* seq = s_GetSeqFromSet(*(seh.GetSet().GetCompleteBioseq_set()));
376  if (seq) {
377  return s_GetBioseqAcc(*seq, version);
378  }
379  } else {
380  CSeq_entry_Handle parent = seh.GetParentEntry();
381  if (parent) {
382  return s_GetAccessionForSeqdesc(parent, desc, version);
383  }
384  }
385  return kEmptyStr;
386 }
387 
388 
// Returns true if `align` (compared by object address) is among the
// alignments iterated by CAlign_CI over `seh`.
// NOTE(review): the line declaring `seh` is missing from this listing; it
// is presumably an entry handle derived from `bsh` - confirm.
389 bool IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle& bsh, const CSeq_align& align, CScope& scope)
390 {
392  for (CAlign_CI align_it(seh); align_it; ++align_it) {
393  if (&(*align_it) == &align) {
394  return true;
395  }
396  }
397  return false;
398 }
399 
400 
// NOTE(review): the signature line is missing from this listing; `align`
// and `scope` are presumably parameters and the return type appears to be
// CConstRef<CSeq_id>.
// Chooses a Seq-id to represent an alignment:
//   - Std-seg alignments yield an empty reference;
//   - with Dim set, the first row id that resolves to a Bioseq for which
//     IsBioseqInSameSeqEntryAsAlign() is true;
//   - without Dim, for Dendiag alignments, the first id of the first
//     dense-diag;
//   - otherwise the id of row 0.
// Any CException results in an empty reference.
402 {
403  // temporary - to match C Toolkit
404  if (align.IsSetSegs() && align.GetSegs().IsStd()) {
405  return CConstRef<CSeq_id>();
406  }
407  try {
408  if (align.IsSetDim()) {
409  for (int i = 0; i < align.GetDim(); ++i) {
410  const CSeq_id& id = align.GetSeq_id(i);
411  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
412  if (bsh && IsBioseqInSameSeqEntryAsAlign(bsh, align, scope)) {
413  return CConstRef<CSeq_id>(&id);
414  }
415  }
416  } else if (align.IsSetSegs() && align.GetSegs().IsDendiag()) {
417  const CSeq_id& id = *(align.GetSegs().GetDendiag().front()->GetIds()[0]);
418  return CConstRef<CSeq_id>(&id);
419  }
420  // failed to find resolvable ID, use bare ID
421  const CSeq_id& id = align.GetSeq_id(0);
422  return CConstRef<CSeq_id>(&id);
423  } catch (CException& ) {
424  }
425  return CConstRef<CSeq_id>();
426 }
427 
428 
429 
430 string GetAccessionFromBioseq(const CBioseq& bioseq, int* version)
431 {
432  return s_GetBioseqAcc(bioseq, version);
433 }
434 
435 
// NOTE(review): the signature line is missing from this listing; `bsst`
// and `version` are presumably parameters.
// Returns the accession of the Bioseq that s_GetSeqFromSet() selects from
// `bsst`, or the empty string when it selects none.
437 {
438  const CBioseq* seq = s_GetSeqFromSet(bsst);
439  if (seq) {
440  return s_GetBioseqAcc(*seq, version);
441  }
442  return kEmptyStr;
443 }
444 
445 
// Works out which accession to report for `obj`.
// Order of precedence, as far as this listing shows:
//   1. `obj` is a CSeqdesc and `ctx` is given: descriptor-specific lookup
//      via s_GetAccessionForSeqdesc(); used only if the result is non-blank.
//   2. `ctx` is given: accession of the context Bioseq, or of the Bioseq
//      picked from the context set.
//   3. no `ctx`: dispatch on the dynamic type of `obj` (feature, Bioseq,
//      Bioseq-set, Seq-entry, Seq-annot, Seq-align, Seq-graph).
// Returns an empty string when nothing applies.
// NOTE(review): three lines are missing from this listing - the
// declarations of `seh` (twice) and of `id` in the Seq-align branch.
446 string GetAccessionFromObjects(const CSerialObject* obj, const CSeq_entry* ctx, CScope& scope, int* version)
447 {
448  string empty_acc;
449 
450  if (obj && obj->GetThisTypeInfo() == CSeqdesc::GetTypeInfo() && ctx) {
452  const CSeqdesc& desc = dynamic_cast<const CSeqdesc&>(*obj);
453  string acc = s_GetAccessionForSeqdesc(seh, desc, version);
454  if (!NStr::IsBlank(acc)) {
455  return acc;
456  }
457  }
458 
459  if (ctx) {
460  if (ctx->IsSeq()) {
461  return s_GetBioseqAcc(ctx->GetSeq(), version);
462  } else if (ctx->IsSet()) {
463  const CBioseq* seq = s_GetSeqFromSet(ctx->GetSet());
464  if (seq) {
465  return s_GetBioseqAcc(*seq, version);
466  }
467  }
468  } else if (obj) {
469  if (obj->GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
470  const CSeq_feat& feat = dynamic_cast<const CSeq_feat&>(*obj);
471  return s_GetSeq_featAcc(feat, scope, version);
472  } else if (obj->GetThisTypeInfo() == CBioseq::GetTypeInfo()) {
473  const CBioseq& seq = dynamic_cast<const CBioseq&>(*obj);
474  return s_GetBioseqAcc(seq, version);
475  } else if (obj->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
476  const CBioseq_set& bsst = dynamic_cast<const CBioseq_set&>(*obj);
477  const CBioseq* seq = s_GetSeqFromSet(bsst);
478  if (seq) {
479  return s_GetBioseqAcc(*seq, version);
480  }
481  } else if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
482  const CSeq_entry& entry = dynamic_cast<const CSeq_entry&>(*obj);
483  if (entry.IsSeq()) {
484  return s_GetBioseqAcc(entry.GetSeq(), version);
485  } else if (entry.IsSet()) {
486  const CBioseq* seq = s_GetSeqFromSet(entry.GetSet());
487  if (seq) {
488  return s_GetBioseqAcc(*seq, version);
489  }
490  }
491  } else if (obj->GetThisTypeInfo() == CSeq_annot::GetTypeInfo()) {
// annotations are resolved through the scope to the entry that holds them
492  CSeq_annot_Handle ah = scope.GetSeq_annotHandle (dynamic_cast<const CSeq_annot&>(*obj));
493  if (ah) {
495  if (seh) {
496  if (seh.IsSeq()) {
497  return s_GetBioseqAcc(seh.GetSeq(), version);
498  } else if (seh.IsSet()) {
499  CBioseq_set_Handle bsh = seh.GetSet();
500  const CBioseq_set& bsst = *(bsh.GetCompleteBioseq_set());
501  const CBioseq* seq = s_GetSeqFromSet(bsst);
502  if (seq) {
503  return s_GetBioseqAcc(*seq, version);
504  }
505  }
506  }
507  }
508  } else if (obj->GetThisTypeInfo() == CSeq_align::GetTypeInfo()) {
509  const CSeq_align& align = dynamic_cast<const CSeq_align&>(*obj);
// prefer the resolved Bioseq's label; fall back to the bare id's label
511  if (id) {
512  CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
513  if (bsh) {
514  return s_GetBioseqAcc(bsh, version);
515  } else {
516  return s_GetBioseqAcc(*id, version);
517  }
518  }
519  } else if (obj->GetThisTypeInfo() == CSeq_graph::GetTypeInfo()) {
520  const CSeq_graph& graph = dynamic_cast<const CSeq_graph&>(*obj);
// a CException while reading the graph location is ignored
521  try {
522  const CSeq_loc& loc = graph.GetLoc();
523  const CSeq_id *id = loc.GetId();
524  if (id) {
525  return s_GetBioseqAcc (*id, version);
526  }
527  } catch (CException& ) {
528  }
529  }
530  }
531  return empty_acc;
532 }
533 
534 
// NOTE(review): the signature line is missing from this listing; `set` and
// `set_class` are presumably parameters. The recursive call below shows the
// function is named GetSetParent.
// Starting from the entry two levels above `set` (parent of the parent
// entry), returns that entry's set if its class equals `set_class`;
// otherwise repeats the search from that set. Returns a default-constructed
// handle when the chain ends or a non-set entry is reached.
536 {
537  CBioseq_set_Handle gps;
538 
539  CSeq_entry_Handle parent = set.GetParentEntry();
540  if (!parent) {
541  return gps;
542  } else if (!(parent = parent.GetParentEntry())) {
543  return gps;
544  } else if (!parent.IsSet()) {
545  return gps;
546  } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
547  return parent.GetSet();
548  } else {
549  return GetSetParent (parent.GetSet(), set_class);
550  }
551 }
552 
553 
// NOTE(review): the signature line and the declaration of the local `set`
// (returned as the "not found" value) are missing from this listing;
// `bioseq` and `set_class` are presumably parameters.
// Same upward search as the set-based GetSetParent it delegates to, but
// starting from the entry two levels above `bioseq`.
555 {
557 
558  CSeq_entry_Handle parent = bioseq.GetParentEntry();
559  if (!parent) {
560  return set;
561  } else if (!(parent = parent.GetParentEntry())) {
562  return set;
563  } else if (!parent.IsSet()) {
564  return set;
565  } else if (parent.GetSet().IsSetClass() && parent.GetSet().GetClass() == set_class) {
566  return parent.GetSet();
567  } else {
568  return GetSetParent (parent.GetSet(), set_class);
569  }
570 }
571 
572 
// NOTE(review): only the braces of this definition survive in this listing;
// its signature and body line are missing, so it cannot be documented here.
574 {
576 }
577 
// NOTE(review): signature and body line missing from this listing.
579 {
581 }
582 
583 
// NOTE(review): signature and body line missing from this listing.
585 {
587 }
588 
589 
// NOTE(review): the signature line and the declaration of `nuc` are missing
// from this listing; `bioseq_set` is presumably the parameter. The
// recursive call below shows the function is named GetNucBioseq.
// Returns the first nucleic-acid Bioseq iterated within `bioseq_set`. If
// there is none, the search is repeated on the set two entry levels up
// (when that entry exists and is a set). An invalid input handle returns
// `nuc` unchanged.
591 {
593 
594  if (!bioseq_set) {
595  return nuc;
596  }
597  CBioseq_CI bit(bioseq_set, CSeq_inst::eMol_na);
598  if (bit) {
599  nuc = *bit;
600  } else {
601  CSeq_entry_Handle parent = bioseq_set.GetParentEntry();
602  if (parent && (parent = parent.GetParentEntry())
603  && parent.IsSet()) {
604  nuc = GetNucBioseq (parent.GetSet());
605  }
606  }
607  return nuc;
608 }
609 
610 
// NOTE(review): the signature line and the declaration of `nuc` are missing
// from this listing; `bioseq` is presumably the parameter.
// A nucleotide `bioseq` is returned as is. Otherwise the set two entry
// levels above it (if present) is searched with the set-based GetNucBioseq.
612 {
614 
615  if (bioseq.IsNucleotide()) {
616  return bioseq;
617  }
618  CSeq_entry_Handle parent = bioseq.GetParentEntry();
619  if (parent && (parent = parent.GetParentEntry())
620  && parent.IsSet()) {
621  nuc = GetNucBioseq (parent.GetSet());
622  }
623  return nuc;
624 }
625 
626 
// Checks whether `accession` has the shape of an accession: a run of
// letters, at most one underscore, a run of digits and, when
// `require_version` is true, a ".<digits>" version suffix.
// NOTE(review): most of the error `return` statements are missing from this
// listing (the bodies of several `if` branches appear empty), so the
// specific error code produced by each failed check cannot be confirmed
// here.
627 EAccessionFormatError ValidateAccessionString (const string& accession, bool require_version)
628 {
629  if (NStr::IsBlank (accession)) {
630  return eAccessionFormat_null;
631  } else if (accession.length() >= 16) {
633  } else if (accession.length() < 3
634  || ! isalpha (accession.c_str()[0])
635  || ! isupper (accession.c_str()[0])) {
637  }
638 
// a leading "NZ_" is skipped before the letters/underscore/digits are counted
639  string str = accession;
640  if (NStr::StartsWith (str, "NZ_")) {
641  str = str.substr(3);
642  }
643 
644  const char *cp = str.c_str();
645  int numAlpha = 0;
646 
647  while (isalpha (*cp)) {
648  numAlpha++;
649  cp++;
650  }
651 
652  int numUndersc = 0;
653 
654  while (*cp == '_') {
655  numUndersc++;
656  cp++;
657  }
658 
659  int numDigits = 0;
660  while (isdigit (*cp)) {
661  numDigits++;
662  cp++;
663  }
664 
// after the digits only end-of-string, a space or a '.' may follow
665  if ((*cp != '\0' && *cp != ' ' && *cp != '.') || numUndersc > 1) {
667  }
668 
669  if (require_version) {
670  if (*cp != '.') {
672  }
673  cp++;
674  int numVersion = 0;
675  while (isdigit (*cp)) {
676  numVersion++;
677  cp++;
678  }
679  if (numVersion < 1) {
681  } else if (*cp != '\0' && *cp != ' ') {
683  }
684  }
685 
686 
// accepted letter/digit count combinations when there is no underscore
687  if (numUndersc == 0) {
688  if ((numAlpha == 1 && numDigits == 5)
689  || (numAlpha == 2 && numDigits == 6)
690  || (numAlpha == 3 && numDigits == 5)
691  || (numAlpha == 4 && numDigits == 8)
692  || (numAlpha == 5 && numDigits == 7)) {
693  return eAccessionFormat_valid;
694  }
695  } else {
// underscore form: two letters followed by 6, 8 or 9 digits
696  if (numAlpha != 2 || (numDigits != 6 && numDigits != 8 && numDigits != 9)) {
698  }
// the two-letter prefix is read from the original string, not from `str`
699  char first_letter = accession.c_str()[0];
700  char second_letter = accession.c_str()[1];
701  if (first_letter == 'N' || first_letter == 'X' || first_letter == 'Z') {
702  if (second_letter == 'M' || second_letter == 'C'
703  || second_letter == 'T' || second_letter == 'P'
704  || second_letter == 'G' || second_letter == 'R'
705  || second_letter == 'S' || second_letter == 'W'
706  || second_letter == 'Z') {
707  return eAccessionFormat_valid;
708  }
709  }
710  if ((first_letter == 'A' || first_letter == 'Y')
711  && second_letter == 'P') {
712  return eAccessionFormat_valid;
713  }
714  }
715 
717 }
718 
719 
720 bool s_FeatureIdsMatch (const CFeat_id& f1, const CFeat_id& f2)
721 {
722  if (!f1.IsLocal() || !f2.IsLocal()) {
723  return false;
724  }
725 
726  return 0 == f1.GetLocal().Compare(f2.GetLocal());
727 }
728 
729 
// Returns true when str contains "(PMID " followed, before the next ')',
// by at least one decimal digit - e.g. "(PMID 12345)".
// Only the first occurrence of "(PMID " is examined. Non-digit characters
// between the marker and ')' are tolerated. A missing ')' gives false.
bool s_StringHasPMID (const string& str)
{
    static const char kMarker[] = "(PMID ";

    // A blank string cannot contain the marker, so no separate blank check
    // is needed.
    const size_t marker_pos = str.find(kMarker);
    if (marker_pos == string::npos) {
        return false;
    }

    bool saw_digit = false;
    for (size_t i = marker_pos + sizeof(kMarker) - 1; i < str.size(); ++i) {
        // <cctype> classifiers are only defined for unsigned char values
        // (and EOF); a plain char may be negative for non-ASCII bytes.
        const unsigned char ch = static_cast<unsigned char>(str[i]);
        if (ch == ')') {
            return saw_digit;
        }
        if (ch == '\0') {
            // keep the historical behaviour of stopping at an embedded NUL
            return false;
        }
        if (isdigit(ch)) {
            saw_digit = true;
        }
    }
    return false;
}
756 
757 
// Returns true when str contains any of the characters '?', '!', '~' or '|'.
// A single find_first_of pass replaces four separate substring searches.
bool HasBadCharacter (const string& str)
{
    return str.find_first_of("?!~|") != string::npos;
}
769 
770 
// Returns true when the last character of str is one of '_', '.', ',', ':'
// or ';'. An empty string returns false.
// Inspecting the final character once replaces five suffix comparisons.
bool EndsWithBadCharacter (const string& str)
{
    if (str.empty()) {
        return false;
    }
    switch (str.back()) {
    case '_':
    case '.':
    case ',':
    case ':':
    case ';':
        return true;
    default:
        return false;
    }
}
781 
782 
783 int CheckDate (const CDate& date, bool require_full_date)
784 {
785  int rval = eDateValid_valid;
786 
787  if (date.IsStr()) {
788  if (NStr::IsBlank (date.GetStr()) || NStr::Equal (date.GetStr(), "?")) {
789  rval |= eDateValid_bad_str;
790  }
791  } else if (date.IsStd()) {
792  const auto& sdate = date.GetStd();
793  if (!sdate.IsSetYear() || sdate.GetYear() < 1000) {
794  rval |= eDateValid_bad_year;
795  }
796  if (sdate.IsSetMonth() && sdate.GetMonth() > 12) {
797  rval |= eDateValid_bad_month;
798  }
799  if (sdate.IsSetDay() && sdate.GetDay() > 31) {
800  rval |= eDateValid_bad_day;
801  }
802  if (require_full_date) {
803  if (!sdate.IsSetMonth() || sdate.GetMonth() == 0) {
804  rval |= eDateValid_bad_month;
805  }
806  if (!sdate.IsSetDay() || sdate.GetDay() == 0) {
807  rval |= eDateValid_bad_day;
808  }
809  }
810  if (sdate.IsSetSeason() && !NStr::IsBlank (sdate.GetSeason())) {
811  const char * cp = sdate.GetSeason().c_str();
812  while (*cp != 0) {
813  if (isalpha (*cp) || *cp == '-') {
814  // these are the only acceptable characters
815  } else {
816  rval |= eDateValid_bad_season;
817  break;
818  }
819  ++cp;
820  }
821  }
822  } else {
823  rval |= eDateValid_bad_other;
824  }
825  return rval;
826 }
827 
828 
829 bool IsDateInPast(const CDate& date)
830 {
831  time_t t;
832  time(&t);
833  struct tm *tm;
834  tm = localtime(&t);
835 
836  bool in_past = false;
837  if (!date.IsStd()) {
838  return false;
839  }
840  const auto & sdate = date.GetStd();
841  if (sdate.GetYear() < tm->tm_year + 1900) {
842  in_past = true;
843  } else if (sdate.GetYear() == tm->tm_year + 1900
844  && sdate.IsSetMonth()) {
845  if (sdate.GetMonth() < tm->tm_mon + 1) {
846  in_past = true;
847  } else if (sdate.GetMonth() == tm->tm_mon + 1
848  && sdate.IsSetDay()) {
849  if (sdate.GetDay() < tm->tm_mday) {
850  in_past = true;
851  }
852  }
853  }
854  return in_past;
855 }
856 
857 
// NOTE(review): the signature line and two `if` lines (guarding the
// "EMPTY_DATE " and "BAD_SEASON " appends) are missing from this listing;
// `flags` is presumably the parameter.
// Turns a set of eDateValid_* flag bits into a space-terminated list of
// reason tokens, in the fixed order shown below.
859 {
860  string reasons;
861 
863  reasons += "EMPTY_DATE ";
864  }
865  if (flags & eDateValid_bad_str) {
866  reasons += "BAD_STR ";
867  }
868  if (flags & eDateValid_bad_year) {
869  reasons += "BAD_YEAR ";
870  }
871  if (flags & eDateValid_bad_month) {
872  reasons += "BAD_MONTH ";
873  }
874  if (flags & eDateValid_bad_day) {
875  reasons += "BAD_DAY ";
876  }
878  reasons += "BAD_SEASON ";
879  }
880  if (flags & eDateValid_bad_other) {
881  reasons += "BAD_OTHER ";
882  }
883  return reasons;
884 }
885 
886 
887 bool IsBioseqTSA (const CBioseq& seq, CScope* scope)
888 {
889  if (!scope) {
890  return false;
891  }
892  bool is_tsa = false;
893  CBioseq_Handle bsh = scope->GetBioseqHandle(seq);
894  if (bsh) {
895  CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_Molinfo);
896  while (desc_ci && !is_tsa) {
897  if (desc_ci->GetMolinfo().IsSetTech() && desc_ci->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
898  is_tsa = true;
899  }
900  ++desc_ci;
901  }
902  }
903  return is_tsa;
904 }
905 
906 
907 #if 0
908 // disabled for now
909 bool IsNCBIFILESeqId (const CSeq_id& id)
910 {
911  if (!id.IsGeneral() || !id.GetGeneral().IsSetDb()
912  || !NStr::Equal(id.GetGeneral().GetDb(), "NCBIFILE")) {
913  return false;
914  } else {
915  return true;
916  }
917 }
918 #endif
919 
920 
921 bool IsAccession(const CSeq_id& id)
922 {
923  if (id.GetTextseq_Id()) {
924  return true;
925  } else {
926  return false;
927  }
928 }
929 
930 
931 static void UpdateToBestId(CSeq_loc& loc, CScope& scope)
932 {
933  bool any_change = false;
934  CSeq_loc_I it(loc);
935  for (; it; ++it) {
936  const CSeq_id& id = it.GetSeq_id();
937  if (!IsAccession(id)) {
938  CConstRef<CSeq_id> best_id;
939  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
940  if (bsh) {
941  const auto& ids = bsh.GetCompleteBioseq()->GetId();
942  for (auto& id_it : ids) {
943  if (IsAccession(*id_it)) {
944  best_id = id_it;
945  break;
946  }
947  }
948  }
949  if (best_id) {
950  it.SetSeq_id(*best_id);
951  any_change = true;
952  }
953  }
954  }
955  if (any_change) {
956  loc.Assign(*it.MakeSeq_loc());
957  }
958 }
959 
960 
961 string GetValidatorLocationLabel (const CSeq_loc& loc, CScope& scope)
962 {
963  string loc_label;
964  if (loc.IsWhole()) {
965  CBioseq_Handle bsh = scope.GetBioseqHandle(loc.GetWhole());
966  if (bsh) {
967  loc_label = GetBioseqIdLabel(*(bsh.GetCompleteBioseq()));
968  NStr::ReplaceInPlace(loc_label, "[", "");
969  NStr::ReplaceInPlace(loc_label, "]", "");
970  }
971  }
972  if (NStr::IsBlank(loc_label)) {
973  CSeq_loc tweaked_loc;
974  tweaked_loc.Assign(loc);
975  UpdateToBestId(tweaked_loc, scope);
976  tweaked_loc.GetLabel(&loc_label);
977  NStr::ReplaceInPlace(loc_label, "[", "(");
978  NStr::ReplaceInPlace(loc_label, "]", ")");
979  }
980  return loc_label;
981 }
982 
983 
984 
985 string GetBioseqIdLabel(const CBioseq& sq, bool limited)
986 {
987  string content;
988  int num_ids_found = 0;
989  bool id_found = false;
990 
991  const auto& id_list = sq.GetId();
992 
993  /* find first gi */
994  for (auto id_it : id_list) {
995  if (id_it->IsGi()) {
996  CNcbiOstrstream os;
997  id_it->WriteAsFasta(os);
998  string s = CNcbiOstrstreamToString(os);
999  content += s;
1000  num_ids_found ++;
1001  break;
1002  }
1003  }
1004  /* find first accession */
1005  for (auto id_it : id_list) {
1006  if (id_it->IsGenbank()
1007  || id_it->IsDdbj()
1008  || id_it->IsEmbl()
1009  || id_it->IsSwissprot()
1010  || id_it->IsOther()
1011  || id_it->IsTpd()
1012  || id_it->IsTpe()
1013  || id_it->IsTpg()) {
1014  if (num_ids_found > 0) {
1015  content += "|";
1016  }
1017  CNcbiOstrstream os;
1018  id_it->WriteAsFasta(os);
1019  string s = CNcbiOstrstreamToString(os);
1020  content += s;
1021  num_ids_found++;
1022  break;
1023  }
1024  }
1025 
1026  if (num_ids_found == 0) {
1027  /* find first general */
1028  for (auto id_it : id_list) {
1029  if (id_it->IsGeneral()) {
1030  if (num_ids_found > 0) {
1031  content += "|";
1032  }
1033  CNcbiOstrstream os;
1034  id_it->WriteAsFasta(os);
1035  string s = CNcbiOstrstreamToString(os);
1036  content += s;
1037  num_ids_found++;
1038  break;
1039  }
1040  }
1041  }
1042  // didn't find any? print them all, but only the first local
1043  if (num_ids_found == 0) {
1044  bool found_local = false;
1045  for (auto id_it : id_list) {
1046  if (id_it->IsLocal()) {
1047  if (found_local) {
1048  continue;
1049  } else {
1050  found_local = true;
1051  }
1052  }
1053  if (id_found) {
1054  content += "|";
1055  }
1056  CNcbiOstrstream os;
1057  id_it->WriteAsFasta(os);
1058  string s = CNcbiOstrstreamToString(os);
1059  content += s;
1060  id_found = true;
1061  }
1062  }
1063 
1064  return content;
1065 }
1066 
1067 
1068 void AppendBioseqLabel(string& str, const CBioseq& sq, bool supress_context)
1069 {
1070  str += "BIOSEQ: ";
1071 
1072  string content = GetBioseqIdLabel (sq);
1073 
1074  if (!supress_context) {
1075  if (!content.empty()) {
1076  content += ": ";
1077  }
1078 
1079  const CEnumeratedTypeValues* tv;
1080  tv = CSeq_inst::GetTypeInfo_enum_ERepr();
1081  const CSeq_inst& inst = sq.GetInst();
1082  content += tv->FindName(inst.GetRepr(), true) + ", ";
1083  tv = CSeq_inst::GetTypeInfo_enum_EMol();
1084  content += tv->FindName(inst.GetMol(), true);
1085  if (inst.IsSetLength()) {
1086  content += string(" len= ") + NStr::IntToString(inst.GetLength());
1087  }
1088  }
1089  str += content;
1090 }
1091 
// Scans str for something shaped like an EC number: four fields separated
// by three periods, where each field is either digits or a single
// ambiguity marker ('-' or 'n'), e.g. "1.2.3.4" or "1.2.3.-".
// Returns true as soon as such a pattern is terminated by any other
// character, or if one is pending when the string ends.
// Blank input needs no special case: whitespace never counts a period, so
// the scan yields false by itself.
bool HasECnumberPattern (const string& str)
{
    bool is_ambig = false;   // an ambiguity marker was seen in this pattern
    int numdashes = 0;       // ambiguity markers in the current field
    int numdigits = 0;       // digits in the current field
    int numperiods = 0;      // periods seen in the current candidate

    for (const char ch : str) {
        // isdigit() is only defined for unsigned char values, so cast first
        // (plain char may be negative for non-ASCII bytes).
        if (isdigit (static_cast<unsigned char>(ch))) {
            numdigits++;
            if (is_ambig) {
                // digits after an ambiguity marker restart the candidate
                is_ambig = false;
                numperiods = 0;
                numdashes = 0;
            }
        } else if (ch == '-' || ch == 'n') {
            numdashes++;
            is_ambig = true;
        } else if (ch == '.') {
            numperiods++;
            // A field that mixes digits with markers, is empty, or holds
            // more than one marker invalidates the candidate so far.
            if ((numdigits > 0 && numdashes > 0)
                || (numdigits == 0 && numdashes == 0)
                || numdashes > 1) {
                is_ambig = false;
                numperiods = 0;
            }
            numdigits = 0;
            numdashes = 0;
        } else {
            // Any other character ends the candidate; accept it if complete.
            if (numperiods == 3
                && !(numdigits > 0 && numdashes > 0)
                && (numdigits > 0 || numdashes == 1)) {
                return true;
            }
            is_ambig = false;
            numperiods = 0;
            numdigits = 0;
            numdashes = 0;
        }
    }

    // The candidate may run to the very end of the string.
    if (numperiods == 3) {
        if (numdigits > 0 && numdashes > 0) {
            return false;
        }
        if (numdigits > 0 || numdashes == 1) {
            return true;
        }
    }
    return false;
}
1157 
1158 
1159 bool SeqIsPatent (const CBioseq& seq)
1160 {
1161  bool is_patent = false;
1162 
1163  // some tests are suppressed if a patent ID is present
1164  FOR_EACH_SEQID_ON_BIOSEQ (id_it, seq) {
1165  if ((*id_it)->IsPatent()) {
1166  is_patent = true;
1167  break;
1168  }
1169  }
1170  return is_patent;
1171 }
1172 
1173 
1174 bool SeqIsPatent (const CBioseq_Handle& seq)
1175 {
1176  return SeqIsPatent (*(seq.GetCompleteBioseq()));
1177 }
1178 
1179 
// NOTE(review): the line with the return type / function name and several
// interior lines are missing from this listing (the first guard condition,
// the declarations of `first`, `last`, `temp`, `slp` and `vec`).
// From the visible code: decides whether the residue just outside one end
// of `loc` is a gap or an 'N'.
//   - `tag` selects the end: eSeqlocPartial_Nostop looks just past `stop`,
//     eSeqlocPartial_Nostart looks just before `start`.
//   - a neighbouring gap position returns true immediately;
//   - with `only_gap` set nothing else is considered;
//   - otherwise a neighbouring 'N' residue gives true.
// Returns false for mixed-strand locations, a null scope, unresolvable
// ids, or when reading the sequence throws.
1181  CScope* scope,
1182  const CSeq_loc& loc,
1183  unsigned int tag,
1184  bool only_gap
1185 )
1186 
1187 {
1189  return false;
1190  }
1191 
// remember the first and last pieces of the location
1193  for ( CSeq_loc_CI sl_iter(loc); sl_iter; ++sl_iter ) { // EQUIV_IS_ONE not supported
1194  if ( !first ) {
1195  first = sl_iter;
1196  }
1197  last = sl_iter;
1198  }
1199 
1200  if ( first.GetStrand() != last.GetStrand() ) {
1201  return false;
1202  }
1204 
1205  if (!scope) {
1206  return false;
1207  }
1208 
1210  if (!slp) {
1211  return false;
1212  }
1213  const CSeq_id* id = slp->GetId();
1214  if (!id) {
1215  return false;
1216  }
1217  CBioseq_Handle bsh = scope->GetBioseqHandle(*id);
1218  if (!bsh) {
1219  return false;
1220  }
1221 
1222  TSeqPos acceptor = temp.GetRange().GetFrom();
1223  TSeqPos donor = temp.GetRange().GetTo();
1224  TSeqPos start = acceptor;
1225  TSeqPos stop = donor;
1226 
1228  temp.GetStrand());
1229  TSeqPos len = vec.size();
1230 
// on the minus strand the ends are swapped and mirrored against `len`
1231  if ( temp.GetStrand() == eNa_strand_minus ) {
1232  swap(acceptor, donor);
1233  stop = len - donor - 1;
1234  start = len - acceptor - 1;
1235  }
1236 
1237  bool result = false;
1238 
// gap check: the position immediately outside the selected end
1239  try {
1240  if (tag == sequence::eSeqlocPartial_Nostop && stop < len - 1 && vec.IsInGap(stop + 1)) {
1241  return true;
1242  } else if (tag == sequence::eSeqlocPartial_Nostart && start > 0 && start < len && vec.IsInGap(start - 1)) {
1243  return true;
1244  }
1245  } catch ( exception& ) {
1246  return false;
1247  }
1248  if (only_gap) {
1249  return false;
1250  }
1251 
// N check: the residue immediately outside the selected end
1252  if ( (tag == sequence::eSeqlocPartial_Nostop) && (stop < len - 2) ) {
1253  try {
1254  CSeqVector::TResidue res = vec[stop + 1];
1255 
1256  if ( IsResidue(res) && isalpha (res)) {
1257  if ( res == 'N' ) {
1258  result = true;
1259  }
1260  }
1261  } catch ( exception& ) {
1262  return false;
1263  }
1264  } else if ( (tag == sequence::eSeqlocPartial_Nostart) && (start > 1) ) {
1265  try {
1266  CSeqVector::TResidue res = vec[start - 1];
1267  if ( IsResidue(res) && isalpha (res)) {
1268  if ( res == 'N' ) {
1269  result = true;
1270  }
1271  }
1272  } catch ( exception& ) {
1273  return false;
1274  }
1275  }
1276 
1277  return result;
1278 }
1279 
1280 
1282 
// NOTE(review): the signature line and the two lines that assign `bsh`
// inside the loop are missing from this listing; `loc` is presumably a
// parameter.
// Walks the pieces of `loc` and returns the first valid Bioseq handle
// obtained; if none is valid, returns whatever `bsh` holds after the loop.
1283 {
1284  CBioseq_Handle bsh;
1285  for ( CSeq_loc_CI citer (loc); citer; ++citer) {
1286  const CSeq_id& id = citer.GetSeq_id();
1289  if (bsh) {
1290  return bsh;
1291  }
1292  }
1293  return bsh;
1294 }
1295 
1296 
1297 bool s_PosIsNNotGap(const CSeqVector& vec, unsigned int pos)
1298 {
1299  if (pos >= vec.size()) {
1300  return false;
1301  } else if (vec[pos] != 'N' && vec[pos] != 'n') {
1302  return false;
1303  } else if (vec.IsInGap(pos)) {
1304  return false;
1305  } else {
1306  return true;
1307  }
1308 }
1309 
1310 
// NOTE(review): the signature line is missing from this listing; `bsh` is
// presumably the parameter.
// Returns true only for a valid handle whose sequence length is at least 10
// and whose topology is not circular.
1312 {
1313  if (!bsh || bsh.GetInst_Length() < 10 || (bsh.IsSetInst_Topology() && bsh.GetInst_Topology() == CSeq_inst::eTopology_circular)) {
1314  return false;
1315  } else {
1316  return true;
1317  }
1318 }
1319 
1320 
// NOTE(review): the line with the return type and function name is missing
// from this listing.
// Examines both ends of `vec` and reports through the output parameters:
//   begin_gap / end_gap     - whether the first / last position is in a gap,
//                             and whether the whole 10-position window is
//                             (eBioseqEndIsType_All) or only part of it
//                             (eBioseqEndIsType_Last);
//   begin_n / end_n         - the same for N bases that are not gaps
//                             (nucleotide vectors only);
//   begin_ambig / end_ambig - true when the end holds 5 or more 'N' within
//                             the first 10 positions, or 15 or more within
//                             the first 50 (nucleotide vectors only).
// All outputs are reset first. Vectors shorter than 10 are not examined.
// Exceptions raised while reading the vector abort the analysis and leave
// the outputs as computed so far.
1322 (const CSeqVector& vec,
1323 EBioseqEndIsType& begin_n,
1324 EBioseqEndIsType& begin_gap,
1325 EBioseqEndIsType& end_n,
1326 EBioseqEndIsType& end_gap,
1327 bool& begin_ambig,
1328 bool& end_ambig)
1329 {
1330  begin_n = eBioseqEndIsType_None;
1331  begin_gap = eBioseqEndIsType_None;
1332  end_n = eBioseqEndIsType_None;
1333  end_gap = eBioseqEndIsType_None;
1334  begin_ambig = false;
1335  end_ambig = false;
1336 
1337  if (vec.size() < 10) {
1338  return;
1339  }
1340 
1341  try {
1342 
1343  // check for gap at begining of sequence
1344  if (vec.IsInGap(0) /* || vec.IsInGap(1) */) {
1345  begin_gap = eBioseqEndIsType_All;
1346  for (int i = 0; i < 10; i++) {
1347  if (!vec.IsInGap(i)) {
1348  begin_gap = eBioseqEndIsType_Last;
1349  break;
1350  }
1351  }
1352  }
1353 
1354  // check for gap at end of sequence
1355  if ( /* vec.IsInGap (vec.size() - 2) || */ vec.IsInGap(vec.size() - 1)) {
1356  end_gap = eBioseqEndIsType_All;
// NOTE(review): this window starts at size() - 11, while the matching
// N-base loop further down starts at size() - 10; for size() == 10 (which
// the guard above allows) the unsigned subtraction wraps around - confirm
// the intended window.
1357  for (unsigned int i = vec.size() - 11; i < vec.size(); i++) {
1358  if (!vec.IsInGap(i)) {
1359  end_gap = eBioseqEndIsType_Last;
1360  break;
1361  }
1362  }
1363  }
1364 
1365  if (vec.IsNucleotide()) {
1366  // check for N bases at beginning of sequence
1367  if (s_PosIsNNotGap(vec, 0) /* || s_PosIsNNotGap(vec, 1) */) {
1368  begin_n = eBioseqEndIsType_All;
1369  for (unsigned int i = 0; i < 10; i++) {
1370  if (!s_PosIsNNotGap(vec, i)) {
1371  begin_n = eBioseqEndIsType_Last;
1372  break;
1373  }
1374  }
1375  }
1376 
1377  // check for N bases at end of sequence
1378  if ( /* s_PosIsNNotGap(vec, vec.size() - 2) || */ s_PosIsNNotGap(vec, vec.size() - 1)) {
1379  end_n = eBioseqEndIsType_All;
1380  for (unsigned int i = vec.size() - 10; i < vec.size(); i++) {
1381  if (!s_PosIsNNotGap(vec, i)) {
1382  end_n = eBioseqEndIsType_Last;
1383  break;
1384  }
1385  }
1386  }
1387 
1388  // check for ambiguous concentration
// at most the first / last 50 positions are inspected
1389  size_t check_len = 50;
1390  if (vec.size() < 50) {
1391  check_len = vec.size();
1392  }
1393  size_t num_ns = 0;
1394  for (size_t i = 0; i < check_len; i++) {
1395  if (vec[i] == 'N') {
1396  num_ns++;
1397  if (num_ns >= 5 && i < 10) {
1398  begin_ambig = true;
1399  break;
1400  } else if (num_ns >= 15) {
1401  begin_ambig = true;
1402  break;
1403  }
1404  }
1405  }
// same scan, counted inwards from the last position
1406  num_ns = 0;
1407  for (size_t i = 0; i < check_len; i++) {
1408  if (vec[vec.size() - i - 1] == 'N') {
1409  num_ns++;
1410  if (num_ns >= 5 && i < 10) {
1411  end_ambig = true;
1412  break;
1413  } else if (num_ns >= 15) {
1414  end_ambig = true;
1415  break;
1416  }
1417  }
1418  }
1419  }
1420  } catch (exception&) {
1421  // if there are exceptions, cannot perform this calculation
1422  }
1423 }
1424 
1425 
1427 (const CBioseq_Handle& bsh,
1428  EBioseqEndIsType& begin_n,
1429  EBioseqEndIsType& begin_gap,
1430  EBioseqEndIsType& end_n,
1431  EBioseqEndIsType& end_gap,
1432  bool& begin_ambig,
1433  bool& end_ambig)
1434 {
1435  begin_n = eBioseqEndIsType_None;
1436  begin_gap = eBioseqEndIsType_None;
1437  end_n = eBioseqEndIsType_None;
1438  end_gap = eBioseqEndIsType_None;
1439  begin_ambig = false;
1440  end_ambig = false;
1441  if (!ShouldCheckForNsAndGap(bsh)) {
1442  return;
1443  }
1444 
1445  try {
1446  // check for gap at begining of sequence
1448  CheckBioseqEndsForNAndGap(vec, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
1449  } catch ( exception& ) {
1450  // if there are exceptions, cannot perform this calculation
1451  }
1452 }
1453 
1454 
1455 bool IsLocFullLength (const CSeq_loc& loc, const CBioseq_Handle& bsh)
1456 {
1457  if (loc.IsInt()
1458  && loc.GetInt().GetFrom() == 0
1459  && loc.GetInt().GetTo() == bsh.GetInst_Length() - 1) {
1460  return true;
1461  } else {
1462  return false;
1463  }
1464 }
1465 
1466 
1467 bool PartialsSame (const CSeq_loc& loc1, const CSeq_loc& loc2)
1468 {
1469  bool loc1_partial_start =
1471  bool loc1_partial_stop =
1473  bool loc2_partial_start =
1475  bool loc2_partial_stop =
1477  if (loc1_partial_start == loc2_partial_start &&
1478  loc1_partial_stop == loc2_partial_stop) {
1479  return true;
1480  } else {
1481  return false;
1482  }
1483 }
1484 
1485 
1486 
1487 
1488 // Code for finding duplicate features
1489 bool s_IsSameStrand(const CSeq_loc& l1, const CSeq_loc& l2, CScope& scope)
1490 {
1491  ENa_strand s1 = sequence::GetStrand(l1, &scope);
1492  ENa_strand s2 = sequence::GetStrand(l2, &scope);
1493  if ((s1 == eNa_strand_minus && s2 == eNa_strand_minus)
1494  || (s1 != eNa_strand_minus && s2 != eNa_strand_minus)) {
1495  return true;
1496  } else {
1497  return false;
1498  }
1499 }
1500 
1501 
1502 inline
1503 bool s_IsSameSeqAnnot(const CSeq_feat_Handle& f1, const CSeq_feat_Handle& f2, bool& diff_descriptions)
1504 {
1505  const auto& annot1 = f1.GetAnnot();
1506  const auto& annot2 = f2.GetAnnot();
1507  bool rval = annot1 == annot2;
1508  diff_descriptions = false;
1509  if (!rval) {
1510  if ((!annot1.Seq_annot_IsSetDesc() || annot1.Seq_annot_GetDesc().Get().empty()) &&
1511  (!annot2.Seq_annot_IsSetDesc() || annot2.Seq_annot_GetDesc().Get().empty())) {
1512  // neither is set
1513  diff_descriptions = false;
1514  } else if (annot1.Seq_annot_IsSetDesc() && annot2.Seq_annot_IsSetDesc()) {
1515  // both are set - are they different?
1516  const auto d1 = annot1.Seq_annot_GetDesc().Get().front();
1517  const auto d2 = annot2.Seq_annot_GetDesc().Get().front();
1518  if (d1->Which() != d2->Which()) {
1519  diff_descriptions = true;
1520  } else {
1521  if (d1->IsName()
1522  && NStr::EqualNocase(d1->GetName(), d2->GetName())) {
1523  diff_descriptions = false;
1524  } else if (d1->IsTitle()
1525  && NStr::EqualNocase(d1->GetTitle(), d2->GetTitle())) {
1526  diff_descriptions = false;
1527  } else {
1528  diff_descriptions = true;
1529  }
1530  }
1531  } else {
1532  diff_descriptions = true;
1533  }
1534  }
1535  return rval;
1536 }
1537 
1538 
1539 bool s_AreGBQualsIdentical(const CSeq_feat_Handle& feat1, const CSeq_feat_Handle& feat2, bool case_sensitive)
1540 {
1541  if (!feat1.IsSetQual() || !feat2.IsSetQual()) {
1542  return true;
1543  }
1544 
1545  bool rval = true;
1546 
1547  CSeq_feat::TQual::const_iterator gb1 = feat1.GetQual().begin();
1548  CSeq_feat::TQual::const_iterator gb1_end = feat1.GetQual().end();
1549  CSeq_feat::TQual::const_iterator gb2 = feat2.GetQual().begin();
1550  CSeq_feat::TQual::const_iterator gb2_end = feat2.GetQual().end();
1551 
1552  while ((gb1 != gb1_end) && (gb2 != gb2_end) && rval) {
1553  if (!(*gb1)->IsSetQual()) {
1554  if ((*gb2)->IsSetQual()) {
1555  rval = false;
1556  }
1557  } else if (!(*gb2)->IsSetQual()) {
1558  rval = false;
1559  } else if (!NStr::Equal ((*gb1)->GetQual(), (*gb2)->GetQual())) {
1560  rval = false;
1561  }
1562  if (rval) {
1563  string v1 = (*gb1)->IsSetVal() ? (*gb1)->GetVal() : "";
1564  string v2 = (*gb2)->IsSetVal() ? (*gb2)->GetVal() : "";
1567  rval = NStr::Equal(v1, v2, case_sensitive ? NStr::eCase : NStr::eNocase);
1568  }
1569  ++gb1;
1570  ++gb2;
1571  }
1572  if (gb1 != gb1_end || gb2 != gb2_end) {
1573  rval = false;
1574  }
1575 
1576  return rval;
1577 }
1578 
1579 
1580 bool s_AreFeatureLabelsSame(const CSeq_feat_Handle& feat, const CSeq_feat_Handle& prev, bool case_sensitive)
1581 {
1582  if (!feat.GetData().Equals(prev.GetData())) {
1583  return false;
1584  }
1585 
1586  // compare labels and comments
1587  bool same_label = true;
1588  const string& curr_comment =
1589  feat.IsSetComment() ? feat.GetComment() : kEmptyStr;
1590  const string& prev_comment =
1591  prev.IsSetComment() ? prev.GetComment() : kEmptyStr;
1592  string curr_label;
1593  string prev_label;
1594 
1595  feature::GetLabel(*(feat.GetSeq_feat()),
1596  &curr_label, feature::fFGL_Content, &(feat.GetScope()));
1597  feature::GetLabel(*(prev.GetSeq_feat()),
1598  &prev_label, feature::fFGL_Content, &(prev.GetScope()));
1599 
1600  bool comments_same = NStr::Equal(curr_comment, prev_comment, case_sensitive ? NStr::eCase : NStr::eNocase);
1601  bool labels_same = NStr::Equal(curr_label, prev_label, case_sensitive ? NStr::eCase : NStr::eNocase);
1602 
1603  if (!comments_same || !labels_same) {
1604  same_label = false;
1605  } else if (!s_AreGBQualsIdentical(feat, prev, case_sensitive)) {
1606  same_label = false;
1607  }
1608  return same_label;
1609 }
1610 
1611 
1612 bool s_IsDifferentDbxrefs(const TDbtags& list1, const TDbtags& list2)
1613 {
1614  if (list1.empty() || list2.empty()) {
1615  return false;
1616  } else if (list1.size() != list2.size()) {
1617  return true;
1618  }
1619 
1620  TDbtags::const_iterator it1 = list1.begin();
1621  TDbtags::const_iterator it2 = list2.begin();
1622  for (; it1 != list1.end(); ++it1, ++it2) {
1623  if (!NStr::EqualNocase((*it1)->GetDb(), (*it2)->GetDb())) {
1624  return true;
1625  }
1626  string str1 =
1627  (*it1)->GetTag().IsStr() ? (*it1)->GetTag().GetStr() : "";
1628  string str2 =
1629  (*it2)->GetTag().IsStr() ? (*it2)->GetTag().GetStr() : "";
1630  if ( str1.empty() && str2.empty() ) {
1631  if (!(*it1)->GetTag().IsId() && !(*it2)->GetTag().IsId()) {
1632  continue;
1633  } else if ((*it1)->GetTag().IsId() && (*it2)->GetTag().IsId()) {
1634  if ((*it1)->GetTag().GetId() != (*it2)->GetTag().GetId()) {
1635  return true;
1636  }
1637  } else {
1638  return true;
1639  }
1640  } else if (!str1.empty() && !str2.empty() && !NStr::EqualNocase(str1, str2)) {
1641  return true;
1642  }
1643  }
1644  return false;
1645 }
1646 
1647 
1649 {
1650  const auto & f1data = f1.GetData();
1651  const auto & f2data = f2.GetData();
1652  if (!f1data.IsCdregion() || !f2data.IsCdregion()) {
1653  return false;
1654  }
1655  const auto & cd1 = f1data.GetCdregion();
1656  const auto & cd2 = f2data.GetCdregion();
1657 
1658  int frame1 = 1, frame2 = 1;
1659  if (cd1.IsSetFrame()) {
1660  frame1 = cd1.GetFrame();
1661  if (frame1 == 0) {
1662  frame1 = 1;
1663  }
1664  }
1665  if (cd2.IsSetFrame()) {
1666  frame2 = cd2.GetFrame();
1667  if (frame2 == 0) {
1668  frame2 = 1;
1669  }
1670  }
1671  if (frame1 == frame2) {
1672  return false;
1673  }
1674 
1676  if (!IsLocFullLength (f1.GetLocation(), bsh1)) {
1677  return false;
1678  }
1680  if (!IsLocFullLength (f2.GetLocation(), bsh2)) {
1681  return false;
1682  }
1683 
1684  return true;
1685 }
1686 
1687 
1688 //LCOV_EXCL_START
1689 // never used, because different variations generate different labels
1691 {
1692  string replace;
1693  ITERATE(CSeq_feat::TQual, q, quals) {
1694  if ((*q)->IsSetQual() && NStr::Equal((*q)->GetQual(), "replace") && (*q)->IsSetVal()) {
1695  if (NStr::IsBlank((*q)->GetVal())) {
1696  replace += " ";
1697  } else {
1698  replace += (*q)->GetVal();
1699  }
1700  replace += ".";
1701  }
1702  }
1703  return replace;
1704 }
1705 
1706 
1708 {
1711  return false;
1712  }
1713  if (!f1.IsSetQual() || !f2.IsSetQual()) {
1714  return false;
1715  }
1716  string replace1 = s_ReplaceListFromQuals(f1.GetQual());
1717  string replace2 = s_ReplaceListFromQuals(f2.GetQual());
1718 
1719  if (!NStr::Equal(replace1, replace2)) {
1720  return true;
1721  } else {
1722  return false;
1723  }
1724 }
1725 //LCOV_EXCL_STOP
1726 
1727 
1728 typedef vector<CConstRef<CObject_id> > TFeatIdVec;
1730 {
1731  bool rval = false;
1732 
1733  if (f1.GetData().GetSubtype() == s1 && f2.GetData().GetSubtype() == s1) {
1734  CScope& scope = f1.GetScope();
1735  const CSeq_loc& loc = f1.GetLocation();
1736  CBioseq_Handle bsh = BioseqHandleFromLocation (&scope, loc);
1737  if (bsh) {
1738  const CTSE_Handle& tse = bsh.GetTSE_Handle();
1739  TFeatIdVec mrna1_id;
1740  TFeatIdVec mrna2_id;
1741  list<CSeq_feat_Handle> mrna1;
1742  list<CSeq_feat_Handle> mrna2;
1743 
1745  if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1746  const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1747  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1748  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1749  if (feat_it->IsSetData()
1750  && feat_it->GetData().GetSubtype() == s2) {
1751  mrna1.push_back(*feat_it);
1752  CConstRef<CObject_id> f(&feat_id);
1753  mrna1_id.push_back (f);
1754  break;
1755  }
1756  }
1757  }
1758  }
1760  if ((*itx)->IsSetId() && (*itx)->GetId().IsLocal()) {
1761  const CObject_id& feat_id = (*itx)->GetId().GetLocal();
1762  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, feat_id);
1763  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1764  if (feat_it->IsSetData()
1765  && feat_it->GetData().GetSubtype() == s2) {
1766  mrna2.push_back(*feat_it);
1767  CConstRef<CObject_id> f(&feat_id);
1768  mrna2_id.push_back (f);
1769  }
1770  }
1771  }
1772  }
1773 
1774  if (mrna1_id.size() > 0 && mrna2_id.size() > 0) {
1775  rval = true;
1776  for (auto i1 = mrna1_id.begin(); i1 != mrna1_id.end(); ++i1) {
1777  for (auto i2 = mrna2_id.begin(); i2 != mrna2_id.end(); ++i2) {
1778  if ((*i1)->Equals(**i2)) {
1779  rval = false;
1780  break;
1781  }
1782  }
1783  if (!rval) {
1784  break;
1785  }
1786  }
1787 
1788  if (rval) { // Check that locations aren't the same
1789  const CSeq_feat_Handle fh1 = mrna1.front();
1790  const CSeq_feat_Handle fh2 = mrna2.front();
1791 
1792 
1793  if (s_IsSameStrand(fh1.GetLocation(),
1794  fh2.GetLocation(),
1795  fh1.GetScope())
1796  && (sequence::Compare(fh1.GetLocation(),
1797  fh2.GetLocation(),
1798  &(fh1.GetScope()),
1800  rval = false;
1801  }
1802  }
1803  }
1804  }
1805  }
1806  return rval;
1807 }
1808 
1809 
1810 
1811 
1813 {
1815 }
1816 
1817 
1819 {
1821 }
1822 
1823 
1825 {
1826  if ( f.GetData().GetSubtype() != CSeqFeatData::eSubtype_gene ) return false;
1827  return IsDicistronic(f);
1828 }
1829 
1830 
1832 {
1833  if (!f.IsSetExcept()) return false;
1834  if (!f.IsSetExcept_text()) return false;
1835 
1836  const string& except_text = f.GetExcept_text();
1837  if (NStr::FindNoCase(except_text, "dicistronic gene") == NPOS) return false;
1838 
1839  return true;
1840 }
1841 
1842 
1845 (const CSeq_feat_Handle& f1,
1846  const CSeq_feat_Handle& f2,
1847  bool check_partials,
1848  bool case_sensitive)
1849 {
1850 
1852 
1853  // subtypes
1854  CSeqFeatData::ESubtype feat1_subtype = f1.GetData().GetSubtype();
1855  CSeqFeatData::ESubtype feat2_subtype = f2.GetData().GetSubtype();
1856 
1857  // not duplicates if not the same subtype
1858  if (feat1_subtype != feat2_subtype) {
1859  return eDuplicate_Not;
1860  }
1861 
1862  // locations
1863  const CSeq_loc& feat1_loc = f1.GetLocation();
1864  const CSeq_loc& feat2_loc = f2.GetLocation();
1865 
1866  // not duplicates if not the same location and strand
1867  if (!s_IsSameStrand(feat1_loc, feat2_loc, f1.GetScope()) ||
1868  sequence::Compare(feat1_loc, feat2_loc, &(f1.GetScope()),
1870  return eDuplicate_Not;
1871  }
1872 
1873  // same annot?
1874  bool diff_annot_desc = false;
1875  bool same_annot = s_IsSameSeqAnnot(f1, f2, diff_annot_desc);
1876 
1877  if (diff_annot_desc) {
1878  // don't report if features on different annots with different titles or names
1879  return eDuplicate_Not;
1880  }
1881 
1882  // compare labels and comments
1883  bool same_label = s_AreFeatureLabelsSame (f1, f2, case_sensitive);
1884 
1885  // compare dbxrefs
1886  bool different_dbxrefs = (f1.IsSetDbxref() && f2.IsSetDbxref() &&
1888 
1889  if ( feat1_subtype == CSeqFeatData::eSubtype_region && different_dbxrefs) {
1890  return eDuplicate_Not;
1891  }
1892 
1893  // check for frame difference
1894  bool full_length_coding_regions_with_different_frames =
1896  if (!same_label && full_length_coding_regions_with_different_frames) {
1897  // do not report if both coding regions are full length, have different products,
1898  // and have different frames
1899  return eDuplicate_Not;
1900  }
1901 
1902  if ((feat1_subtype == CSeqFeatData::eSubtype_variation && !same_label) || s_AreDifferentVariations(f1, f2)) {
1903  // don't report variations if replace quals are different or labels are different
1904  return eDuplicate_Not;
1905  }
1906 
1907 
1909  // do not report if features are coding regions linked to different mRNAs
1910  return eDuplicate_Not;
1911  }
1912 
1913 
1915  // do not report if features are mRNAs linked to different coding regions
1916  return eDuplicate_Not;
1917  }
1918 
1919 
1920  // only report pubs if they have the same label
1921  if (feat1_subtype == CSeqFeatData::eSubtype_pub && !same_label) {
1922  return eDuplicate_Not;
1923  }
1924 
1925  bool partials_ok = (!check_partials || PartialsSame(feat1_loc, feat2_loc));
1926 
1927  if (!partials_ok) {
1928  return eDuplicate_Not;
1929  }
1930 
1931  if ( same_annot ) {
1932  if (same_label) {
1933  dup_type = eDuplicate_Duplicate;
1934  } else {
1936  }
1937  } else {
1938  if (same_label) {
1940  } else if ( feat2_subtype != CSeqFeatData::eSubtype_pub ) {
1942  }
1943  }
1944 
1945  return dup_type;
1946 }
1947 
1948 // specific-host functions
1949 
1950 bool IsCommonName (const CT3Data& data)
1951 {
1952  bool is_common = false;
1953 
1954  if (data.IsSetStatus()) {
1955  ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1956  if ((*status_it)->IsSetProperty()
1957  && NStr::Equal((*status_it)->GetProperty(), "old_name_class", NStr::eNocase)) {
1958  if ((*status_it)->IsSetValue() && (*status_it)->GetValue().IsStr()) {
1959  string value_str = (*status_it)->GetValue().GetStr();
1960  if (NStr::Equal(value_str, "common name", NStr::eCase)
1961  || NStr::Equal(value_str, "genbank common name", NStr::eCase)) {
1962  is_common = true;
1963  break;
1964  }
1965  }
1966  }
1967  }
1968  }
1969  return is_common;
1970 }
1971 
1972 bool HasMisSpellFlag (const CT3Data& data)
1973 {
1974  bool has_misspell_flag = false;
1975 
1976  if (data.IsSetStatus()) {
1977  ITERATE (CT3Reply::TData::TStatus, status_it, data.GetStatus()) {
1978  if ((*status_it)->IsSetProperty()) {
1979  string prop = (*status_it)->GetProperty();
1980  if (NStr::EqualNocase(prop, "misspelled_name")) {
1981  has_misspell_flag = true;
1982  break;
1983  }
1984  }
1985  }
1986  }
1987  return has_misspell_flag;
1988 }
1989 
1990 
1991 bool FindMatchInOrgRef (const string& str, const COrg_ref& org)
1992 {
1993  string match;
1994 
1995  if (NStr::IsBlank(str)) {
1996  // do nothing;
1997  } else if (org.IsSetTaxname() && NStr::EqualNocase(str, org.GetTaxname())) {
1998  match = org.GetTaxname();
1999  } else if (org.IsSetCommon() && NStr::EqualNocase(str, org.GetCommon())) {
2000  match = org.GetCommon();
2001  } else {
2002  FOR_EACH_SYN_ON_ORGREF (syn_it, org) {
2003  if (NStr::EqualNocase(str, *syn_it)) {
2004  match = *syn_it;
2005  break;
2006  }
2007  }
2008  if (NStr::IsBlank(match) && org.IsSetOrgname()) {
2009  const COrgName& orgname = org.GetOrgname();
2010  if (orgname.IsSetMod()) {
2011  for (const auto& mod_it : orgname.GetMod()) {
2012  if (mod_it->IsSetSubtype()
2013  && (mod_it->GetSubtype() == COrgMod::eSubtype_gb_synonym
2014  || mod_it->GetSubtype() == COrgMod::eSubtype_old_name)
2015  && mod_it->IsSetSubname()
2016  && NStr::EqualNocase(str, mod_it->GetSubname())) {
2017  match = mod_it->GetSubname();
2018  break;
2019  }
2020  }
2021  }
2022  }
2023  }
2024  return NStr::EqualCase(str, match);
2025 }
2026 
2027 
// Qualifier words that are replaced by a blank in a specific-host value
// before it is looked up (see the loop over this table).
static const string sIgnoreHostWordList[] = {
    " cf.", " cf ",
    " aff ", " aff.",
    " near",
    " nr.", " nr "
};


// Number of entries in sIgnoreHostWordList.
static const int kNumIgnoreHostWordList =
    static_cast<int>(sizeof(sIgnoreHostWordList) / sizeof(sIgnoreHostWordList[0]));
2040 
// Normalizes a specific-host string in place: every occurrence of an entry
// of sIgnoreHostWordList is replaced by a single space, one further
// ReplaceInPlace pass is applied, and the result is passed to
// NStr::TruncateSpacesInPlace.
2041 void AdjustSpecificHostForTaxServer (string& spec_host)
2042 {
2043  for (int i = 0; i < kNumIgnoreHostWordList; i++) {
2044  NStr::ReplaceInPlace(spec_host, sIgnoreHostWordList[i], " ");
2045  }
// NOTE(review): as rendered here the search and replacement strings are both
// a single space, which would make this call a no-op. Presumably the search
// string is two spaces (collapsing doubled blanks) and the listing lost the
// extra space - confirm against the repository copy.
2046  NStr::ReplaceInPlace(spec_host, " ", " ");
2047  NStr::TruncateSpacesInPlace(spec_host);
2048 }
2049 
2050 
2051 string SpecificHostValueToCheck(const string& val)
2052 {
2053  if (NStr::IsBlank(val)) {
2054  return val;
2055 #if 0
2056  } else if (! isupper (val.c_str()[0])) {
2057  return kEmptyStr;
2058 #endif
2059  }
2060 
2061  string host = val;
2062  // ignore portion after semicolon
2063  size_t pos = NStr::Find(host, ";");
2064  if (pos != string::npos) {
2065  host = host.substr(0, pos);
2066  }
2068  // must have at least two words to check
2069  pos = NStr::Find(host, " "); // combine with next line
2070  if (pos == string::npos) {
2071  return kEmptyStr;
2072  }
2073 
2075  pos = NStr::Find(host, " ");
2076  if (NStr::StartsWith(host.substr(pos + 1), "hybrid ")) {
2077  pos += 7;
2078  } else if (NStr::StartsWith(host.substr(pos + 1), "x ")) {
2079  pos += 2;
2080  }
2081  if (! NStr::StartsWith(host.substr(pos + 1), "sp.")
2082  && ! NStr::StartsWith(host.substr(pos + 1), "(")) {
2083  pos = NStr::Find(host, " ", pos + 1);
2084  if (pos != string::npos) {
2085  host = host.substr(0, pos);
2086  }
2087  } else {
2088  host = host.substr(0, pos);
2089  }
2090  return host;
2091 }
2092 
2093 
2094 string InterpretSpecificHostResult(const string& host, const CT3Reply& reply, const string& orig_host)
2095 {
2096  string err_str;
2097  if (reply.IsError()) {
2098  err_str = "?";
2099  if (reply.GetError().IsSetMessage()) {
2100  err_str = reply.GetError().GetMessage();
2101  }
2102  if(NStr::FindNoCase(err_str, "ambiguous") != string::npos) {
2103  err_str = "Specific host value is ambiguous: " +
2104  (NStr::IsBlank(orig_host) ? host : orig_host);
2105  } else {
2106  err_str = "Invalid value for specific host: " +
2107  (NStr::IsBlank(orig_host) ? host : orig_host);
2108  }
2109  } else if (reply.IsData()) {
2110  const auto& rdata = reply.GetData();
2111  if (HasMisSpellFlag(rdata)) {
2112  err_str = "Specific host value is misspelled: " +
2113  (NStr::IsBlank(orig_host) ? host : orig_host);
2114  } else if (rdata.IsSetOrg()) {
2115  const auto& org = rdata.GetOrg();
2116  if (NStr::StartsWith(org.GetTaxname(), host)) {
2117  // do nothing, all good
2118  } else if (IsCommonName(rdata)) {
2119  // not actionable
2120  } else if (FindMatchInOrgRef(host, org)) {
2121  // replace with synonym
2122  err_str = "Specific host value is alternate name: " +
2123  orig_host + " should be " +
2124  org.GetTaxname();
2125  } else {
2126  err_str = "Specific host value is incorrectly capitalized: " +
2127  (NStr::IsBlank(orig_host) ? host : orig_host);
2128  }
2129  } else {
2130  err_str = "Invalid value for specific host: " +
2131  (NStr::IsBlank(orig_host) ? host : orig_host);
2132  }
2133  }
2134  return err_str;
2135 }
2136 
2137 
2138 bool IsCommon(const COrg_ref& org, const string& val)
2139 {
2140  bool is_common = false;
2141  if (org.IsSetCommon() && NStr::EqualNocase(val, org.GetCommon())) {
2142  // common name, not genus
2143  is_common = true;
2144  } else if (org.IsSetOrgMod()) {
2145  for (auto& it : org.GetOrgname().GetMod()) {
2146  if (it->IsSetSubtype() &&
2147  it->GetSubtype() == COrgMod::eSubtype_common &&
2148  it->IsSetSubname() &&
2149  NStr::EqualNocase(it->GetSubname(), val)) {
2150  is_common = true;
2151  break;
2152  }
2153  }
2154  }
2155  return is_common;
2156 }
2157 
2158 
2159 bool IsLikelyTaxname(const string& val)
2160 {
2161  if (val.empty() || !isalpha(val.front())) {
2162  return false;
2163  }
2164  size_t pos = NStr::Find(val, " ");
2165  if (pos == NPOS) {
2166  return false;
2167  }
2168 
2169  CTaxon1 taxon1;
2170  taxon1.Init();
2171  TTaxId taxid = taxon1.GetTaxIdByName(val.substr(0, pos));
2172  if (taxid == ZERO_TAX_ID || taxid == INVALID_TAX_ID) {
2173  return false;
2174  }
2175 
2176  bool is_species = false;
2177  bool is_uncultured = false;
2178  string blast_name;
2179  CConstRef<COrg_ref> org = taxon1.GetOrgRef(taxid, is_species, is_uncultured, blast_name);
2180  if (org && IsCommon(*org, val.substr(0, pos))) {
2181  return false;
2182  } else {
2183  return true;
2184  }
2185 }
2186 
2187 
2188 //LCOV_EXCL_START
2189 //not used by asnvalidate but used by other applications
2190 bool IsSpecificHostValid(const string& val, string& error_msg)
2191 {
2193  return tval.IsOneSpecificHostValid(val, error_msg);
2194 }
2195 
2196 
2197 string FixSpecificHost(const string& val)
2198 {
2199  string hostfix = val;
2200  validator::CTaxValidationAndCleanup tval;
2201  tval.FixOneSpecificHost(hostfix);
2202 
2203  return hostfix;
2204 }
2205 
2206 
// Maps one character for use in an Entrez search term:
//  - values below 0x02 or above 0x7F are returned unchanged,
//  - letters are lower-cased,
//  - digits and the characters ' / @ ` , are kept,
//  - everything else becomes a space.
static char s_ConvertChar(char ch)
{
    const bool outside_range = (ch < 0x02 || ch > 0x7F);
    if (outside_range) {
        return ch;
    }
    if (isalpha(ch)) {
        return static_cast<char>(tolower(ch));
    }
    if (isdigit(ch)) {
        return ch;
    }

    switch (ch) {
    case '\'':
    case '/':
    case '@':
    case '`':
    case ',':
        // punctuation that is kept as is
        return ch;
    default:
        return 0x20;
    }
}
2226 
2227 
2228 void ConvertToEntrezTerm(string& title)
2229 {
2230  string::iterator s = title.begin();
2231  char p = ' ';
2232  while (s != title.end()) {
2233  *s = s_ConvertChar(*s);
2234  if (isspace(*s) && isspace(p)) {
2235  s = title.erase(s);
2236  }
2237  else {
2238  p = *s;
2239  ++s;
2240  }
2241  }
2243 }
2244 //LCOV_EXCL_STOP
2245 
2246 
2248 {
2249  if (!cdr.IsSetCode()) {
2250  return;
2251  }
2252  const auto& gcode = cdr.GetCode();
2253  CGenetic_code::C_E::TId genCode = 0;
2254  for (auto& it : gcode.Get()) {
2255  if (it->IsId()) {
2256  genCode = it->GetId();
2257  }
2258  }
2259 
2260  if (genCode == 7) {
2261  genCode = 4;
2262  } else if (genCode == 8) {
2263  genCode = 1;
2264  } else if (genCode == 0) {
2265  genCode = 1;
2266  }
2267  cdr.ResetCode();
2269  new_code->SetId(genCode);
2270  cdr.SetCode().Set().push_back(new_code);
2271 }
2272 
2273 
2274 string TranslateCodingRegionForValidation(const CSeq_feat& feat, CScope &scope, bool& alt_start)
2275 {
2276  string transl_prot;
2277  CRef<CSeq_feat> tmp_cds(new CSeq_feat());
2278  tmp_cds->Assign(feat);
2279  FixGeneticCode(tmp_cds->SetData().SetCdregion());
2280  const CCdregion& cdregion = tmp_cds->GetData().GetCdregion();
2281  const CSeq_loc& cds_loc = tmp_cds->GetLocation();
2282  if (cds_loc.IsWhole()) {
2283  CBioseq_Handle bsh = scope.GetBioseqHandle(cds_loc.GetWhole());
2284  if (!bsh) {
2285  return kEmptyStr;
2286  }
2287  size_t start = 0;
2288  if (cdregion.IsSetFrame()) {
2289  if (cdregion.GetFrame() == 2) {
2290  start = 1;
2291  } else if (cdregion.GetFrame() == 3) {
2292  start = 2;
2293  }
2294  }
2295  const CGenetic_code* genetic_code = nullptr;
2296  if (cdregion.IsSetCode()) {
2297  genetic_code = &(cdregion.GetCode());
2298  }
2299  CRef<CSeq_id> id(new CSeq_id());
2300  id->Assign(cds_loc.GetWhole());
2301  CRef<CSeq_loc> tmp(new CSeq_loc(*id, start, bsh.GetInst_Length() - 1));
2302  CSeqTranslator::Translate(*tmp, scope, transl_prot, genetic_code, true, false, &alt_start);
2303  } else {
2304  CSeqTranslator::Translate(*tmp_cds, scope, transl_prot,
2305  true, // include stop codons
2306  false, // do not remove trailing X/B/Z
2307  &alt_start);
2308  }
2309 
2310  return transl_prot;
2311 }
2312 
2313 
2314 bool HasBadStartCodon(const CSeq_loc& loc, const string& transl_prot)
2315 {
2316  bool got_dash = (transl_prot[0] == '-');
2317  bool got_x = (transl_prot[0] == 'X'
2319 
2320  if (!got_dash && !got_x) {
2321  return false;
2322  }
2323  return true;
2324 }
2325 
2326 
2327 static const char * kUnclassifiedTranslationDiscrepancy = "unclassified translation discrepancy";
2328 
2329 static const char* const sc_BypassCdsTransCheckText[] = {
2330  "RNA editing",
2331  "adjusted for low-quality genome",
2332  "annotated by transcript or proteomic data",
2333  "rearrangement required for product",
2334  "reasons given in citation",
2335  "translated product replaced",
2337 };
2340 
2341 static const char* const sc_ForceCdsTransCheckText[] = {
2342  "artificial frameshift",
2343  "mismatches in translation"
2344 };
2347 
2348 bool ReportTranslationErrors(const string& except_text)
2349 {
2350  bool report = true;
2351  ITERATE(TBypassCdsTransCheckSet, it, sc_BypassCdsTransCheck) {
2352  if (NStr::FindNoCase(except_text, *it) != NPOS) {
2353  report = false;
2354  }
2355  }
2356  if (!report) {
2357  ITERATE(TForceCdsTransCheckSet, it, sc_ForceCdsTransCheck) {
2358  if (NStr::FindNoCase(except_text, *it) != NPOS) {
2359  report = true;
2360  }
2361  }
2362  }
2363  return report;
2364 }
2365 
2366 
2367 //LCOV_EXCL_START
2368 //not used by asnvalidate but used by other applications
2369 bool HasBadStartCodon(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2370 {
2371  if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2372  return false;
2373  }
2374  // do not validate for pseudo gene
2375  FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2376  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2377  return false;
2378  }
2379  }
2380 
2381  if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2382  feat.CanGetExcept_text()) {
2383  if (!ReportTranslationErrors(feat.GetExcept_text())) {
2384  return false;
2385  }
2386  }
2387 
2388  bool alt_start = false;
2389  string transl_prot;
2390  try {
2391  transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2392  } catch (CException& ) {
2393  return false;
2394  }
2395  return HasBadStartCodon(feat.GetLocation(), transl_prot);
2396 }
2397 //LCOV_EXCL_STOP
2398 
2399 
2400 size_t CountInternalStopCodons(const string& transl_prot)
2401 {
2402  if (NStr::IsBlank(transl_prot)) {
2403  return 0;
2404  }
2405  // count internal stops and Xs
2406  size_t internal_stop_count = 0;
2407 
2408  ITERATE(string, it, transl_prot) {
2409  if (*it == '*') {
2410  ++internal_stop_count;
2411  }
2412  }
2413  // if stop at end, reduce count by one (since one of the stops counted isn't internal)
2414  if (transl_prot[transl_prot.length() - 1] == '*') {
2415  --internal_stop_count;
2416  }
2417  return internal_stop_count;
2418 }
2419 
2420 
2421 //LCOV_EXCL_START
2422 //not used by asnvalidate but used by other applications
2423 bool HasInternalStop(const CSeq_feat& feat, CScope& scope, bool ignore_exceptions)
2424 {
2425  if (!feat.IsSetData() || !feat.GetData().IsCdregion()) {
2426  return false;
2427  }
2428  // do not validate for pseudo gene
2429  FOR_EACH_GBQUAL_ON_FEATURE(it, feat) {
2430  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudo")) {
2431  return false;
2432  }
2433  }
2434 
2435  if (!ignore_exceptions && feat.CanGetExcept() && feat.GetExcept() &&
2436  feat.CanGetExcept_text()) {
2437  const string& except_text = feat.GetExcept_text();
2438  if (NStr::Find(except_text, kUnclassifiedTranslationDiscrepancy) == string::npos
2440  return false;
2441  }
2442  }
2443 
2444  bool alt_start = false;
2445  string transl_prot;
2446  try {
2447  transl_prot = TranslateCodingRegionForValidation(feat, scope, alt_start);
2448  } catch (CException& ) {
2449  return false;
2450  }
2451 
2452  size_t internal_stop_codons = CountInternalStopCodons(transl_prot);
2453  if (internal_stop_codons > 0) {
2454  return true;
2455  } else {
2456  return false;
2457  }
2458 }
2459 //LCOV_EXCL_STOP
2460 
2461 
2463 {
2465  CSeq_data::E_Choice seqtyp = bsh.GetInst().IsSetSeq_data() ?
2467  if (seqtyp == CSeq_data::e_Ncbieaa || seqtyp == CSeq_data::e_Ncbistdaa) {
2469  }
2470  return sv;
2471 }
2472 
2473 
2475 {
2476  if (sv.size() < 1) {
2477  return false;
2478  } else if (sv.IsInGap(0) || sv[0] == '-') {
2479  return true;
2480  } else {
2481  return false;
2482  }
2483 }
2484 
2485 
2486 //LCOV_EXCL_START
2487 //not used by asnvalidate but used by other applications
2488 bool HasBadProteinStart(const CSeq_feat& feat, CScope& scope)
2489 {
2490  if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2491  !feat.IsSetProduct()) {
2492  return false;
2493  }
2494  // use try catch for those weird situations where the product is
2495  // not specified as a single product sequence (in which case we
2496  // should just skip this test)
2497  try {
2498  CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2499  if (!bsh.IsAa()) {
2500  return false;
2501  }
2503  return HasBadProteinStart(*sv);
2504  } catch (CException& ) {
2505  return false;
2506  }
2507 }
2508 //LCOV_EXCL_STOP
2509 
2510 
2512 {
2513  size_t terminations = 0;
2514 
2515  for (CSeqVector_CI sv_iter(sv); (sv_iter); ++sv_iter) {
2516  if (*sv_iter == '*') {
2517  terminations++;
2518  }
2519  }
2520  return terminations;
2521 }
2522 
2523 
2524 //LCOV_EXCL_START
2525 //not used by asnvalidate but used by other applications
// Returns true when CountProteinStops reports at least one stop in the protein
// product of coding-region feature `feat`.
// Returns false when `feat` is not a coding region, has no product, the
// product is not a protein, or a CException is thrown while resolving it.
2526 bool HasStopInProtein(const CSeq_feat& feat, CScope& scope)
2527 {
2528  if (!feat.IsSetData() || !feat.GetData().IsCdregion() ||
2529  !feat.IsSetProduct()) {
2530  return false;
2531  }
2532  // use try catch for those weird situations where the product is
2533  // not specified as a single product sequence (in which case we
2534  // should just skip this test)
2535  try {
2536  CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetProduct());
2537  if (!bsh.IsAa()) {
2538  return false;
2539  }
// NOTE(review): listing line 2540, which must declare `sv`, is missing here.
2541  if (CountProteinStops(*sv) > 0) {
2542  return true;
2543  } else {
2544  return false;
2545  }
2546  } catch (CException& ) {
2547  return false;
2548  }
2549 }
2550 //LCOV_EXCL_STOP
2551 
2552 
2553 void FeatureHasEnds(const CSeq_feat& feat, CScope* scope, bool& no_beg, bool& no_end)
2554 {
2555  unsigned int part_loc = sequence::SeqLocPartialCheck(feat.GetLocation(), scope);
2556  no_beg = false;
2557  no_end = false;
2558 
2559  if (part_loc & sequence::eSeqlocPartial_Start) {
2560  no_beg = true;
2561  }
2562  if (part_loc & sequence::eSeqlocPartial_Stop) {
2563  no_end = true;
2564  }
2565 
2566 
2567  if ((!no_beg || !no_end) && feat.IsSetProduct()) {
2568  unsigned int part_prod = sequence::SeqLocPartialCheck(feat.GetProduct(), scope);
2569  if (part_prod & sequence::eSeqlocPartial_Start) {
2570  no_beg = true;
2571  }
2572  if (part_prod & sequence::eSeqlocPartial_Stop) {
2573  no_end = true;
2574  }
2575  }
2576 }
2577 
2578 
2579 //LCOV_EXCL_START
2580 // not used by asnvalidate but needed for other applications
2581 CBioseq_Handle GetCDSProductSequence(const CSeq_feat& feat, CScope* scope, const CTSE_Handle & tse, bool far_fetch, bool& is_far)
2582 {
2583  CBioseq_Handle prot_handle;
2584  is_far = false;
2585  if (!feat.IsSetProduct()) {
2586  return prot_handle;
2587  }
2588  const CSeq_id* protid = nullptr;
2589  try {
2590  protid = &sequence::GetId(feat.GetProduct(), scope);
2591  } catch (CException&) {}
2592  if (protid) {
2593  prot_handle = scope->GetBioseqHandleFromTSE(*protid, tse);
2594  if (!prot_handle && far_fetch) {
2595  prot_handle = scope->GetBioseqHandle(*protid);
2596  is_far = true;
2597  }
2598  }
2599  return prot_handle;
2600 }
2601 //LCOV_EXCL_STOP
2602 
2603 
2604 void CalculateEffectiveTranslationLengths(const string& transl_prot, const CSeqVector& prot_vec, size_t &len, size_t& prot_len)
2605 {
2606  len = transl_prot.length();
2607  prot_len = prot_vec.size();
2608 
2609  if (NStr::EndsWith(transl_prot, "*") && (len == prot_len + 1)) { // ok, got stop
2610  --len;
2611  }
2612  while (len > 0) {
2613  if (transl_prot[len - 1] == 'X') { //remove terminal X
2614  --len;
2615  } else {
2616  break;
2617  }
2618  }
2619 
2620  // ignore terminal 'X' from partial last codon if present
2621  while (prot_len > 0) {
2622  if (prot_vec[prot_len - 1] == 'X') { //remove terminal X
2623  --prot_len;
2624  } else {
2625  break;
2626  }
2627  }
2628 }
2629 
2630 
2631 //LCOV_EXCL_START
2632 // not used by asnvalidate but needed for other applications
2633 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CSeqVector& prot_vec, const string& transl_prot)
2634 {
2635  vector<TSeqPos> mismatches;
2636  size_t prot_len;
2637  size_t len;
2638 
2639  CalculateEffectiveTranslationLengths(transl_prot, prot_vec, len, prot_len);
2640 
2641  if (len == prot_len) { // could be identical
2642  for (TSeqPos i = 0; i < len; ++i) {
2643  CSeqVectorTypes::TResidue p_res = prot_vec[i];
2644  CSeqVectorTypes::TResidue t_res = transl_prot[i];
2645 
2646  if (t_res != p_res) {
2647  if (i == 0) {
2648  bool no_beg, no_end;
2649  FeatureHasEnds(feat, &(prot_vec.GetScope()), no_beg, no_end);
2650  if (feat.IsSetPartial() && feat.GetPartial() && (!no_beg) && (!no_end)) {
2651  } else if (t_res == '-') {
2652  } else {
2653  mismatches.push_back(i);
2654  }
2655  } else {
2656  mismatches.push_back(i);
2657  }
2658  }
2659  }
2660  }
2661  return mismatches;
2662 }
2663 
2664 
2665 vector<TSeqPos> GetMismatches(const CSeq_feat& feat, const CBioseq_Handle& prot_handle, const string& transl_prot)
2666 {
2667  vector<TSeqPos> mismatches;
2668  // can't check for mismatches unless there is a product
2669  if (!prot_handle || !prot_handle.IsAa()) {
2670  return mismatches;
2671  }
2672 
2673  CSeqVector prot_vec = prot_handle.GetSeqVector();
2674  prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
2675 
2676  return GetMismatches(feat, prot_vec, transl_prot);
2677 }
2678 
2679 
2680 bool HasNoStop(const CSeq_feat& feat, CScope* scope)
2681 {
2682  bool no_beg, no_end;
2683  FeatureHasEnds(feat, scope, no_beg, no_end);
2684  if (no_end) {
2685  return false;
2686  }
2687 
2688  string transl_prot;
2689  bool alt_start;
2690  try {
2691  transl_prot = TranslateCodingRegionForValidation(feat, *scope, alt_start);
2692  } catch (CException& ) {
2693  }
2694  if (NStr::EndsWith(transl_prot, "*")) {
2695  return false;
2696  }
2697 
2698  bool show_stop = true;
2699  if (!no_beg && feat.IsSetPartial() && feat.GetPartial()) {
2700  CBioseq_Handle prot_handle;
2701  try {
2702  CBioseq_Handle bsh = scope->GetBioseqHandle(feat.GetLocation());
2703  const CTSE_Handle tse = bsh.GetTSE_Handle();
2704  bool is_far = false;
2705  prot_handle = GetCDSProductSequence(feat, scope, tse, true, is_far);
2706  if (prot_handle) {
2707  vector<TSeqPos> mismatches = GetMismatches(feat, prot_handle, transl_prot);
2708  if (mismatches.size() == 0) {
2709  show_stop = false;
2710  }
2711  }
2712  } catch (CException& ) {
2713  }
2714  }
2715 
2716  return show_stop;
2717 }
2718 //LCOV_EXCL_STOP
2719 
2720 
// Reports whether the sequence identified by `id` can be fetched.
// With a caller-supplied scope the answer is "GetIds returned at least one id";
// without one, a separate scope (`scopex`) with default data loaders is used
// and the answer is "GetBioseqHandle returned a valid handle".
// Any CException or std::exception is swallowed and results in false.
// NOTE(review): listing lines 2726, 2732 and 2733, which must declare `idh`
// and `scopex`, are missing from this copy.
2721 bool IsSequenceFetchable(const CSeq_id& id, CScope* scope)
2722 {
2723  bool fetchable = false;
2724  try {
2725  if (scope) {
2727  CScope::TIds ids = scope->GetIds(idh);
2728  if (ids.size() > 0) {
2729  fetchable = true;
2730  }
2731  } else {
2734  scopex->AddDefaults();
2735  CBioseq_Handle bsh = scopex->GetBioseqHandle(idh);
2736  if (bsh) {
2737  fetchable = true;
2738  }
2739  }
2740  } catch (CException& ) {
2741  } catch (std::exception &) {
2742  }
2743  return fetchable;
2744 }
2745 
2746 
2747 bool IsSequenceFetchable(const string& seq_id, CScope* scope)
2748 {
2749  bool fetchable = false;
2750  try {
2751  CRef<CSeq_id> id(new CSeq_id(seq_id));
2752  if (id) {
2753  fetchable = IsSequenceFetchable(*id, scope);
2754  }
2755  } catch (CException& ) {
2756  } catch (std::exception &) {
2757  }
2758  return fetchable;
2759 }
2760 
2761 
2762 bool IsNTNCNWACAccession(const string& acc)
2763 {
2764  if (NStr::StartsWith(acc, "NT_") || NStr::StartsWith(acc, "NC_") ||
2765  NStr::StartsWith(acc, "AC_") || NStr::StartsWith(acc, "NW_")) {
2766  return true;
2767  } else {
2768  return false;
2769  }
2770 }
2771 
2772 
// Seq-id variant of the accession check: true only when `id` is of the
// "other" choice, has an accession set, and that accession passes
// IsNTNCNWACAccession(string).
// NOTE(review): the signature (listing line 2773) is missing from this copy.
2774 {
2775  if (id.IsOther() && id.GetOther().IsSetAccession() &&
2776  IsNTNCNWACAccession(id.GetOther().GetAccession())) {
2777  return true;
2778  } else {
2779  return false;
2780  }
2781 }
2782 
2783 
// Bioseq variant of the accession check: true as soon as one Seq-id of `seq`
// passes IsNTNCNWACAccession; false if none does.
// NOTE(review): the signature (listing line 2784) is missing from this copy.
2785 {
2786  bool is_it = false;
2787  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2788  if (IsNTNCNWACAccession(**id_it)) {
2789  is_it = true;
2790  break;
2791  }
2792  }
2793  return is_it;
2794 }
2795 
2796 
2797 bool IsNG(const CSeq_id& id)
2798 {
2799  if (id.IsOther() && id.GetOther().IsSetAccession() &&
2800  NStr::StartsWith(id.GetOther().GetAccession(), "NG_")) {
2801  return true;
2802  } else {
2803  return false;
2804  }
2805 }
2806 
2807 
2808 bool IsNG(const CBioseq& seq)
2809 {
2810  bool is_it = false;
2811  FOR_EACH_SEQID_ON_BIOSEQ(id_it, seq) {
2812  if (IsNG(**id_it)) {
2813  is_it = true;
2814  break;
2815  }
2816  }
2817  return is_it;
2818 }
2819 
2820 
2821 // See VR-728. These Seq-ids are temporary and will be stripped
2822 // by the ID Load process, so they should not be the only Seq-id
2823 // on a Bioseq, and feature locations should not use these.
2824 bool IsTemporary(const CSeq_id& id)
2825 {
2826  if (id.IsGeneral() && id.GetGeneral().IsSetDb()) {
2827  const string& db = id.GetGeneral().GetDb();
2828  if (NStr::EqualNocase(db, "TMSMART") ||
2829  NStr::EqualNocase(db, "NCBIFILE") ||
2830  NStr::EqualNocase(db, "BankIt")) {
2831  return true;
2832  }
2833  }
2834  return false;
2835 }
2836 
2837 
// Maps a genome value to "is an organelle": true for the values listed in the
// switch, false for everything else.
// NOTE(review): the case labels (listing lines 2842-2853) are missing from
// this copy, so the set of organelle genome values is not visible here.
2838 bool IsOrganelle(int genome)
2839 {
2840  bool rval = false;
2841  switch (genome) {
2854  rval = true;
2855  break;
2856  default:
2857  rval = false;
2858  break;
2859  }
2860  return rval;
2861 }
2862 
2863 
// Handle overload: false for an invalid handle; otherwise true when `sd`
// yields a source with a genome set and IsOrganelle(int) accepts that genome.
2864 bool IsOrganelle(const CBioseq_Handle& seq)
2865 {
2866  if (!seq) {
2867  return false;
2868  }
2869  bool rval = false;
// NOTE(review): listing line 2870, which must declare `sd`, is missing here.
2871  if (sd && sd->GetSource().IsSetGenome() && IsOrganelle(sd->GetSource().GetGenome())) {
2872  rval = true;
2873  }
2874  return rval;
2875 }
2876 
2877 
2879 
// True when `ch` occurs in "ANRMWHVD" (presumably the IUPAC nucleotide codes
// that can stand for A - TODO confirm).
// NOTE(review): strchr also matches the string terminator, so ch == '\0'
// yields true. The signature (listing line 2878) is missing from this copy.
2880 {
2881  return (bool)(strchr("ANRMWHVD", ch) != NULL);
2882 }
2883 
2885 
// True when `ch` occurs in "CNYMSHBV" (presumably the IUPAC nucleotide codes
// that can stand for C - TODO confirm).
// NOTE(review): strchr also matches the string terminator, so ch == '\0'
// yields true. The signature (listing line 2884) is missing from this copy.
2886 {
2887  return (bool)(strchr("CNYMSHBV", ch) != NULL);
2888 }
2889 
2891 
// True when `ch` occurs in "GNRKSBVD" (presumably the IUPAC nucleotide codes
// that can stand for G - TODO confirm).
// NOTE(review): strchr also matches the string terminator, so ch == '\0'
// yields true. The signature (listing line 2890) is missing from this copy.
2892 {
2893  return (bool)(strchr("GNRKSBVD", ch) != NULL);
2894 }
2895 
2897 
// True when `ch` occurs in "TNYKWHBD" (presumably the IUPAC nucleotide codes
// that can stand for T - TODO confirm).
// NOTE(review): strchr also matches the string terminator, so ch == '\0'
// yields true. The signature (listing line 2896) is missing from this copy.
2898 {
2899  return (bool)(strchr("TNYKWHBD", ch) != NULL);
2900 }
2901 
2902 
2903 //LCOV_EXCL_START
2904 //not used by validator, but used by Genome Workbench menu item for
2905 //removing unnecessary exceptions
// Runs CCDSTranslationProblems::CalculateTranslationProblems on `feat` with
// every optional flag switched off, using the product handle from `scope`
// when a product is set (an empty handle otherwise).
// NOTE(review): listing line 2929, which must hold the return statement, is
// missing from this copy, so the exact test applied to `problems` is not
// visible here.
2906 bool DoesCodingRegionHaveUnnecessaryException(const CSeq_feat& feat, const CBioseq_Handle& loc_handle, CScope& scope)
2907 {
2908  CCDSTranslationProblems problems;
2909  CBioseq_Handle prot_handle;
2910  if (feat.IsSetProduct()) {
2911  prot_handle = scope.GetBioseqHandle(feat.GetProduct());
2912  }
2913 
2914  problems.CalculateTranslationProblems(feat,
2915  loc_handle,
2916  prot_handle,
2917  false, // ignore_exceptions
2918  false, // far_fetch_cds
2919  false, // standalone_annot
2920  false, // single_seq
2921  false, // is_gpipe
2922  false, // is_genomic
2923  false, // is_refseq
2924  false, // is_nt_or_ng_or_nw
2925  false, // is_nc
2926  false, // has_accession
2927  &scope);
2928 
2930 }
2931 
2932 
// Calls GetMRNATranslationProblems for `feat` against `nuc` and the product
// handle `rna` (looked up only when a product is set) and returns whether the
// eMRNAProblem_UnnecessaryException bit is present in the result.
// NOTE(review): the signature (listing line 2933) and line 2936, which must
// declare `rna`, are missing from this copy.
2934 {
2935  size_t mismatches = 0;
2937  if (feat.IsSetProduct()) {
2938  rna = scope.GetBioseqHandle(feat.GetProduct());
2939  }
2940 
2941  size_t problems = GetMRNATranslationProblems
2942  (feat, mismatches, false,
2943  nuc, rna, false, false, false, &scope);
2944 
2945  return (problems & eMRNAProblem_UnnecessaryException);
2946 }
2947 
2948 
// Decides whether the exception text on `feat` is unnecessary.
// Returns false unless exception text, data and location are all set and the
// location resolves to a Bioseq. It then returns true when the splice check
// reports the exception as unnecessary; otherwise it defers to the coding
// region or mRNA specific check, and returns false for any other feature type
// or when a CException is thrown.
// NOTE(review): the signature (listing line 2949) is missing from this copy.
2950 {
2951  if (!feat.IsSetExcept_text()) {
2952  return false;
2953  }
2954  if (!feat.IsSetData()) {
2955  return false;
2956  }
2957  if (!feat.IsSetLocation()) {
2958  return false;
2959  }
2960  try {
2961  CBioseq_Handle bsh = scope.GetBioseqHandle(feat.GetLocation());
2962  if (!bsh) {
2963  return false;
2964  }
2965  CSpliceProblems splice_problems;
// Second argument is check_all; third is the feature's pseudo status.
2966  splice_problems.CalculateSpliceProblems(feat, true, sequence::IsPseudo(feat, scope), bsh);
2967  if (splice_problems.IsExceptionUnnecessary()) {
2968  return true;
2969  }
2970  if (feat.GetData().IsCdregion()) {
2971  return DoesCodingRegionHaveUnnecessaryException(feat, bsh, scope);
2972  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
2973  return DoesmRNAHaveUnnecessaryException(feat, bsh, scope);
2974  } else {
2975  return false;
2976  }
2977  } catch (CException&) {
2978  }
2979  return false;
2980 }
2981 //LCOV_EXCL_STOP
2982 
2983 static bool s_IsGenbankMasterAccession(const string& acc)
2984 {
2985  bool rval = false;
2986  switch (acc.length()) {
2987  case 12:
2988  if (NStr::EndsWith(acc, "000000")) {
2989  rval = true;
2990  }
2991  break;
2992  case 13:
2993  if (NStr::EndsWith(acc, "0000000")) {
2994  rval = true;
2995  }
2996  break;
2997  case 14:
2998  if (NStr::EndsWith(acc, "00000000")) {
2999  rval = true;
3000  }
3001  break;
3002  default:
3003  break;
3004  }
3005  return rval;
3006 }
3007 
3008 
// Tests whether the accession on `id` looks like a master accession.
// For e_Other ids: length 15 ending in six zeros, or length 16/17 ending in
// seven zeros. For Genbank, Ddbj, Embl and Tpg ids the decision is delegated
// to s_IsGenbankMasterAccession. Ids without an accession, and all other id
// types, yield false.
// NOTE(review): the signature (listing line 3009) is missing from this copy.
3010 {
3011  bool rval = false;
3012  switch (id.Which()) {
3013  case CSeq_id::e_Other:
3014  if (id.GetOther().IsSetAccession()) {
3015  const string& acc = id.GetOther().GetAccession();
3016  switch (acc.length()) {
3017  case 15:
3018  if (NStr::EndsWith(acc, "000000")) {
3019  rval = true;
3020  }
3021  break;
// lengths 16 and 17 share the seven-zero test
3022  case 16:
3023  case 17:
3024  if (NStr::EndsWith(acc, "0000000")) {
3025  rval = true;
3026  }
3027  break;
3028  default:
3029  break;
3030  }
3031  }
3032  break;
3033  case CSeq_id::e_Genbank:
3034  if (id.GetGenbank().IsSetAccession()) {
3035  rval = s_IsGenbankMasterAccession(id.GetGenbank().GetAccession());
3036  }
3037  break;
3038  case CSeq_id::e_Ddbj:
3039  if (id.GetDdbj().IsSetAccession()) {
3040  rval = s_IsGenbankMasterAccession(id.GetDdbj().GetAccession());
3041  }
3042  break;
3043  case CSeq_id::e_Embl:
3044  if (id.GetEmbl().IsSetAccession()) {
3045  rval = s_IsGenbankMasterAccession(id.GetEmbl().GetAccession());
3046  }
3047  break;
3048  case CSeq_id::e_Tpg:
3049  if (id.GetTpg().IsSetAccession()) {
3050  rval = s_IsGenbankMasterAccession(id.GetTpg().GetAccession());
3051  }
3052  break;
3053  default:
3054  break;
3055  }
3056 
3057  return rval;
3058 }
3059 
// Reports whether the Bioseq for `id` is an organelle sequence that sits in a
// small-genome set. Returns false when the Bioseq cannot be fetched or has no
// source descriptor with an organelle genome. Otherwise the enclosing sets are
// walked upwards: a small_genome_set gives true, a nuc_prot set moves on to
// its parent, and anything else (or an unset class) stops the walk with false.
// NOTE(review): the signature (listing line 3060) and line 3072, which must
// declare `set`, are missing from this copy.
3061 {
3062  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
3063  if (!bsh) {
3064  // can't fetch bioseq, can't tell, assume not
3065  return false;
3066  }
3067  CSeqdesc_CI src(bsh, CSeqdesc::e_Source);
3068  if (!src || !src->GetSource().IsSetGenome() || !IsOrganelle(src->GetSource().GetGenome())) {
3069  // not an organelle location
3070  return false;
3071  }
3073  while (set) {
3074  if (!set.IsSetClass()) {
3075  // class not set - quit
3076  break;
3077  } else if (set.GetClass() == CBioseq_set::eClass_small_genome_set) {
3078  return true;
3079  } else if (set.GetClass() == CBioseq_set::eClass_nuc_prot) {
3080  // look at parent
3081  set = set.GetParentBioseq_set();
3082  } else {
3083  break;
3084  }
3085  }
3086  return false;
3087 }
3088 
3089 
// Checks a location that may span several sequences. The id of the first
// interval is the reference; every later interval is compared against it.
// Returns true when the first interval is in an organelle small-genome set
// but a later one is not, or when the first is not in such a set and a later
// interval refers to a different Bioseq. Returns false otherwise.
// NOTE(review): the signature (listing line 3090) is missing from this copy;
// the first interval is read without checking that the iterator is valid.
3091 {
3092  CSeq_loc_CI lit(loc);
3093  const CSeq_id& id1 = lit.GetSeq_id();
3094 
3095  bool in_organelle_small_genome_set = IsInOrganelleSmallGenomeSet(id1, scope);
3096 
3097  ++lit;
3098  while (lit) {
3099  const CSeq_id& id2 = lit.GetSeq_id();
3100  if (in_organelle_small_genome_set && !IsInOrganelleSmallGenomeSet(id2, scope)) {
3101  // if one sequence in small genome set and other not, this is bad
3102  return true;
3103  }
3104  if (!id2.Match(id1) && !sequence::IsSameBioseq(id1, id2, &scope) && !in_organelle_small_genome_set) {
3105  return true;
3106  }
3107  ++lit;
3108  }
3109  return false;
3110 }
3111 
3112 
3113 
3114 END_SCOPE(validator)
static CRef< CScope > m_Scope
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
#define bool
Definition: bool.h:34
CAlign_CI –.
Definition: align_ci.hpp:63
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CBioseq_set_Handle –.
void CalculateTranslationProblems(const CSeq_feat &feat, CBioseq_Handle loc_handle, CBioseq_Handle prot_handle, bool ignore_exceptions, bool far_fetch_cds, bool standalone_annot, bool single_seq, bool is_gpipe, bool is_genomic, bool is_refseq, bool is_nt_or_ng_or_nw, bool is_nc, bool has_accession, CScope *scope)
size_t GetTranslationProblemFlags() const
CCdregion –.
Definition: Cdregion.hpp:66
Definition: Date.hpp:53
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Definition: sequence.hpp:770
CFeat_id –.
Definition: Feat_id.hpp:66
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
int Compare(const CObject_id &oid2) const
Definition: Object_id.cpp:145
bool IsSetOrgMod(void) const
Definition: Org_ref.cpp:169
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
CSeqVector –.
Definition: seq_vector.hpp:65
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:593
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
void CalculateSpliceProblems(const CSeq_feat &feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle)
bool IsExceptionUnnecessary() const
CT3Reply –.
Definition: T3Reply.hpp:66
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:604
bool IsOneSpecificHostValid(const string &val, string &err_msg)
CConstRef< COrg_ref > GetOrgRef(TTaxId tax_id, bool &is_species, bool &is_uncultured, string &blast_name, bool *is_specified=NULL)
Definition: taxon1.cpp:704
TTaxId GetTaxIdByName(const string &orgname)
Definition: taxon1.cpp:523
bool Init(void)
Definition: taxon1.cpp:101
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
Definition: map.hpp:338
Definition: set.hpp:45
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static uch flags
CS_CONTEXT * ctx
Definition: t0006.c:12
static const char si[8][64]
Definition: des.c:146
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define ZERO_GI
Definition: ncbimisc.hpp:1088
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
const string & FindName(TEnumValueType value, bool allowBadValue) const
Find name of the enum by its numeric value.
Definition: enumerated.cpp:146
const CVect2< U > & v2
Definition: globals.hpp:440
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string GetLabel(const CSeq_id &id)
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
Definition: Seq_id.hpp:575
CRef< CSeq_loc > MakeSeq_loc(EMakeType make_type=eMake_CompactType) const
return constructed CSeq_loc with all changes
Definition: Seq_loc.cpp:2946
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
CConstRef< CSeq_loc > GetRangeAsSeq_loc(void) const
Get seq-loc for the current iterator position.
Definition: Seq_loc.cpp:2585
void SetSeq_id(const CSeq_id &id)
Set seq_id of the current location.
Definition: Seq_loc.hpp:713
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
TRange GetRange(void) const
Get the range.
Definition: Seq_loc.hpp:1042
ENa_strand GetStrand(void) const
Definition: Seq_loc.hpp:1056
const CSeq_id & GetSeq_id(void) const
Get seq_id of the current location.
Definition: Seq_loc.hpp:1028
void GetLabel(string *label) const
Appends a label suitable for display (e.g., error messages) label must point to an existing string ob...
Definition: Seq_loc.cpp:3467
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
int SeqLocPartialCheck(const CSeq_loc &loc, CScope *scope)
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
@ eSeqlocPartial_Nostart
@ eSeqlocPartial_Nostop
@ eSeqlocPartial_Stop
@ eSeqlocPartial_Start
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSame
CSeq_locs contain each other.
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary Checks to...
Definition: sequence.cpp:1428
virtual void WriteSequence(const CBioseq_Handle &handle, const CSeq_loc *location=0, CSeq_loc::EOpFlags merge_flags=CSeq_loc::fMerge_AbuttingOnly)
Definition: sequence.cpp:3322
void SetFlag(EFlags flag)
Definition: sequence.hpp:859
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
Definition: sequence.hpp:101
@ fInstantiateGaps
honor specified gap mode; on by default
Definition: sequence.hpp:774
@ fAssembleParts
assemble FAR delta sequences; on by dflt
Definition: sequence.hpp:773
TIds GetIds(const CSeq_id &id, TGetFlags flags=0)
Get "native" bioseq ids without filtering and matching.
Definition: scope.cpp:401
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id, const CTSE_Handle &tse)
Get bioseq handle for sequence within one TSE.
Definition: scope.cpp:253
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle GetSeq_entryHandle(CDataLoader *loader, const TBlobId &blob_id, EMissing action=eMissing_Default)
Get Seq-entry handle by its blob-id, with possible loading.
Definition: scope.cpp:113
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeq_annot_Handle GetSeq_annotHandle(const CSeq_annot &annot, EMissing action=eMissing_Default)
Definition: scope.cpp:192
TGi GetGi(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get GI of a sequence Returns ZERO_GI if the sequence is not found or if it doesn't have GI.
Definition: scope.cpp:419
vector< CSeq_id_Handle > TIds
Definition: scope.hpp:143
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
Definition: scope.hpp:128
bool IsSetComment(void) const
bool IsNucleotide(void) const
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
TClass GetClass(void) const
const CSeq_annot_Handle & GetAnnot(void) const
Get handle to seq-annot for this feature.
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
bool IsSetDbxref(void) const
virtual CConstRef< CSeq_feat > GetSeq_feat(void) const
const CSeqFeatData & GetData(void) const
TSet GetSet(void) const
bool IsAa(void) const
CConstRef< CBioseq_set > GetCompleteBioseq_set(void) const
Return the complete bioseq-set object.
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
virtual const CSeq_loc & GetLocation(void) const
TSeq GetSeq(void) const
EVectorCoding
CSeqVector constructor flags.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
TInst_Topology GetInst_Topology(void) const
const string & GetComment(void) const
const CSeq_annot::TDesc & Seq_annot_GetDesc(void) const
CSeq_entry_Handle GetParentEntry(void) const
Return a handle for the parent seq-entry of the bioseq.
TInst_Length GetInst_Length(void) const
bool IsSetInst_Repr(void) const
bool IsSetClass(void) const
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
CScope & GetScope(void) const
Get scope this handle belongs to.
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsSet(void) const
const CSeq_feat::TDbxref & GetDbxref(void) const
bool IsSetQual(void) const
bool IsSetInst_Topology(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const CSeq_feat::TQual & GetQual(void) const
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
const TInst & GetInst(void) const
bool IsSeq(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
unsigned char TResidue
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
static CRef< CSeqMap > CreateSeqMapForSeq_loc(const CSeq_loc &loc, CScope *scope)
Definition: seq_map.cpp:1134
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetCoding(TCoding coding)
bool IsNucleotide(void) const
Definition: seq_vector.hpp:357
CScope & GetScope(void) const
Definition: seq_vector.hpp:330
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5429
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5324
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
@ eCase
Case sensitive compare.
Definition: ncbistr.hpp:1205
static const char label[]
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool IsStd(void) const
Check if variant Std is selected.
Definition: Date_.hpp:320
const TStr & GetStr(void) const
Get the variant data.
Definition: Date_.hpp:306
const TStd & GetStd(void) const
Get the variant data.
Definition: Date_.cpp:109
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
bool IsStr(void) const
Check if variant Str is selected.
Definition: Date_.hpp:300
const TMod & GetMod(void) const
Get the Mod member data.
Definition: OrgName_.hpp:839
bool IsSetCommon(void) const
common name Check if a value has been assigned to Common data member.
Definition: Org_ref_.hpp:407
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
const TCommon & GetCommon(void) const
Get the Common member data.
Definition: Org_ref_.hpp:419
bool IsSetMod(void) const
Check if a value has been assigned to Mod data member.
Definition: OrgName_.hpp:827
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
@ eSubtype_gb_synonym
used by taxonomy database
Definition: OrgMod_.hpp:117
@ eSubtype_old_name
Definition: OrgMod_.hpp:124
TDim GetDim(void) const
Get the Dim member data.
Definition: Seq_align_.hpp:856
bool IsSetSegs(void) const
Check if a value has been assigned to Segs data member.
Definition: Seq_align_.hpp:909
bool IsDendiag(void) const
Check if variant Dendiag is selected.
Definition: Seq_align_.hpp:720
const TDendiag & GetDendiag(void) const
Get the variant data.
Definition: Seq_align_.hpp:726
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:746
bool IsSetDim(void) const
dimensionality Check if a value has been assigned to Dim data member.
Definition: Seq_align_.hpp:837
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetCode(void) const
genetic code used Check if a value has been assigned to Code data member.
Definition: Cdregion_.hpp:700
bool IsCdregion(void) const
Check if variant Cdregion is selected.
void ResetCode(void)
Reset Code data member.
Definition: Cdregion_.cpp:63
bool IsSetPartial(void) const
incomplete in some way? Check if a value has been assigned to Partial data member.
Definition: Seq_feat_.hpp:943
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Feat_id_.cpp:134
void SetCode(TCode &value)
Assign a value to Code data member.
Definition: Cdregion_.cpp:68
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Feat_id_.hpp:353
TFrame GetFrame(void) const
Get the Frame member data.
Definition: Cdregion_.hpp:534
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
Explain if except=TRUE. Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
const TCode & GetCode(void) const
Get the Code member data.
Definition: Cdregion_.hpp:712
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
bool CanGetLocation(void) const
Check if it is safe to call GetLocation method.
Definition: Seq_feat_.hpp:1111
const TCdregion & GetCdregion(void) const
Get the variant data.
bool CanGetExcept_text(void) const
Check if it is safe to call GetExcept_text method.
Definition: Seq_feat_.hpp:1399
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
bool CanGetExcept(void) const
Check if it is safe to call GetExcept method.
Definition: Seq_feat_.hpp:996
TPartial GetPartial(void) const
Get the Partial member data.
Definition: Seq_feat_.hpp:962
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
vector< CRef< CGb_qual > > TQual
Definition: Seq_feat_.hpp:117
bool CanGetProduct(void) const
Check if it is safe to call GetProduct method.
Definition: Seq_feat_.hpp:1090
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
Definition: Cdregion_.hpp:509
bool IsSetLocation(void) const
Location the feature is made from. Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
@ e_not_set
No variant selected.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
const TWhole & GetWhole(void) const
Get the variant data.
Definition: Seq_loc_.cpp:172
TFrom GetFrom(void) const
Get the From member data.
TGi & SetGi(void)
Select the variant.
Definition: Seq_id_.hpp:896
TTo GetTo(void) const
Get the To member data.
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
const TLoc & GetLoc(void) const
Get the Loc member data.
Definition: Seq_graph_.hpp:869
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
TClass GetClass(void) const
Get the Class member data.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
@ eClass_parts
parts for 2 or 3
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_segset
segmented sequence + parts
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
const Tdata & Get(void) const
Get the member data.
bool IsSetSeq_data(void) const
The sequence. Check if a value has been assigned to Seq_data data member.
Definition: Seq_inst_.hpp:805
ERepr
representation class
Definition: Seq_inst_.hpp:91
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
bool IsSetLength(void) const
Length of sequence in residues. Check if a value has been assigned to Length data member.
Definition: Seq_inst_.hpp:640
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ e_not_set
No variant selected.
Definition: Seq_data_.hpp:103
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
bool IsData(void) const
Check if variant Data is selected.
Definition: T3Reply_.hpp:263
const TData & GetData(void) const
Get the variant data.
Definition: T3Reply_.cpp:124
bool IsSetStatus(void) const
Check if a value has been assigned to Status data member.
Definition: T3Data_.hpp:328
const TStatus & GetStatus(void) const
Get the Status member data.
Definition: T3Data_.hpp:340
list< CRef< CT3StatusFlags > > TStatus
Definition: T3Data_.hpp:94
bool IsError(void) const
Check if variant Error is selected.
Definition: T3Reply_.hpp:257
const TError & GetError(void) const
Get the variant data.
Definition: T3Reply_.cpp:102
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: T3Data_.hpp:285
const TMessage & GetMessage(void) const
Get the Message member data.
Definition: T3Error_.hpp:394
bool IsSetMessage(void) const
Check if a value has been assigned to Message data member.
Definition: T3Error_.hpp:382
int i
int len
static int version
Definition: mdb_load.c:29
EIPRangeType t
Definition: ncbi_localip.c:101
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int isupper(Uchar c)
Definition: ncbictype.hpp:70
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
const CConstRef< CSeq_id > GetAccession(const CSeq_id_Handle &id_handle)
The Object manager core.
bool IsBlankStringList(const list< string > &str_list)
Definition: utilities.cpp:114
bool s_IsSameSeqAnnot(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool &diff_descriptions)
Definition: utilities.cpp:1503
bool DoesCodingRegionHaveUnnecessaryException(const CSeq_feat &feat, const CBioseq_Handle &loc_handle, CScope &scope)
Definition: utilities.cpp:2906
CBioseq_Handle BioseqHandleFromLocation(CScope *m_Scope, const CSeq_loc &loc)
Definition: utilities.cpp:1281
bool IsDateInPast(const CDate &date)
Definition: utilities.cpp:829
void CheckBioseqEndsForNAndGap(const CSeqVector &vec, EBioseqEndIsType &begin_n, EBioseqEndIsType &begin_gap, EBioseqEndIsType &end_n, EBioseqEndIsType &end_gap, bool &begin_ambig, bool &end_ambig)
Definition: utilities.cpp:1322
string s_ReplaceListFromQuals(const CSeq_feat::TQual &quals)
Definition: utilities.cpp:1690
bool IsCommon(const COrg_ref &org, const string &val)
Definition: utilities.cpp:2138
bool ConsistentWithT(Char ch)
Definition: utilities.cpp:2896
DEFINE_STATIC_ARRAY_MAP(TBypassCdsTransCheckSet, sc_BypassCdsTransCheck, sc_BypassCdsTransCheckText)
vector< CConstRef< CObject_id > > TFeatIdVec
Definition: utilities.cpp:1728
static const char *const sc_ForceCdsTransCheckText[]
Definition: utilities.cpp:2341
string GetDateErrorDescription(int flags)
Definition: utilities.cpp:858
void CalculateEffectiveTranslationLengths(const string &transl_prot, const CSeqVector &prot_vec, size_t &len, size_t &prot_len)
Definition: utilities.cpp:2604
CConstRef< CSeq_id > GetReportableSeqIdForAlignment(const CSeq_align &align, CScope &scope)
Definition: utilities.cpp:401
CSeqVector GetSequenceFromLoc(const CSeq_loc &loc, CScope &scope, CBioseq_Handle::EVectorCoding coding)
Definition: utilities.cpp:204
CBioseq_set_Handle GetNucProtSetParent(const CBioseq_Handle &bioseq)
Definition: utilities.cpp:584
bool HasNoStop(const CSeq_feat &feat, CScope *scope)
Definition: utilities.cpp:2680
bool IsCommonName(const CT3Data &data)
Definition: utilities.cpp:1950
string GetBioseqIdLabel(const CBioseq &sq, bool limited)
Definition: utilities.cpp:985
bool s_IsDifferentDbxrefs(const TDbtags &list1, const TDbtags &list2)
Definition: utilities.cpp:1612
CScope::TIds GetSeqIdsForGI(TGi gi)
Definition: utilities.cpp:142
static const int kNumIgnoreHostWordList
Definition: utilities.cpp:2039
static char s_ConvertChar(char ch)
Definition: utilities.cpp:2207
string FixSpecificHost(const string &val)
returns the corrected specific host, if the specific host is invalid and can be corrected returns an ...
Definition: utilities.cpp:2197
void FixGeneticCode(CCdregion &cdr)
Definition: utilities.cpp:2247
bool ShouldCheckForNsAndGap(const CBioseq_Handle &bsh)
Definition: utilities.cpp:1311
bool IsDicistronic(const CSeq_feat_Handle &f)
Definition: utilities.cpp:1831
bool HasECnumberPattern(const string &str)
Definition: utilities.cpp:1092
vector< TSeqPos > GetMismatches(const CSeq_feat &feat, const CSeqVector &prot_vec, const string &transl_prot)
Definition: utilities.cpp:2633
bool IsNTNCNWACAccession(const string &acc)
Definition: utilities.cpp:2762
static const CBioseq * s_GetSeqFromSet(const CBioseq_set &bsst)
Definition: utilities.cpp:295
CBioseq_Handle GetNucBioseq(const CBioseq_set_Handle &bioseq_set)
Definition: utilities.cpp:590
void AppendBioseqLabel(string &str, const CBioseq &sq, bool supress_context)
Definition: utilities.cpp:1068
CBioseq_Handle GetCDSProductSequence(const CSeq_feat &feat, CScope *scope, const CTSE_Handle &tse, bool far_fetch, bool &is_far)
Definition: utilities.cpp:2581
bool HasBadCharacter(const string &str)
Definition: utilities.cpp:758
bool IsSpecificHostValid(const string &val, string &error_msg)
returns true and error_msg will be empty, if specific host is valid returns true and error_msg will b...
Definition: utilities.cpp:2190
CBioseq_set_Handle GetGenProdSetParent(const CBioseq_set_Handle &set)
Definition: utilities.cpp:573
static string s_GetBioseqAcc(const CSeq_id &id, int *version)
Definition: utilities.cpp:234
static const string sIgnoreHostWordList[]
Definition: utilities.cpp:2028
bool IsInOrganelleSmallGenomeSet(const CSeq_id &id, CScope &scope)
Definition: utilities.cpp:3060
bool s_IsSameStrand(const CSeq_loc &l1, const CSeq_loc &l2, CScope &scope)
Definition: utilities.cpp:1489
bool HasMisSpellFlag(const CT3Data &data)
Definition: utilities.cpp:1972
int CheckDate(const CDate &date, bool require_full_date)
Definition: utilities.cpp:783
bool s_PosIsNNotGap(const CSeqVector &vec, unsigned int pos)
Definition: utilities.cpp:1297
string SpecificHostValueToCheck(const string &val)
Definition: utilities.cpp:2051
static bool s_IsGenbankMasterAccession(const string &acc)
Definition: utilities.cpp:2983
string GetAccessionFromBioseqSet(const CBioseq_set &bsst, int *version)
Definition: utilities.cpp:436
bool BadMultipleSequenceLocation(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:3090
bool IsClassInEntry(const CSeq_entry &se, CBioseq_set::EClass clss)
Definition: utilities.cpp:79
static bool s_AreLinkedToDifferentFeats(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, CSeqFeatData::ESubtype s1, CSeqFeatData::ESubtype s2)
Definition: utilities.cpp:1729
string InterpretSpecificHostResult(const string &host, const CT3Reply &reply, const string &orig_host)
Definition: utilities.cpp:2094
bool EndsWithBadCharacter(const string &str)
Definition: utilities.cpp:771
bool s_FeatureIdsMatch(const CFeat_id &f1, const CFeat_id &f2)
Definition: utilities.cpp:720
bool PartialsSame(const CSeq_loc &loc1, const CSeq_loc &loc2)
Definition: utilities.cpp:1467
bool IsLocFullLength(const CSeq_loc &loc, const CBioseq_Handle &bsh)
Definition: utilities.cpp:1455
bool s_AreDifferentVariations(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1707
string GetSequenceStringFromLoc(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:176
string GetValidatorLocationLabel(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:961
bool IsOrganelle(int genome)
Definition: utilities.cpp:2838
static string s_GetAccessionForSeqdesc(const CSeq_entry_Handle &seh, const CSeqdesc &desc, int *version)
Definition: utilities.cpp:367
static const char * kUnclassifiedTranslationDiscrepancy
Definition: utilities.cpp:2327
bool IsDicistronicGene(const CSeq_feat_Handle &f)
Indicates whether feature is a dicistronic gene.
Definition: utilities.cpp:1824
bool HasBadProteinStart(const CSeqVector &sv)
Definition: utilities.cpp:2474
bool ConsistentWithA(Char ch)
Definition: utilities.cpp:2878
CStaticArraySet< const char *, PCase_CStr > TBypassCdsTransCheckSet
Definition: utilities.cpp:2338
bool s_StringHasPMID(const string &str)
Definition: utilities.cpp:730
bool DoesmRNAHaveUnnecessaryException(const CSeq_feat &feat, const CBioseq_Handle &nuc, CScope &scope)
Definition: utilities.cpp:2933
bool HasBadStartCodon(const CSeq_loc &loc, const string &transl_prot)
Definition: utilities.cpp:2314
bool s_AreGBQualsIdentical(const CSeq_feat_Handle &feat1, const CSeq_feat_Handle &feat2, bool case_sensitive)
Definition: utilities.cpp:1539
bool DoesFeatureHaveUnnecessaryException(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2949
bool IsDeltaOrFarSeg(const CSeq_loc &loc, CScope *scope)
Definition: utilities.cpp:90
bool IsNG(const CSeq_id &id)
Definition: utilities.cpp:2797
bool ReportTranslationErrors(const string &except_text)
Definition: utilities.cpp:2348
bool HasInternalStop(const CSeq_feat &feat, CScope &scope, bool ignore_exceptions)
Definition: utilities.cpp:2423
bool s_AreFullLengthCodingRegionsWithDifferentFrames(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1648
bool ConsistentWithC(Char ch)
Definition: utilities.cpp:2884
static const char *const sc_BypassCdsTransCheckText[]
Definition: utilities.cpp:2329
bool IsLikelyTaxname(const string &val)
Definition: utilities.cpp:2159
string GetAccessionFromBioseq(const CBioseq &bioseq, int *version)
Definition: utilities.cpp:430
EDuplicateFeatureType IsDuplicate(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2, bool check_partials, bool case_sensitive)
Reports how two features duplicate each other.
Definition: utilities.cpp:1845
bool SeqIsPatent(const CBioseq &seq)
Definition: utilities.cpp:1159
bool s_AreFeatureLabelsSame(const CSeq_feat_Handle &feat, const CSeq_feat_Handle &prev, bool case_sensitive)
Definition: utilities.cpp:1580
bool IsFarLocation(const CSeq_loc &loc, const CSeq_entry_Handle &seh)
Definition: utilities.cpp:159
static bool s_IsDescOnSeqEntry(const CSeq_entry &entry, const CSeqdesc &desc)
Definition: utilities.cpp:352
TGi GetGIForSeqId(const CSeq_id &id)
Definition: utilities.cpp:125
CStaticArraySet< const char *, PCase_CStr > TForceCdsTransCheckSet
Definition: utilities.cpp:2345
static string s_GetSeq_featAcc(const CSeq_feat &feat, CScope &scope, int *version)
Definition: utilities.cpp:258
size_t CountInternalStopCodons(const string &transl_prot)
Definition: utilities.cpp:2400
CRef< CSeqVector > MakeSeqVectorForResidueCounting(const CBioseq_Handle &bsh)
Definition: utilities.cpp:2462
EAccessionFormatError ValidateAccessionString(const string &accession, bool require_version)
Definition: utilities.cpp:627
CBioseq_set_Handle GetSetParent(const CBioseq_set_Handle &set, CBioseq_set::TClass set_class)
Definition: utilities.cpp:535
void ConvertToEntrezTerm(string &title)
Definition: utilities.cpp:2228
bool IsBioseqInSameSeqEntryAsAlign(const CBioseq_Handle &bsh, const CSeq_align &align, CScope &scope)
Definition: utilities.cpp:389
static void UpdateToBestId(CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:931
bool IsTemporary(const CSeq_id &id)
Definition: utilities.cpp:2824
bool ConsistentWithG(Char ch)
Definition: utilities.cpp:2890
bool IsBioseqTSA(const CBioseq &seq, CScope *scope)
Definition: utilities.cpp:887
bool FindMatchInOrgRef(const string &str, const COrg_ref &org)
Definition: utilities.cpp:1991
bool g_IsMasterAccession(const CSeq_id &id)
Definition: utilities.cpp:3009
void FeatureHasEnds(const CSeq_feat &feat, CScope *scope, bool &no_beg, bool &no_end)
Definition: utilities.cpp:2553
bool s_PartialAtGapOrNs(CScope *scope, const CSeq_loc &loc, unsigned int tag, bool only_gap)
Definition: utilities.cpp:1180
bool s_AremRNAsLinkedToDifferentCodingRegions(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1818
string GetAccessionFromObjects(const CSerialObject *obj, const CSeq_entry *ctx, CScope &scope, int *version)
Definition: utilities.cpp:446
bool HasStopInProtein(const CSeq_feat &feat, CScope &scope)
Definition: utilities.cpp:2526
static bool s_AreCodingRegionsLinkedToDifferentmRNAs(const CSeq_feat_Handle &f1, const CSeq_feat_Handle &f2)
Definition: utilities.cpp:1812
size_t CountProteinStops(const CSeqVector &sv)
Definition: utilities.cpp:2511
bool IsSequenceFetchable(const CSeq_id &id, CScope *scope)
Definition: utilities.cpp:2721
void AdjustSpecificHostForTaxServer(string &spec_host)
Definition: utilities.cpp:2041
bool IsAccession(const CSeq_id &id)
Definition: utilities.cpp:921
string TranslateCodingRegionForValidation(const CSeq_feat &feat, CScope &scope, bool &alt_start)
Definition: utilities.cpp:2274
CSeqVector GetSequenceFromFeature(const CSeq_feat &feat, CScope &scope, CBioseq_Handle::EVectorCoding coding, bool product)
Definition: utilities.cpp:215
bool IsResidue(unsigned char residue)
Definition: utilities.hpp:88
EBioseqEndIsType
Definition: utilities.hpp:156
@ eBioseqEndIsType_Last
Definition: utilities.hpp:158
@ eBioseqEndIsType_None
Definition: utilities.hpp:157
@ eBioseqEndIsType_All
Definition: utilities.hpp:159
@ eDateValid_bad_str
Definition: utilities.hpp:124
@ eDateValid_empty_date
Definition: utilities.hpp:130
@ eDateValid_valid
Definition: utilities.hpp:123
@ eDateValid_bad_year
Definition: utilities.hpp:125
@ eDateValid_bad_day
Definition: utilities.hpp:127
@ eDateValid_bad_other
Definition: utilities.hpp:129
@ eDateValid_bad_month
Definition: utilities.hpp:126
@ eDateValid_bad_season
Definition: utilities.hpp:128
EDuplicateFeatureType
Definition: utilities.hpp:191
@ eDuplicate_Duplicate
Definition: utilities.hpp:193
@ eDuplicate_DuplicateDifferentTable
Definition: utilities.hpp:195
@ eDuplicate_SameIntervalDifferentLabel
Definition: utilities.hpp:194
@ eDuplicate_Not
Definition: utilities.hpp:192
@ eDuplicate_SameIntervalDifferentLabelDifferentTable
Definition: utilities.hpp:196
const CSeq_feat::TDbxref TDbtags
Definition: utilities.hpp:199
EAccessionFormatError
Definition: utilities.hpp:105
@ eAccessionFormat_too_long
Definition: utilities.hpp:110
@ eAccessionFormat_missing_version
Definition: utilities.hpp:111
@ eAccessionFormat_valid
Definition: utilities.hpp:106
@ eAccessionFormat_bad_version
Definition: utilities.hpp:112
@ eAccessionFormat_no_start_letters
Definition: utilities.hpp:107
@ eAccessionFormat_wrong_number_of_digits
Definition: utilities.hpp:108
@ eAccessionFormat_null
Definition: utilities.hpp:109
static char tmp[2048]
Definition: utf8.c:42
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
#define FOR_EACH_SEQID_ON_BIOSEQ(Itr, Var)
FOR_EACH_SEQID_ON_BIOSEQ EDIT_EACH_SEQID_ON_BIOSEQ.
Definition: seq_macros.hpp:308
#define FOR_EACH_GBQUAL_ON_FEATURE
#define FOR_EACH_SYN_ON_ORGREF(Itr, Var)
FOR_EACH_SYN_ON_ORGREF EDIT_EACH_SYN_ON_ORGREF.
#define FOR_EACH_SEQFEATXREF_ON_SEQFEAT(Itr, Var)
FOR_EACH_SEQFEATXREF_ON_SEQFEAT EDIT_EACH_SEQFEATXREF_ON_SEQFEAT.
static const char * str(char *buf, int n)
Definition: stats.c:84
else result
Definition: token2.c:20
size_t GetMRNATranslationProblems(const CSeq_feat &feat, size_t &mismatches, bool ignore_exceptions, CBioseq_Handle nuc, CBioseq_Handle rna, bool far_fetch, bool is_gpipe, bool is_genomic, CScope *scope)
@ eMRNAProblem_UnnecessaryException
#define const
Definition: zconf.h:230
Modified on Thu Sep 21 03:45:22 2023 by modify_doxy.py rev. 669887