NCBI C++ ToolKit
validatorp.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validatorp.cpp 101604 2024-01-10 17:07:04Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27  *
28  * File Description:
29  * Implementation of private parts of the validator
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
38 
48 
49 #include <serial/iterator.hpp>
50 #include <serial/enumvalues.hpp>
51 
55 
57 
60 
61 #include <objects/seq/Bioseq.hpp>
63 #include <objects/seq/Seqdesc.hpp>
65 #include <objects/seq/Pubdesc.hpp>
66 #include <objects/seq/MolInfo.hpp>
73 
78 
80 
83 
84 #include <objmgr/bioseq_ci.hpp>
85 #include <objmgr/seqdesc_ci.hpp>
86 #include <objmgr/graph_ci.hpp>
87 #include <objmgr/seq_annot_ci.hpp>
88 #include <objmgr/util/feature.hpp>
89 #include <objmgr/util/sequence.hpp>
90 
91 #include <objmgr/feat_ci.hpp>
92 #include <objmgr/align_ci.hpp>
93 #include <objmgr/seq_vector.hpp>
94 #include <objmgr/scope.hpp>
95 
96 #include <objects/pub/Pub.hpp>
98 
110 #include <objects/biblio/Title.hpp>
112 #include <objects/biblio/Affil.hpp>
115 #include <objects/taxon3/taxon3.hpp>
117 
124 
125 #include <objtools/error_codes.hpp>
131 #include <util/sgml_entity.hpp>
132 #include <util/line_reader.hpp>
133 #include <util/util_misc.hpp>
134 #include <util/static_set.hpp>
135 
136 #include <algorithm>
137 
138 
139 #include <serial/iterator.hpp>
140 
141 #define NCBI_USE_ERRCODE_X Objtools_Validator
142 
145 BEGIN_SCOPE(validator)
146 using namespace sequence;
147 
148 namespace {
149  // avoid creating a PQuickStringLess for every comparison
150  PQuickStringLess s_QuickStringLess;
151 };
152 
153 
154 // =============================================================================
155 // CValidError_imp Public
156 // =============================================================================
157 
163 
167 
169 (CObjectManager& objmgr,
170  shared_ptr<SValidatorContext> pContext,
171  CValidError* errs,
172  Uint4 options) :
173  m_ObjMgr{&objmgr},
174  m_ErrRepository{errs},
175  m_pContext{pContext}
176 {
177  x_Init(options);
178 }
179 
181 {
182  SetOptions(options);
183  Reset();
184 
186 }
187 
188 // Destructor
190 {
191 }
192 
193 
195 {
196  // if (!m_pContext) {
197  // m_pContext = make_shared<SValidatorContext>();
198  // }
200  return *m_pContext;
201 }
202 
203 
205 {
207  return *m_pContext;
208 }
209 
210 
212 {
213  const auto& context = GetContext();
214  return context.PreprocessHugeFile ||
215  context.PostprocessHugeFile;
216 }
217 
218 
219 bool CValidError_imp::IsHugeSet(const CBioseq_set& bioseqSet) const
220 {
221  if (bioseqSet.IsSetClass()) {
222  return IsHugeSet(bioseqSet.GetClass());
223  }
224  return false;
225 }
226 
227 
229 {
230  return edit::CHugeAsnReader::IsHugeSet(setClass);
231 }
232 
233 
234 bool CValidError_imp::IsFarSequence(const CSeq_id& id) // const
235 {
236  if (IsHugeFileMode() && GetContext().IsIdInBlob) {
237  return !GetContext().IsIdInBlob(id);
238  }
239 
240  _ASSERT(m_Scope);
241  if (GetBioseqHandleFromTSE(id)) {
242  return false;
243  }
244  return true;
245 }
246 
247 
249 {
250  if (m_Scope) {
252  }
253  return CBioseq_Handle();
254 }
255 
256 
258 {
259  if (!IsHugeFileMode()) {
260  return GetBioseqHandleFromTSE(id);
261  }
262  // Huge-file mode
263  if (!IsFarSequence(id)) {
264  return m_Scope->GetBioseqHandle(id);
265  }
266  return CBioseq_Handle();
267 }
268 
269 
271 {
272  m_NonASCII = (options & CValidator::eVal_non_ascii) != 0;
275  m_ValidateExons = (options & CValidator::eVal_val_exons) != 0;
276  m_OvlPepErr = (options & CValidator::eVal_ovl_pep_err) != 0;
279  m_RemoteFetch = (options & CValidator::eVal_remote_fetch) != 0;
285  m_UseEntrez = (options & CValidator::eVal_use_entrez) != 0;
299 }
300 
301 
302 //LCOV_EXCL_START
303 //not used by asnvalidate
305 {
306  m_ErrRepository = errors;
307 }
308 //LCOV_EXCL_STOP
309 
310 
312 {
313  m_Scope = nullptr;
314  m_TSE = nullptr;
315  m_IsStandaloneAnnot = false;
316  m_SeqAnnot.Reset();
317 
318  m_pEntryInfo.reset(new CValidatorEntryInfo());
319 
320  m_IsNC = false;
321  m_IsNG = false;
322  m_IsNM = false;
323  m_IsNP = false;
324  m_IsNR = false;
325  m_IsNZ = false;
326  m_IsNS = false;
327  m_IsNT = false;
328  m_IsNW = false;
329  m_IsWP = false;
330  m_IsXR = false;
331 
332  m_PrgCallback = nullptr;
333  m_NumAlign = 0;
334  m_NumAnnot = 0;
335  m_NumBioseq = 0;
336  m_NumBioseq_set = 0;
338  m_NumDesc = 0;
339  m_NumDescr = 0;
340  m_NumFeat = 0;
341  m_NumGraph = 0;
345  m_NumGenes = 0;
346  m_NumGeneXrefs = 0;
349  m_NumPseudo = 0;
350  m_NumPseudogene = 0;
351  m_FarFetchFailure = false;
352  m_IsTbl2Asn = false;
353 }
354 
355 
356 // Error post methods
358 (EDiagSev sv,
359  EErrType et,
360  const string& msg,
361  const CSerialObject& obj)
362 {
363  const CTypeInfo* type_info = obj.GetThisTypeInfo();
364  if (type_info == CSeqdesc::GetTypeInfo()) {
365  const CSeqdesc* desc = dynamic_cast < const CSeqdesc* > (&obj);
366  ERR_POST_X(1, Warning << "Seqdesc validation error using default context.");
367  PostErr (sv, et, msg, GetTSE(), *desc);
368  } else if (type_info == CSeq_feat::GetTypeInfo()) {
369  const CSeq_feat* feat = dynamic_cast < const CSeq_feat* > (&obj);
370  PostErr (sv, et, msg, *feat);
371  } else if (type_info == CBioseq::GetTypeInfo()) {
372  const CBioseq* seq = dynamic_cast < const CBioseq* > (&obj);
373  PostErr (sv, et, msg, *seq);
374  } else if (type_info == CBioseq_set::GetTypeInfo()) {
375  const CBioseq_set* set = dynamic_cast < const CBioseq_set* > (&obj);
376  PostErr (sv, et, msg, *set);
377  } else if (type_info == CSeq_annot::GetTypeInfo()) {
378  const CSeq_annot* annot = dynamic_cast < const CSeq_annot* > (&obj);
379  PostErr (sv, et, msg, *annot);
380  } else if (type_info == CSeq_graph::GetTypeInfo()) {
381  const CSeq_graph* graph = dynamic_cast < const CSeq_graph* > (&obj);
382  PostErr (sv, et, msg, *graph);
383  } else if (type_info == CSeq_align::GetTypeInfo()) {
384  const CSeq_align* align = dynamic_cast < const CSeq_align* > (&obj);
385  PostErr (sv, et, msg, *align);
386  } else if (type_info == CSeq_entry::GetTypeInfo()) {
387  const CSeq_entry* entry = dynamic_cast < const CSeq_entry* > (&obj);
388  PostErr (sv, et, msg, *entry);
389  } else if (type_info == CBioSource::GetTypeInfo()) {
390  const CBioSource* src = dynamic_cast < const CBioSource* > (&obj);
391  PostErr (sv, et, msg, *src);
392  } else if (type_info == COrg_ref::GetTypeInfo()) {
393  const COrg_ref* org = dynamic_cast < const COrg_ref* > (&obj);
394  PostErr (sv, et, msg, *org);
395  } else if (type_info == CPubdesc::GetTypeInfo()) {
396  const CPubdesc* pd = dynamic_cast < const CPubdesc* > (&obj);
397  PostErr (sv, et, msg, *pd);
398  } else if (type_info == CSeq_submit::GetTypeInfo()) {
399  const CSeq_submit* ss = dynamic_cast < const CSeq_submit* > (&obj);
400  PostErr (sv, et, msg, *ss);
401  } else {
402  ERR_POST_X(1, Warning << "Unknown data type in PostErr.");
403  }
404 }
405 
406 
407 /*
408 void CValidError_imp::PostErr
409 (EDiagSev sv,
410  EErrType et,
411  const string& msg,
412  TDesc ds)
413 {
414  // Append Descriptor label
415  string desc = "DESCRIPTOR: ";
416  ds.GetLabel (&desc, CSeqdesc::eBoth);
417  desc += ", NO Descriptor Context";
418  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
419 }
420 */
421 
422 static const EErrType sc_ValidGenomeRaise[] = {
580 };
581 
583 
589 };
590 
592 
593 
596 };
597 
599 
600 
602  EErrType et
603 )
604 
605 {
606  if (sc_GenomeRaiseExceptEmblDdbjRefSeqArray.find(et) != sc_GenomeRaiseExceptEmblDdbjRefSeqArray.end()) {
607  if (IsEmbl() || IsDdbj() || IsRefSeq()) {
608  return false;
609  } else {
610  return true;
611  }
612  }
613  if (sc_GenomeRaiseExceptEmblDdbjArray.find(et) != sc_GenomeRaiseExceptEmblDdbjArray.end()) {
614  if (IsEmbl() || IsDdbj()) {
615  return false;
616  } else {
617  return true;
618  }
619  }
620  if (sc_GenomeRaiseArray.find (et) != sc_GenomeRaiseArray.end()) {
621  return true;
622  }
623  return false;
624 }
625 
627 (EDiagSev sv,
628  EErrType et,
629  const string& msg,
630  TFeat ft)
631 {
633 
634  // Adjust severity
636  sv = eDiag_Error;
637  }
638 
639  item->SetSev(sv);
640  item->SetErrIndex(et);
641  item->SetMsg(msg);
642  item->SetObject(ft);
643 
644  if (GenerateGoldenFile()) {
646  return;
647  }
648 
649  string content_label = CValidErrorFormat::GetFeatureContentLabel(ft, m_Scope);
650  item->SetObj_content(content_label);
651 
652  string feature_id = CValidErrorFormat::GetFeatureIdLabel(ft);
653  if (!NStr::IsBlank(feature_id)) {
654  item->SetFeatureId(feature_id);
655  }
656 
658  if (!NStr::IsBlank(bioseq_label)) {
659  item->SetBioseq(bioseq_label);
660  }
661 
662  // Calculate sequence offset
663  TSeqPos offset = 0;
664  string location;
665  if (ft.IsSetLocation()) {
668  if (!NStr::IsBlank(loc_label)) {
669  item->SetLocation(loc_label);
670  }
671  item->SetSeqOffset(offset);
672  }
673 
674 
676  if (!NStr::IsBlank(product_label)) {
677  item->SetProduct_loc(product_label);
678  }
679 
680  int version = 0;
681  string accession;
682  if (m_Scope) {
683  accession = GetAccessionFromObjects(&ft, nullptr, *m_Scope, &version);
684  }
685  item->SetAccession(accession);
686  if (version > 0) {
687  item->SetAccnver(accession + "." + NStr::IntToString(version));
688  item->SetVersion(version);
689  } else {
690  item->SetAccnver(accession);
691  }
692 
693  if (ft.IsSetData()) {
694  if (ft.GetData().IsGene()) {
695  if (ft.GetData().GetGene().IsSetLocus_tag() &&
697  item->SetLocus_tag(ft.GetData().GetGene().GetLocus_tag());
698  }
699  } else {
700  if (m_CollectLocusTags) {
701  // TODO: this should be part of post-processing
703  if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
704  !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
705  item->SetLocus_tag(gene->GetData().GetGene().GetLocus_tag());
706  }
707  }
708  }
709  }
710 
711  item->SetFeatureObjDescFromFields();
713 }
714 
715 
717 (EDiagSev sv,
718  EErrType et,
719  const string& msg,
720  TBioseq sq)
721 {
722  // Adjust severity
724  sv = eDiag_Error;
725  }
726 
727  if (GenerateGoldenFile()) {
728  m_ErrRepository->AddValidErrItem(sv, et, msg);
729  return;
730  }
731 
732  // Append bioseq label
733  string desc;
735  int version = 0;
736  const string& accession = GetAccessionFromBioseq(sq, &version);
737  // GetAccessionFromObjects(&sq, nullptr, *m_Scope, &version);
738  x_AddValidErrItem(sv, et, msg, desc, sq, accession, version);
739 }
740 
741 
743 (EDiagSev sv,
744  EErrType et,
745  const string& msg,
746  TSet st)
747 {
748  // Adjust severity
750  sv = eDiag_Error;
751  }
752 
753  if (GenerateGoldenFile()) {
754  m_ErrRepository->AddValidErrItem(sv, et, msg);
755  return;
756  }
757 
758  // Append Bioseq_set label
759 
760  const auto isSetClass = st.IsSetClass();
761 
762  if (isSetClass && GetContext().PreprocessHugeFile) {
763  if (auto setClass = st.GetClass(); IsHugeSet(setClass)) {
764  string desc =
766  x_AddValidErrItem(sv, et, msg, desc, st, GetContext().HugeSetId, 0);
767  return;
768  }
769  }
770 
771  int version = 0;
772  const string& accession = GetAccessionFromBioseqSet(st, &version);
773  //string desc = CValidErrorFormat::GetBioseqSetLabel(st, m_SuppressContext);
774  string desc = CValidErrorFormat::GetBioseqSetLabel(accession,
775  isSetClass ? st.GetClass() : CBioseq_set::eClass_not_set,
776  isSetClass ? m_SuppressContext : true);
777  x_AddValidErrItem(sv, et, msg, desc, st, accession, version);
778 }
779 
780 
782 (EDiagSev sv,
783  EErrType et,
784  const string& msg,
785  TEntry ctx,
786  TDesc ds)
787 {
788  // Adjust severity
790  sv = eDiag_Error;
791  }
792 
793  if (GenerateGoldenFile()) {
794  m_ErrRepository->AddValidErrItem(sv, et, msg);
795  return;
796  }
797 
798 
799  if (GetContext().PreprocessHugeFile &&
800  ctx.IsSet() && ctx.GetSet().IsSetClass()) {
801  if (auto setClass = ctx.GetSet().GetClass(); IsHugeSet(setClass)) {
802  string desc{"DESCRIPTOR: "};
803  desc += CValidErrorFormat::GetDescriptorContent(ds) + " ";
804  desc += "BIOSEQ-SET: ";
805  if (!m_SuppressContext) {
806  if (setClass == CBioseq_set::eClass_genbank) {
807  desc += "genbank: ";
808  }
809  else {
810  desc += "wgs-set: ";
811  }
812  }
813  desc += GetContext().HugeSetId;
814  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, GetContext().HugeSetId, 0);
815  return;
816  }
817  }
818 
819  // Append Descriptor label
821  int version = 0;
822  const string& accession = GetAccessionFromObjects(&ds, &ctx, *m_Scope, &version);
823  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, ctx, accession, version);
824 }
825 
826 
827 //void CValidError_imp::PostErr
828 //(EDiagSev sv,
829 // EErrType et,
830 // const string& msg,
831 // TBioseq sq,
832 // TDesc ds)
833 //{
834 // // Append Descriptor label
835 // string desc("DESCRIPTOR: ");
836 // ds.GetLabel(&desc, CSeqdesc::eBoth);
837 //
838 // s_AppendBioseqLabel(desc, sq, m_SuppressContext);
839 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
840 // //PostErr(sv, et, msg, sq);
841 //}
842 
843 
844 //void CValidError_imp::PostErr
845 //(EDiagSev sv,
846 // EErrType et,
847 // const string& msg,
848 // TSet st,
849 // TDesc ds)
850 //{
851 // // Append Descriptor label
852 // string desc = " DESCRIPTOR: ";
853 // ds.GetLabel(&desc, CSeqdesc::eBoth);
854 // s_AppendSetLabel(desc, st, m_SuppressContext);
855 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, *m_Scope);
856 //
857 //}
858 
859 
861 (EDiagSev sv,
862  EErrType et,
863  const string& msg,
864  TAnnot an)
865 {
866  // Adjust severity
868  sv = eDiag_Error;
869  }
870 
871  if (GenerateGoldenFile()) {
872  m_ErrRepository->AddValidErrItem(sv, et, msg);
873  return;
874  }
875 
876  // Append Annotation label
877  string desc = "ANNOTATION: ";
878 
879  // !!! need to decide on the message
880 
881  int version = 0;
882  const string& accession = GetAccessionFromObjects(&an, nullptr, *m_Scope, &version);
883  x_AddValidErrItem(sv, et, msg, desc, an, accession, version);
884 }
885 
886 
888 (EDiagSev sv,
889  EErrType et,
890  const string& msg,
891  TGraph graph)
892 {
893  // Adjust severity
895  sv = eDiag_Error;
896  }
897 
898  if (GenerateGoldenFile()) {
899  m_ErrRepository->AddValidErrItem(sv, et, msg);
900  return;
901  }
902 
903  // Append Graph label
904  string desc = "GRAPH: ";
905  if (graph.IsSetTitle()) {
906  desc += graph.GetTitle();
907  } else {
908  desc += "<Unnamed>";
909  }
910  desc += " ";
911  graph.GetLoc().GetLabel(&desc);
912 
913  int version = 0;
914  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
915  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
916 }
917 
918 
920 (EDiagSev sv,
921  EErrType et,
922  const string& msg,
923  TBioseq sq,
924  TGraph graph)
925 {
926  // Adjust severity
928  sv = eDiag_Error;
929  }
930 
931  if (GenerateGoldenFile()) {
932  m_ErrRepository->AddValidErrItem(sv, et, msg);
933  return;
934  }
935 
936  // Append Graph label
937  string desc("GRAPH: ");
938  if ( graph.IsSetTitle() ) {
939  desc += graph.GetTitle();
940  } else {
941  desc += "<Unnamed>";
942  }
943  desc += " ";
944  graph.GetLoc().GetLabel(&desc);
946  int version = 0;
947  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
948  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
949 }
950 
951 
953 (EDiagSev sv,
954  EErrType et,
955  const string& msg,
956  TAlign align)
957 {
958  // Adjust severity
960  sv = eDiag_Error;
961  }
962 
963  if (GenerateGoldenFile()) {
964  m_ErrRepository->AddValidErrItem(sv, et, msg);
965  return;
966  }
967 
969  if (id) {
971  if (bsh) {
972  PostErr(sv, et, msg, *(bsh.GetCompleteBioseq()));
973  return;
974  }
975  }
976 
977  // Can't get bioseq for reporting, use other Alignment label
978  string desc = "ALIGNMENT: ";
979  if (align.IsSetType()) {
980  desc += align.ENUM_METHOD_NAME(EType)()->FindName(align.GetType(), true);
981  }
982  try {
983  CSeq_align::TDim dim = align.GetDim();
984  desc += ", dim=" + NStr::NumericToString(dim);
985  } catch ( const CUnassignedMember& ) {
986  desc += ", dim=UNASSIGNED";
987  }
988 
989  if (align.IsSetSegs()) {
990  desc += " SEGS: ";
991  desc += align.GetSegs().SelectionName(align.GetSegs().Which());
992  }
993 
994  int version = 0;
995  const string& accession = GetAccessionFromObjects(&align, nullptr, *m_Scope, &version);
996  x_AddValidErrItem(sv, et, msg, desc, align, accession, version);
997 }
998 
999 
1001 (EDiagSev sv,
1002  EErrType et,
1003  const string& msg,
1004  TEntry entry)
1005 {
1006  // Adjust severity
1007  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1008  sv = eDiag_Error;
1009  }
1010 
1011  if (GenerateGoldenFile()) {
1012  m_ErrRepository->AddValidErrItem(sv, et, msg);
1013  return;
1014  }
1015 
1016  if (entry.IsSeq()) {
1017  PostErr(sv, et, msg, entry.GetSeq());
1018  } else if (entry.IsSet()) {
1019  PostErr(sv, et, msg, entry.GetSet());
1020  } else {
1021  string desc = "SEQ-ENTRY: ";
1022  entry.GetLabel(&desc, CSeq_entry::eContent);
1023 
1024  int version = 0;
1025  const string& accession = GetAccessionFromObjects(&entry, nullptr, *m_Scope, &version);
1026  x_AddValidErrItem(sv, et, msg, desc, entry, accession, version);
1027  }
1028 }
1029 
1030 
1032 (EDiagSev sv,
1033  EErrType et,
1034  const string& msg,
1035  const CBioSource& src)
1036 {
1037  // Adjust severity
1038  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1039  sv = eDiag_Error;
1040  }
1041 
1042  if (GenerateGoldenFile()) {
1043  m_ErrRepository->AddValidErrItem(sv, et, msg);
1044  return;
1045  }
1046 
1047  string desc = "BioSource: ";
1048  x_AddValidErrItem(sv, et, msg, desc, src, "", 0);
1049 }
1050 
1051 
1053 (EDiagSev sv,
1054  EErrType et,
1055  const string& msg,
1056  const COrg_ref& org)
1057 {
1058  // Adjust severity
1059  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1060  sv = eDiag_Error;
1061  }
1062 
1063  if (GenerateGoldenFile()) {
1064  m_ErrRepository->AddValidErrItem(sv, et, msg);
1065  return;
1066  }
1067 
1068  string desc = "Org-ref: ";
1069  x_AddValidErrItem(sv, et, msg, desc, org, "", 0);
1070 }
1071 
1072 
1074 (EDiagSev sv,
1075  EErrType et,
1076  const string& msg,
1077  const CPubdesc& pd)
1078 {
1079  // Adjust severity
1080  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1081  sv = eDiag_Error;
1082  }
1083 
1084  if (GenerateGoldenFile()) {
1085  m_ErrRepository->AddValidErrItem(sv, et, msg);
1086  return;
1087  }
1088 
1089  string desc = "Pubdesc: ";
1090  x_AddValidErrItem(sv, et, msg, desc, pd, "", 0);
1091 }
1092 
1093 
1095 (EDiagSev sv,
1096  EErrType et,
1097  const string& msg,
1098  const CSeq_submit& ss)
1099 {
1100  // Adjust severity
1101  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1102  sv = eDiag_Error;
1103  }
1104 
1105  if (GenerateGoldenFile()) {
1106  m_ErrRepository->AddValidErrItem(sv, et, msg);
1107  return;
1108  }
1109 
1110  string desc = "Seq-submit: ";
1111  x_AddValidErrItem(sv, et, msg, desc, ss, "", 0);
1112 }
1113 
1114 
1116  EDiagSev sev,
1117  EErrType type,
1118  const string& msg,
1119  const string& desc,
1120  const CSerialObject& obj,
1121  const string& accession,
1122  const int version)
1123 {
1124  if (IsHugeFileMode()) {
1125  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, accession, version);
1126  return;
1127  }
1128  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, obj, accession, version);
1129 }
1130 
1131 
1133 (EDiagSev sv,
1134  EErrType et,
1135  const string& msg,
1136  const CSerialObject& obj,
1137  const CSeq_entry *ctx)
1138 {
1139  if (!ctx) {
1140  PostErr (sv, et, msg, obj);
1141  } else if (obj.GetThisTypeInfo() == CSeqdesc::GetTypeInfo()) {
1142  PostErr(sv, et, msg, *ctx, *(dynamic_cast <const CSeqdesc*> (&obj)));
1143  } else {
1144  PostErr(sv, et, msg, obj);
1145  }
1146 
1147 }
1148 
1149 
1151 (EDiagSev sv,
1152  const string& msg,
1153  int flags,
1154  const CSerialObject& obj,
1155  const CSeq_entry *ctx)
1156 {
1157  string reasons = GetDateErrorDescription(flags);
1158 
1159  NStr::TruncateSpacesInPlace (reasons);
1160  reasons = msg + " - " + reasons;
1161 
1162  PostObjErr (sv, eErr_GENERIC_BadDate, reasons, obj, ctx);
1163 }
1164 
1165 
1167 (const CSeq_entry& se,
1168  const CCit_sub* cs,
1169  CScope* scope)
1170 {
1171  CSeq_entry_Handle seh;
1172  try {
1173  seh = scope->GetSeq_entryHandle(se);
1174  } catch (const CException& ) { ; }
1175  if (! seh) {
1176  seh = scope->AddTopLevelSeqEntry(se);
1177  if (!seh) {
1178  return false;
1179  }
1180  }
1181 
1182  return Validate(seh, cs);
1183 }
1184 
1185 static bool s_IsPhage(const COrg_ref& org)
1186 {
1187  if (org.IsSetDivision() && NStr::Equal(org.GetDivision(), "PHG")) {
1188  return true;
1189  } else {
1190  return false;
1191  }
1192 }
1193 
1194 
1196 {
1197  bool has_mult = false;
1198  int first_id = 0;
1199  int phage_id = 0;
1200 
1201  for (CBioseq_CI bi(seh); bi; ++bi) {
1202  for (CSeqdesc_CI desc_ci(*bi, CSeqdesc::e_Source);
1203  desc_ci && !has_mult;
1204  ++desc_ci) {
1205  if (desc_ci->GetSource().IsSetOrg()) {
1206  const COrg_ref& org = desc_ci->GetSource().GetOrg();
1207  if (org.IsSetDb()) {
1208  ITERATE(COrg_ref::TDb, it, org.GetDb()) {
1209  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "taxon") &&
1210  (*it)->IsSetTag() && (*it)->GetTag().IsId()) {
1211  int this_id = (*it)->GetTag().GetId();
1212  if (this_id > 0) {
1213  if (s_IsPhage(org)) {
1214  phage_id = this_id;
1215  } else if (first_id == 0) {
1216  first_id = this_id;
1217  } else if (first_id != this_id) {
1218  has_mult = true;
1219  }
1220  }
1221  }
1222  }
1223  }
1224  }
1225  }
1226  }
1227  if (has_mult || (phage_id > 0 && first_id > 0)) {
1229  "There are multiple taxonIDs in this RefSeq record.",
1230  *m_TSE);
1231  }
1232 }
1233 
1234 
1236 {
1237  return *m_pEntryInfo;
1238 }
1239 
1240 
1242 {
1243  if (!m_pEntryInfo) {
1244  m_pEntryInfo.reset(new CValidatorEntryInfo());
1245  }
1246 
1247  return *m_pEntryInfo;
1248 }
1249 
1250 
1252 (const CSeq_entry_Handle& seh,
1253  const CCit_sub* cs)
1254 {
1255  _ASSERT(seh);
1256 
1257  if ( m_PrgCallback ) {
1259  if ( m_PrgCallback(&m_PrgInfo) ) {
1260  return false;
1261  }
1262  }
1263 
1264  // Check that CSeq_entry has data
1265  if (seh.Which() == CSeq_entry::e_not_set) {
1266  ERR_POST_X(2, Warning << "Seq_entry not set");
1267  return false;
1268  }
1269 
1270  Setup(seh);
1271 
1272  // Seq-submit has submission citation
1273  if (cs) {
1274  x_SetEntryInfo().SetNoPubs(false);
1276  }
1277 
1278  // Get first CBioseq object pointer for PostErr below.
1280  if (!seq) {
1282  "No Bioseqs in this entire record.", seh.GetCompleteSeq_entry()->GetSet());
1283  return true;
1284  }
1285 
1286  // If m_NonASCII is true, then this flag was set by the caller
1287  // of validate to indicate that a non ascii character had been
1288  // read from a file being used to create a CSeq_entry, that the
1289  // error had been corrected, but that the error needs to be reported
1290  // by Validate. Note, Validate is not doing anything other than
1291  // reporting an error if m_NonASCII is true;
1292  if (m_NonASCII) {
1294  "Non-ascii chars in input ASN.1 strings", *seq);
1295  // Only report the error once
1296  m_NonASCII = false;
1297  }
1298 
1299  // Iterate thru components of record and validate each
1300 
1301  // also want to know if we have gi
1302  bool has_gi = false;
1303  // also want to know if there are any nucleotide sequences
1304  bool has_nucleotide_sequence = false;
1305 
1307  bi && (!IsINSDInSep() || !has_gi || !has_nucleotide_sequence);
1308  ++bi) {
1309  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1310  if ((*it)->IsGi()) {
1311  has_gi = true;
1312  }
1313  }
1314  if (bi->IsSetInst_Mol() && bi->IsNa()) {
1315  has_nucleotide_sequence = true;
1316  }
1317  }
1318 
1319  if (IsINSDInSep() && m_pEntryInfo->IsRefSeq()) {
1320  // NOTE: We use m_IsRefSeq to indicate the actual presence of RefSeq IDs in
1321  // the record, rather than IsRefSeq(), which indicates *either* RefSeq IDs are
1322  // present *OR* the refseq flag has been used
1324  "INSD and RefSeq records should not be present in the same set", *m_TSE);
1325  }
1326 
1327 #if 0
1328  // disabled for now
1329  // look for long IDs that would collide if truncated at 30 characters
1330  vector<string> id_strings;
1332  bi;
1333  ++bi) {
1334  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1335  if (!IsNCBIFILESeqId(**it)) {
1336  string label;
1337  (*it)->GetLabel(&label);
1338  id_strings.push_back(label);
1339  }
1340  }
1341  }
1342  stable_sort (id_strings.begin(), id_strings.end());
1343  for (vector<string>::iterator id_str_it = id_strings.begin();
1344  id_str_it != id_strings.end();
1345  ++id_str_it) {
1346  string pattern = (*id_str_it).substr(0, 30);
1347  string first_id = *id_str_it;
1348  vector<string>::iterator cmp_it = id_str_it;
1349  ++cmp_it;
1350  while (cmp_it != id_strings.end() && NStr::StartsWith(*cmp_it, pattern)) {
1351  CRef<CSeq_id> id(new CSeq_id(*cmp_it));
1354  "First 30 characters of " + first_id + " and " +
1355  *cmp_it + " are identical", *(bsh.GetCompleteBioseq()));
1356  ++id_str_it;
1357  ++cmp_it;
1358  }
1359  }
1360 #endif
1361 
1362  // look for colliding feature IDs
1363  vector < int > feature_ids;
1364  for (CFeat_CI fi(GetTSEH()); fi; ++fi) {
1365  const CSeq_feat& sf = fi->GetOriginalFeature();
1366  if (sf.IsSetId() && sf.GetId().IsLocal() && sf.GetId().GetLocal().IsId()) {
1367  feature_ids.push_back(sf.GetId().GetLocal().GetId());
1368  }
1369  }
1370 
1371  if (feature_ids.size() > 0) {
1372  const CTSE_Handle& tse = seh.GetTSE_Handle ();
1373  stable_sort (feature_ids.begin(), feature_ids.end());
1374  vector <int>::iterator it = feature_ids.begin();
1375  int id = *it;
1376  ++it;
1377  while (it != feature_ids.end()) {
1378  if (*it == id) {
1379  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, id);
1380  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1382  "Colliding feature ID " + NStr::NumericToString (id), *(feat_it->GetSeq_feat()));
1383  }
1384  while (it != feature_ids.end() && *it == id) {
1385  ++it;
1386  }
1387  if (it != feature_ids.end()) {
1388  id = *it;
1389  ++it;
1390  }
1391  } else {
1392  id = *it;
1393  ++it;
1394  }
1395  }
1396  }
1397 
1398  // look for mixed gps and non-gps sets
1399  bool has_nongps = false;
1400  bool has_gps = false;
1401 
1402  for (CTypeConstIterator<CBioseq_set> si(*m_TSE); si && (!has_nongps || !has_gps); ++si) {
1403  if (si->IsSetClass()) {
1404  if (si->GetClass() == CBioseq_set::eClass_mut_set
1405  || si->GetClass() == CBioseq_set::eClass_pop_set
1406  || si->GetClass() == CBioseq_set::eClass_phy_set
1407  || si->GetClass() == CBioseq_set::eClass_eco_set
1408  || si->GetClass() == CBioseq_set::eClass_wgs_set
1409  || si->GetClass() == CBioseq_set::eClass_small_genome_set) {
1410  has_nongps = true;
1411  } else if (si->GetClass() == CBioseq_set::eClass_gen_prod_set) {
1412  has_gps = true;
1413  }
1414  }
1415  }
1416 
1417  if (has_nongps && has_gps) {
1419  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set",
1420  *m_TSE);
1421  }
1422 
1423  // count inference accessions - if there are too many, temporarily disable inference checking
1424  bool old_inference_acc_check = m_ValidateInferenceAccessions;
1426  size_t num_inferences = 0, num_accessions = 0;
1427  CFeat_CI feat_inf(seh);
1428  while (feat_inf) {
1429  FOR_EACH_GBQUAL_ON_FEATURE (qual, *feat_inf) {
1430  if ((*qual)->IsSetQual() && (*qual)->IsSetVal() && NStr::Equal((*qual)->GetQual(), "inference")) {
1431  num_inferences++;
1432  string prefix, remainder;
1433  bool same_species;
1434  vector<string> accessions = CValidError_feat::GetAccessionsFromInferenceString ((*qual)->GetVal(), prefix, remainder, same_species);
1435  for (size_t i = 0; i < accessions.size(); i++) {
1436  NStr::TruncateSpacesInPlace (accessions[i]);
1437  string acc_prefix, accession;
1438  if (CValidError_feat::GetPrefixAndAccessionFromInferenceAccession (accessions[i], acc_prefix, accession)) {
1439  if (NStr::EqualNocase (acc_prefix, "INSD") || NStr::EqualNocase (acc_prefix, "RefSeq")) {
1440  num_accessions++;
1441  }
1442  }
1443  }
1444  }
1445  }
1446  ++feat_inf;
1447  }
1448  if (/* num_inferences > 1000 || */ num_accessions > 1000) {
1449  // warn about too many inferences
1451  "Skipping validation of " + NStr::SizetToString (num_inferences) + " /inference qualifiers with "
1452  + NStr::SizetToString (num_accessions) + " accessions",
1453  *m_TSE);
1454 
1455  // disable inference checking
1457  }
1458  }
1459 
1460  // validate the main data
1461  if (seh.IsSeq()) {
1462  const CBioseq& seq2 = seh.GetCompleteSeq_entry()->GetSeq();
1463  CValidError_bioseq bioseq_validator(*this);
1464  try {
1465  bioseq_validator.ValidateBioseq(seq2);
1466  } catch ( const exception& e ) {
1468  string("Exception while validating bioseq. EXCEPTION: ") +
1469  e.what(), seq2);
1470  return true;
1471  }
1472  } else if (seh.IsSet()) {
1473  const CBioseq_set& set = seh.GetCompleteSeq_entry()->GetSet();
1474  CValidError_bioseqset bioseqset_validator(*this);
1475 
1476  try {
1477  bioseqset_validator.ValidateBioseqSet(set);
1478 
1479  } catch ( const exception& e ) {
1481  string("Exception while validating bioseq set. EXCEPTION: ") +
1482  e.what(), set);
1483  return true;
1484  }
1485  }
1486 
1487  // put flag for validating inference accessions back to original value
1488  m_ValidateInferenceAccessions = old_inference_acc_check;
1489 
1490  // validation from data collected during previous step
1491 
1492  if (!GetContext().PreprocessHugeFile) {
1493  if ( m_NumTpaWithHistory > 0 &&
1494  m_NumTpaWithoutHistory > 0 ) {
1496  "There are " +
1498  " TPAs with history and " +
1500  " without history in this record.", *seq);
1501  }
1502  if ( m_NumTpaWithoutHistory > 0 && has_gi) {
1504  "There are " +
1506  " TPAs without history in this record, but the record has a gi number assignment.", *m_TSE);
1507  }
1508  }
1509 
1510  if (IsIndexerVersion() && DoesAnyProteinHaveGeneralID() && !IsRefSeq() && has_nucleotide_sequence) {
1511  call_once(SetContext().ProteinHaveGeneralIDOnceFlag,
1512  [](CValidError_imp* imp, CSeq_entry_Handle seh2) {
1514  "INDEXER_ONLY - Protein bioseqs have general seq-id.",
1515  *(seh2.GetCompleteSeq_entry()));
1516  }, this, seh);
1517  }
1518 
1519  ReportMissingPubs(*m_TSE, cs);
1521 
1522  if (m_NumMisplacedFeatures > 1) {
1524  "There are " + NStr::SizetToString (m_NumMisplacedFeatures) + " mispackaged features in this record.",
1525  *(seh.GetCompleteSeq_entry()));
1526  } else if (m_NumMisplacedFeatures == 1) {
1528  "There is 1 mispackaged feature in this record.",
1529  *(seh.GetCompleteSeq_entry()));
1530  }
1531  if (m_NumSmallGenomeSetMisplaced > 1) {
1533  "There are " + NStr::SizetToString (m_NumSmallGenomeSetMisplaced) + " mispackaged features in this small genome set record.",
1534  *(seh.GetCompleteSeq_entry()));
1535  } else if (m_NumSmallGenomeSetMisplaced == 1) {
1537  "There is 1 mispackaged feature in this small genome set record.",
1538  *(seh.GetCompleteSeq_entry()));
1539  }
1540  if ( !GetContext().PreprocessHugeFile ) {
1541  if ( m_NumGenes == 0 && m_NumGeneXrefs > 0 ) {
1543  "There are " + NStr::SizetToString(m_NumGeneXrefs) +
1544  " gene xrefs and no gene features in this record.", *m_TSE);
1545  }
1546  }
1547  ValidateCitations (seh);
1548 
1549 
1550  if ( m_NumMisplacedGraphs > 0 ) {
1553  string("There ") + ((m_NumMisplacedGraphs > 1) ? "are " : "is ") + num +
1554  " mispackaged graph" + ((m_NumMisplacedGraphs > 1) ? "s" : "") + " in this record.",
1555  *m_TSE);
1556  }
1557 
1558  if ( IsRefSeq() && ! IsWP() ) {
1560  }
1561 
1562 
1565  if (!GetContext().PreprocessHugeFile) {
1567  }
1568 
1569  if (m_FarFetchFailure) {
1571  "Far fetch failures caused some validator tests to be bypassed",
1572  *m_TSE);
1573  }
1574 
1575  if (m_DoTaxLookup) {
1577  }
1578 
1579  // validate cit-sub
1580  if (cs) {
1582  }
1583 
1584  // optional barcode tests
1585  if (m_DoBarcodeTests) {
1586  x_DoBarcodeTests(seh);
1587  }
1588  return true;
1589 }
1590 
1591 
1593 {
1594  if (block.IsSetHup() && block.GetHup() && block.IsSetReldate() &&
1595  IsDateInPast(block.GetReldate())) {
1597  "Record release date has already passed", ss);
1598  }
1599 
1600  if (block.IsSetContact() && block.GetContact().IsSetContact()) {
1601  const CAuthor& author = block.GetContact().GetContact();
1602  if (author.IsSetAffil() && author.GetAffil().IsStd()) {
1603  ValidateAffil(author.GetAffil().GetStd(), ss, nullptr);
1604  }
1605  const CPerson_id& pid = author.GetName();
1606  if (pid.IsName()) {
1607  const CName_std& nstd = pid.GetName();
1608  string first = "";
1609  string last = "";
1610  if (nstd.IsSetLast()) {
1611  last = nstd.GetLast();
1614  "Bad last name '" + last + "'", ss);
1615  }
1616  }
1617  if (nstd.IsSetFirst()) {
1618  first = nstd.GetFirst();
1621  "Bad first name '" + first + "'", ss);
1622  }
1623  }
1624  if (first != "" && last != "" && NStr::EqualNocase(last, "last") && NStr::EqualNocase(first, "first")) {
1626  "Bad first and last name", ss);
1627  }
1628  }
1629  }
1630  if (block.IsSetCit()) {
1631  const CCit_sub& sub = block.GetCit();
1632  if (sub.IsSetAuthors()) {
1633  const CAuth_list& auth_list = sub.GetAuthors();
1634  const CAuth_list::TNames& names = auth_list.GetNames();
1635  if (names.IsStd()) {
1636  ITERATE ( CAuth_list::C_Names::TStd, name, names.GetStd() ) {
1637  if ( (*name)->GetName().IsName() ) {
1638  const CName_std& nstd = (*name)->GetName().GetName();
1639  string first = "";
1640  string last = "";
1641  if (nstd.IsSetLast()) {
1642  last = nstd.GetLast();
1645  "Bad last name '" + last + "'", ss);
1646  }
1647  }
1648  if (nstd.IsSetFirst()) {
1649  first = nstd.GetFirst();
1652  "Bad first name '" + first + "'", ss);
1653  }
1654  }
1655  if (first != "" && last != "" && NStr::EqualNocase(last, "last") && NStr::EqualNocase(first, "first")) {
1657  "Bad first and last name", ss);
1658  }
1659  }
1660  }
1661  }
1662  }
1663  }
1664 }
1665 
1666 
1668  const CSeq_submit& ss, CScope* scope)
1669 {
1670  // Check that ss is type e_Entrys
1671  if ( ss.GetData().Which() != CSeq_submit::C_Data::e_Entrys ) {
1672  return;
1673  }
1674 
1676  if (ss.IsSetSub()) {
1677  if (IsHugeFileMode()) {
1678  call_once(SetContext().SubmitBlockOnceFlag,
1679  [this, &ss](){ ValidateSubmitBlock(ss.GetSub(), ss); });
1680  }
1681  else {
1682  ValidateSubmitBlock(ss.GetSub(), ss);
1683  }
1684  }
1685 
1686  // Get CCit_sub pointer
1687  const CCit_sub* cs = &ss.GetSub().GetCit();
1688 
1689  if (ss.IsSetSub() && ss.GetSub().IsSetTool() && NStr::StartsWith(ss.GetSub().GetTool(), "Geneious")) {
1691  }
1692 
1693  // Just loop thru CSeq_entrys
1694  FOR_EACH_SEQENTRY_ON_SEQSUBMIT (se_itr, ss) {
1695  const CSeq_entry& se = **se_itr;
1696  if(se.IsSet())
1697  {
1698  const CBioseq_set &set = se.GetSet();
1699  if(set.IsSetClass() &&
1700  set.GetClass() == CBioseq_set::eClass_wgs_set)
1701  {
1703  CSeq_entry_Handle seh;
1704  seh = scope->GetSeq_entryHandle(se);
1705  Setup(seh);
1706  call_once(SetContext().WgsSetInSeqSubmitOnceFlag,
1707  [this, seh]() {
1709  "File was created as a wgs-set, but should be a batch submission instead.",
1710  seh.GetCompleteSeq_entry()->GetSet());
1711  });
1712  } else {
1713  CSeq_entry_Handle seh;
1714  seh = scope->GetSeq_entryHandle(se);
1715  Setup(seh);
1717  "File was created as a wgs-set, but should be a batch submission instead.",
1718  seh.GetCompleteSeq_entry()->GetSet());
1719  }
1720  }
1721  }
1722  Validate (se, cs, scope);
1723  }
1724 }
1725 
1726 
1728  const CSeq_annot_Handle& sah)
1729 {
1730  Setup(sah);
1731 
1732  // Iterate thru components of record and validate each
1733 
1734  CValidError_annot annot_validator(*this);
1735  annot_validator.ValidateSeqAnnot(sah);
1736 
1737  switch (sah.Which()) {
1739  {
1740  CValidError_feat feat_validator(*this);
1741  for (CFeat_CI fi (sah); fi; ++fi) {
1742  const CSeq_feat& sf = fi->GetOriginalFeature();
1743  feat_validator.ValidateSeqFeat(sf);
1744  }
1745  }
1746  break;
1747 
1749  {
1750  if (IsValidateAlignments()) {
1751  CValidError_align align_validator(*this);
1752  int order = 1;
1753  for (CAlign_CI ai(sah); ai; ++ai) {
1754  const CSeq_align& sa = ai.GetOriginalSeq_align();
1755  align_validator.ValidateSeqAlign(sa, order++);
1756  }
1757  }
1758  }
1759  break;
1760 
1762  {
1763  CValidError_graph graph_validator(*this);
1764  // for (CTypeConstIterator <CSeq_graph> gi (sa); gi; ++gi) {
1765  for (CGraph_CI gi(sah); gi; ++gi) {
1766  const CSeq_graph& sg = gi->GetOriginalGraph();
1767  graph_validator.ValidateSeqGraph(sg);
1768  }
1769  }
1770  break;
1771  default:
1772  break;
1773  }
1777 }
1778 
1779 
// Validate a single Seq-feat on its own, outside of a Seq-entry walk.
//
// feat  - feature to validate; handed to CValidError_feat::ValidateSeqFeat and
//         then to FindEmbeddedScript / FindNonAsciiText.
// scope - scope to use for this call; when null the current m_Scope is kept,
//         and if there is none a temporary CScope is created on m_ObjMgr.
//
// NOTE(review): this listing omits source lines 1796, 1802 and 1807 (the
// embedded line numbers skip them), so the statements on those lines are not
// visible here - consult the repository copy before changing this function.
1780 void CValidError_imp::Validate(const CSeq_feat& feat, CScope* scope)
1781 {
1782  // automatically restores m_Scope to its old value when we leave
1783  // the function
1784  CScopeRestorer scopeRestorer( m_Scope );
1785 
1786  if( scope ) {
1787  m_Scope.Reset(scope);
1788  }
1789  if (!m_Scope) {
1790  // set up a temporary local scope if there is no scope set already
1791  m_Scope.Reset(new CScope(*m_ObjMgr));
1792  }
1793 
      // The feature validator is given the scope chosen above before it runs.
1794  CValidError_feat feat_validator(*this);
1795  feat_validator.SetScope(*m_Scope);
      // NOTE(review): line 1796 is missing from this listing; `empty` used on the
      // next line is presumably declared there - confirm against the repository.
1797  feat_validator.SetTSE(empty);
1798  feat_validator.ValidateSeqFeat(feat);
      // Extra handling for BioSource features that carry an Org-ref.
1799  if (feat.IsSetData() && feat.GetData().IsBiosrc()) {
1800  const CBioSource& src = feat.GetData().GetBiosrc();
1801  if (src.IsSetOrg()) {
      // NOTE(review): line 1802 (the body of this branch) is missing from this
      // listing, so what is done with the Org-ref cannot be seen here.
1803  }
1804  }
1805  FindEmbeddedScript(feat);
1806  FindNonAsciiText(feat);
      // NOTE(review): line 1807 is missing from this listing; one more statement
      // presumably runs before the function returns - TODO confirm.
1808 }
1809 
1810 
1812 {
1813  // automatically restores m_Scope to its old value when we leave
1814  // the function
1815  CScopeRestorer scopeRestorer( m_Scope );
1816 
1817  if( scope ) {
1818  m_Scope.Reset(scope);
1819  }
1820  if (!m_Scope) {
1821  // set up a temporary local scope if there is no scope set already
1822  m_Scope.Reset(new CScope(*m_ObjMgr));
1823  }
1824 
1825  ValidateBioSource(src, src);
1826  if (src.IsSetOrg()) {
1828  }
1829  FindEmbeddedScript(src);
1830  FindNonAsciiText(src);
1832 }
1833 
1834 
1835 void CValidError_imp::Validate(const CPubdesc& pubdesc, CScope* scope)
1836 {
1837  // automatically restores m_Scope to its old value when we leave
1838  // the function
1839  CScopeRestorer scopeRestorer( m_Scope );
1840 
1841  if( scope ) {
1842  m_Scope.Reset(scope);
1843  }
1844  if (!m_Scope) {
1845  // set up a temporary local scope if there is no scope set already
1846  m_Scope.Reset(new CScope(*m_ObjMgr));
1847  }
1848 
1849  ValidatePubdesc(pubdesc, pubdesc);
1850  FindEmbeddedScript(pubdesc);
1851  FindNonAsciiText(pubdesc);
1852  FindCollidingSerialNumbers(pubdesc);
1853 }
1854 
1856 {
1857  CValidError_desc seqdesc_validator(*this);
1858  m_Scope.Reset(new CScope(*m_ObjMgr));
1860  seqdesc_validator.ValidateSeqDesc(desc,ctx);
1861 }
1862 
1863 
1866  void* user_data)
1867 {
1868  m_PrgCallback = callback;
1869  m_PrgInfo.m_UserData = user_data;
1870 }
1871 
1872 
1874 (const CDbtag& xref,
1875  const CSerialObject& obj,
1876  bool biosource,
1877  const CSeq_entry *ctx)
1878 {
1879  bool refseq_or_gps = IsRefSeq() || IsGPS();
1881  refseq_or_gps);
1882 
1883  const string& db = xref.IsSetDb() ? xref.GetDb() : kEmptyStr;
1884 
1887  "dbxref value " + xref.GetTag().GetStr() + " has SGML",
1888  obj, ctx);
1889  }
1892  "dbxref value " + xref.GetTag().GetStr() + " contains space character",
1893  obj, ctx);
1894  }
1895  if (flags & CValidator::eDbHasSgml) {
1897  "dbxref database " + db + " has SGML",
1898  obj, ctx);
1899  }
1900 
1901  bool isStr = false;
1902  string dbv;
1903  if (xref.IsSetTag() && xref.GetTag().IsStr()) {
1904  dbv = xref.GetTag().GetStr();
1905  isStr = true;
1906  } else if (xref.IsSetTag() && xref.GetTag().IsId()) {
1907  dbv = NStr::NumericToString(xref.GetTag().GetId());
1908  }
1909 
1912  "Illegal db_xref type " + db + " (" + dbv + ")", obj, ctx);
1913  }
1915  // capitalization is bad
1916  bool refseq_db = false, src_db = false;
1917  string correct_caps;
1918  xref.GetDBFlags(refseq_db, src_db, correct_caps);
1919  string message = "Illegal db_xref type " + db + " (" + dbv + "), legal capitalization is " + correct_caps;
1921  message += ", but should not be used on an OrgRef";
1922  } else if (flags & CValidator::eOnlyForSource) {
1923  message += ", but should only be used on an OrgRef";
1924  }
1925 
1927  } else {
1931  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on a non-RefSeq OrgRef",
1932  obj, ctx);
1933  } else {
1935  "db_xref type " + db + " (" + dbv + ") is only legal for RefSeq",
1936  obj, ctx);
1937  }
1938  } else if (flags & CValidator::eNotForSource) {
1941  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1942  obj, ctx);
1943  } else {
1945  "db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1946  obj, ctx);
1947  }
1948  } else if (flags & CValidator::eOnlyForSource) {
1950  "db_xref type " + db + " (" + dbv + ") should only be used on an OrgRef",
1951  obj, ctx);
1952  }
1953  }
1954 
1955  if (isStr && db == "GeneID") {
1957  "db_xref type " + db + " (" + dbv + ") is required to be an integer",
1958  obj, ctx);
1959  }
1960 }
1961 
1962 
1964 (TDbtags& xref_list,
1965  const CSerialObject& obj,
1966  bool biosource,
1967  const CSeq_entry *ctx)
1968 {
1969  string last_db;
1970 
1971  ITERATE( TDbtags, xref, xref_list) {
1972  if (biosource
1973  && (*xref)->IsSetDb()) {
1974  if (!NStr::IsBlank(last_db)
1975  && NStr::EqualNocase((*xref)->GetDb(), last_db)) {
1977  "BioSource uses db " + last_db + " multiple times",
1978  obj, ctx);
1979  }
1980  last_db = (*xref)->GetDb();
1981  }
1982  ValidateDbxref(**xref, obj, biosource, ctx);
1983  }
1984 }
1985 
1986 
1988 (const CPacked_seqint& packed_int,
1989  SLocCheck& lc,
1990  const CSerialObject& obj)
1991 {
1992  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
1993  lc.int_cur = (*it);
1994  lc.chk &= x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur);
1995 
1997 
1998  lc.id_prv = lc.id_cur;
1999  lc.strand_prv = lc.strand_cur;
2000  lc.int_prv = lc.int_cur;
2001  }
2002 }
2003 
2004 
2006  CConstRef<CSeq_id>& id_cur,
2007  const CSeq_interval* int_cur,
2008  ENa_strand& strand_cur)
2009 {
2010  strand_cur = int_cur->IsSetStrand() ?
2011  int_cur->GetStrand() : eNa_strand_unknown;
2012  id_cur = &int_cur->GetId();
2013  bool chk = IsValid(*int_cur, m_Scope);
2014  return chk;
2015 }
2016 
2017 
2019 {
2020  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
2021  x_ReportInvalidFuzz(**it, obj);
2022  }
2023 }
2024 
2025 
// File-local message texts about Int-fuzz limits placed at the ends of a
// sequence. NOTE(review): the statements that post these texts are not visible
// in this listing; they are presumably used by the x_ReportInvalidFuzz
// overloads that follow - confirm against the repository.

// 'space to left' / 'space to right' on a non-circular sequence.
static const string kSpaceLeftFirst(
    "Should not specify 'space to left' at first position of non-circular sequence");
static const string kSpaceRightLast(
    "Should not specify 'space to right' at last position of non-circular sequence");

// 'circle' limits away from the ends of a circular sequence.
static const string kSpaceLeftCircle(
    "Should not specify 'circle to left' except at first position of circular sequence");
static const string kSpaceRightCircle(
    "Should not specify 'circle to right' except at last position of circular sequence");
2031 
2033 {
2036  bool has_fuzz_from = false;
2037  bool has_fuzz_to = false;
2038 
2039  if (interval.IsSetFuzz_from() && interval.GetFuzz_from().IsLim()) {
2040  fuzz_from = interval.GetFuzz_from().GetLim();
2041  has_fuzz_from = true;
2042  }
2043  if (interval.IsSetFuzz_to() && interval.GetFuzz_to().IsLim()) {
2044  fuzz_to = interval.GetFuzz_to().GetLim();
2045  has_fuzz_to = true;
2046  }
2047  if (! has_fuzz_from && ! has_fuzz_to) {
2048  return;
2049  }
2050 
2051  // check for invalid fuzz on both ends of Interval
2052  if (has_fuzz_from && has_fuzz_to && fuzz_from == fuzz_to) {
2053  if (fuzz_from == CInt_fuzz::eLim_tl) {
2056  "Should not specify 'space to left' for both ends of interval", obj);
2057  }
2058  else if (fuzz_from == CInt_fuzz::eLim_tr) {
2061  "Should not specify 'space to right' for both ends of interval", obj);
2062  }
2063  else if (fuzz_from == CInt_fuzz::eLim_circle) {
2066  "Should not specify 'origin of circle' for both ends of interval", obj);
2067  }
2068  }
2069 
2070  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(interval.GetId());
2071  if (! bsh) {
2072  return;
2073  }
2074 
2076  if (bsh.IsSetInst_Topology()) {
2077  top = bsh.GetInst_Topology();
2078  }
2079 
2080  if (top != CSeq_inst::eTopology_circular) {
2081 
2082  // VR-15
2083  // look for space to left at beginning of sequence or space to right at end
2084  if (fuzz_from == CInt_fuzz::eLim_tl && interval.IsSetFrom() && interval.GetFrom() == 0) {
2086  }
2087  if (fuzz_to == CInt_fuzz::eLim_tr && interval.IsSetTo() && interval.GetTo() == bsh.GetBioseqLength() - 1) {
2089  }
2090 
2091  } else if (fuzz_from == CInt_fuzz::eLim_circle || fuzz_to == CInt_fuzz::eLim_circle) {
2092 
2093  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2094  const CSeq_feat* sfp = dynamic_cast<const CSeq_feat*>(&obj);
2095  if (sfp && sfp->IsSetExcept() && sfp->CanGetExcept_text() && NStr::FindNoCase(sfp->GetExcept_text(), "ribosomal slippage") != NPOS) {
2096  return;
2097  }
2098  }
2099 
2100  // VR-832
2101  if (fuzz_from == CInt_fuzz::eLim_circle && interval.IsSetFrom() && interval.GetFrom() != 0) {
2103  }
2104  if (fuzz_to == CInt_fuzz::eLim_circle && interval.IsSetTo() && interval.GetTo() != bsh.GetBioseqLength() - 1) {
2106  }
2107  }
2108 }
2109 
2110 
2112 {
2113  // VR-15
2114  if (!point.IsSetFuzz() || !point.GetFuzz().IsLim() ||
2115  (point.GetFuzz().GetLim() != CInt_fuzz::eLim_tl && point.GetFuzz().GetLim() != CInt_fuzz::eLim_tr) ||
2116  !point.IsSetId() || !point.IsSetPoint()) {
2117  return;
2118  }
2119  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(point.GetId());
2120  if (!bsh) {
2121  return;
2122  }
2124  return;
2125  }
2126  if (point.GetPoint() == 0 && point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl) {
2128  }
2129  if (point.GetPoint() == bsh.GetBioseqLength() - 1) {
2131  }
2132 }
2133 
2134 
// Walk the pieces of a Seq-loc and forward each interval, packed-interval or
// point piece to the matching x_ReportInvalidFuzz overload; every other kind
// of piece is ignored.
//
// loc - location whose pieces are examined.
// obj - object passed through unchanged to the per-piece overloads.
//
// NOTE(review): this listing omits source lines 2137 and 2144 (the embedded
// line numbers skip them); see the notes at each gap below.
2135 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj)
2136 {
      // NOTE(review): line 2137 is missing; the iterator `lit` used by the loop
      // below is presumably declared there over `loc` - confirm.
2138  for (; lit; ++lit) {
2139  CSeq_loc::E_Choice loc_choice = lit->Which();
2140  switch (loc_choice) {
2141  case CSeq_loc::e_Int:
2142  x_ReportInvalidFuzz(lit->GetInt(), obj);
2143  break;
      // NOTE(review): line 2144 is missing; given the GetPacked_int() call that
      // follows it is presumably the packed-interval case label - confirm.
2145  x_ReportInvalidFuzz(lit->GetPacked_int(), obj);
2146  break;
2147  case CSeq_loc::e_Pnt:
2148  x_ReportInvalidFuzz(lit->GetPnt(), obj);
2149  break;
2150  default:
2151  break;
2152  }
2153  }
2154 }
2155 
2156 
// Count how many of the elements visited by the iterator `lit` report IsMix().
//
// loc - location being inspected.
// Returns the number of elements for which IsMix() was true.
//
// NOTE(review): this listing omits source line 2160 (the embedded line numbers
// skip it); `lit` is presumably declared there as an iterator over `loc` -
// confirm against the repository before relying on the traversal order.
2157 unsigned int s_CountMix(const CSeq_loc& loc)
2158 {
2159  unsigned int num_mix = 0;
2161  for (; lit; ++lit) {
2162  if (lit->IsMix()) {
2163  num_mix++;
2164  }
2165  }
2166  return num_mix;
2167 }
2168 
2169 
2171 {
2172  lc.chk = true;
2173  lc.unmarked_strand = false;
2174  lc.mixed_strand = false;
2175  lc.has_other = false;
2176  lc.has_not_other = false;
2177  lc.id_cur = nullptr;
2178  lc.id_prv = nullptr;
2179  lc.int_cur = nullptr;
2180  lc.int_prv = nullptr;
2181  lc.strand_cur = eNa_strand_unknown;
2182  lc.strand_prv = eNa_strand_unknown;
2183  lc.prefix = prefix;
2184 }
2185 
2187 {
2188  if (lc.strand_prv != eNa_strand_other &&
2189  lc.strand_cur != eNa_strand_other) {
2190  if (lc.id_cur && lc.id_prv &&
2191  IsSameBioseq(*lc.id_cur, *lc.id_prv, m_Scope)) {
2192  if (lc.strand_prv != lc.strand_cur) {
2193  if ((lc.strand_prv == eNa_strand_plus &&
2194  lc.strand_cur == eNa_strand_unknown) ||
2195  (lc.strand_prv == eNa_strand_unknown &&
2196  lc.strand_cur == eNa_strand_plus)) {
2197  lc.unmarked_strand = true;
2198  } else {
2199  lc.mixed_strand = true;
2200  }
2201  }
2202  }
2203  }
2204  if (lc.strand_cur == eNa_strand_other) {
2205  lc.has_other = true;
2206  } else if (lc.strand_cur == eNa_strand_minus || lc.strand_cur == eNa_strand_plus) {
2207  lc.has_not_other = true;
2208  }
2209 
2210 }
2211 
// Check one Seq-loc (recursing into the members of a mix) and record the
// outcome in the running state `lc`.
//
// loc      - location, or sub-location of a mix, being checked.
// obj      - object handed to the error-reporting calls and to
//            x_CheckPackedInt.
// lc       - in/out state: current and previous id / strand / interval, the
//            range-check result `chk`, the `has_other` flag and the message
//            prefix.
// lowerSev - when true, an out-of-range report uses eDiag_Error instead of
//            eDiag_Critical. It is a by-value parameter, so clearing it below
//            affects only this invocation.
//
// Any std::exception raised while checking is caught here; the handler resets
// the current strand / id / previous-interval fields of `lc`.
//
// NOTE(review): this listing omits source lines 2244, 2254, 2262, 2277, 2282
// and 2289 (the embedded line numbers skip them); each gap is marked below.
2212 void CValidError_imp::x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev)
2213 {
2214  try {
2215  switch (loc.Which()) {
2216  case CSeq_loc::e_Int:
2217  lc.int_cur = &loc.GetInt();
2218  lc.chk = x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur);
2219  if (lc.strand_cur == eNa_strand_other) {
2220  lc.has_other = true;
2221  }
      // Range check failed and the caller asked for lowered severity: keep it
      // lowered only when the interval starts inside the sequence and ends at
      // or beyond its length; in every other case fall back to Critical.
2222  if ((!lc.chk) && lowerSev) {
2223  TSeqPos length = GetLength(loc.GetInt().GetId(), m_Scope);
2224  TSeqPos fr = loc.GetInt().GetFrom();
2225  TSeqPos to = loc.GetInt().GetTo();
2226  if (fr < length && to >= length) {
2227  // RefSeq variation feature with dbSNP xref and interval flanking the length is ERROR
2228  } else {
2229  // otherwise keep severity at REJECT
2230  lowerSev = false;
2231  }
2232  }
2233  break;
2234  case CSeq_loc::e_Pnt:
      // A point has no interval, so the previous-interval pointer is cleared.
2235  lc.strand_cur = loc.GetPnt().IsSetStrand() ?
2236  loc.GetPnt().GetStrand() : eNa_strand_unknown;
2237  if (lc.strand_cur == eNa_strand_other) {
2238  lc.has_other = true;
2239  }
2240  lc.id_cur = &loc.GetPnt().GetId();
2241  lc.chk = IsValid(loc.GetPnt(), m_Scope);
2242  lc.int_prv = nullptr;
2243  break;
      // NOTE(review): line 2244 is missing; given the GetPacked_pnt() calls that
      // follow it is presumably the packed-point case label - confirm.
2245  lc.strand_cur = loc.GetPacked_pnt().IsSetStrand() ?
2246  loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
2247  if (lc.strand_cur == eNa_strand_other) {
2248  lc.has_other = true;
2249  }
2250  lc.id_cur = &loc.GetPacked_pnt().GetId();
2251  lc.chk = IsValid(loc.GetPacked_pnt(), m_Scope);
2252  lc.int_prv = nullptr;
2253  break;
      // NOTE(review): line 2254 is missing; given the GetPacked_int() call that
      // follows it is presumably the packed-interval case label - confirm.
2255  x_CheckPackedInt(loc.GetPacked_int(), lc, obj);
2256  break;
2257  case CSeq_loc::e_Null:
2258  break;
2259  case CSeq_loc::e_Mix:
      // Recurse into every member of the mix, sharing the same `lc` state.
2260  for (auto l : loc.GetMix().Get()) {
2261  x_CheckLoc(*l, obj, lc, lowerSev);
      // NOTE(review): line 2262 is missing; a further per-member statement
      // presumably follows the recursive call - TODO confirm.
2263  }
2264  break;
2265  default:
      // Any other choice: mark the strand as 'other' and drop the id/interval.
2266  lc.strand_cur = eNa_strand_other;
2267  lc.id_cur = nullptr;
2268  lc.int_prv = nullptr;
2269  break;
2270  }
      // Report an out-of-range location; severity is Critical unless lowerSev
      // is still set at this point.
2271  if (!lc.chk) {
2272  string lbl = GetValidatorLocationLabel (loc, *m_Scope);
2273  EDiagSev sev = eDiag_Critical;
2274  if (lowerSev) {
2275  sev = eDiag_Error;
2276  }
      // NOTE(review): line 2277 is missing; it presumably opens the reporting
      // call that takes `sev` and the message on the next line - confirm.
2278  lc.prefix + ": SeqLoc [" + lbl + "] out of range", obj);
2279  }
2280 
      // For everything except a NULL location, the current strand and id become
      // the "previous" values seen by the next piece.
2281  if (loc.Which() != CSeq_loc::e_Null) {
      // NOTE(review): line 2282 is missing; a statement presumably runs here
      // before the previous-state update below - TODO confirm.
2283 
2284  lc.strand_prv = lc.strand_cur;
2285  lc.id_prv = lc.id_cur;
2286  }
2287  } catch( const exception& e ) {
2288  string label = GetValidatorLocationLabel(loc, *m_Scope);
      // NOTE(review): line 2289 is missing; it presumably opens the reporting
      // call whose message text is on the next two lines - confirm.
2290  "Exception caught while validating location " +
2291  label + ". Exception: " + e.what(), obj);
2292 
2293  lc.strand_cur = eNa_strand_other;
2294  lc.id_cur = nullptr;
2295  lc.int_prv = nullptr;
2296  }
2297 }
2298 
2300 (const CSeq_loc& loc,
2301  const CBioseq_Handle& seq,
2302  bool report_abutting,
2303  const string& prefix,
2304  const CSerialObject& obj,
2305  bool lowerSev)
2306 {
2307  SLocCheck lc;
2308 
2310 
2311  x_CheckLoc(loc, obj, lc, lowerSev);
2312 
2313  if (lc.has_other && lc.has_not_other) {
2314  string label = GetValidatorLocationLabel(loc, *m_Scope);
2316  prefix + ": Inconsistent use of other strand SeqLoc [" + label + "]", obj);
2317  } else if (lc.has_other && NStr::Equal(prefix, "Location")) {
2320  "Strand 'other' in location", obj);
2321  }
2322 
2323  x_ReportInvalidFuzz(loc, obj);
2324 
2328  "Duplicate exons in location", obj);
2329  }
2330 
2331  if (s_CountMix(loc) > 1) {
2332  string label;
2333  loc.GetLabel(&label);
2335  prefix + ": SeqLoc [" + label + "] has nested SEQLOC_MIX elements",
2336  obj);
2337  }
2338 
2339  // Warn if different parts of a seq-loc refer to the same bioseq using
2340  // different id types (e.g. gi and accession)
2341  ValidateSeqLocIds(loc, obj);
2342 
2343  bool trans_splice = false;
2344  bool circular_rna = false;
2345  bool exception = false;
2346  const CSeq_feat* sfp = nullptr;
2347  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2348  sfp = dynamic_cast<const CSeq_feat*>(&obj);
2349  }
2350  if (sfp) {
2351  // primer_bind intervals MAY be on opposite strands
2353  lc.mixed_strand = false;
2354  lc.unmarked_strand = false;
2355  }
2356 
2357  exception = sfp->IsSetExcept() ? sfp->GetExcept() : false;
2358  if (exception && sfp->CanGetExcept_text()) {
2359  if (NStr::FindNoCase(sfp->GetExcept_text(), "trans-splicing") != NPOS) {
2360  // trans splicing exception turns off both mixed_strand and
2361  // out_of_order messages
2362  trans_splice = true;
2363  } else if (NStr::FindNoCase(sfp->GetExcept_text(), "circular RNA") != NPOS) {
2364  // circular RNA exception turns off out_of_order message
2365  circular_rna = true;
2366  }
2367  }
2368  }
2369 
2370  string loc_lbl;
2371  if (report_abutting && (!sfp || !CSeqFeatData::AllowAdjacentIntervals(sfp->GetData().GetSubtype())) &&
2373  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2374 
2375  EDiagSev sev = exception ? eDiag_Warning : eDiag_Error;
2377  prefix + ": Adjacent intervals in SeqLoc [" +
2378  loc_lbl + "]", obj);
2379  }
2380 
2381  if (trans_splice && !NStr::Equal(prefix, "Product")) {
2382  CSeq_loc_CI li(loc);
2383  ++li;
2384  if (!li) {
2385  PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadTranssplicedInterval, "Trans-spliced feature should have multiple intervals", obj);
2386  }
2387  return;
2388  }
2389 
2390  bool ordered = true;
2391  bool circular = false;
2392  if ( seq &&
2393  seq.IsSetInst() && seq.GetInst().IsSetTopology() &&
2395  circular = true;
2396  }
2397  try {
2398  if (m_Scope && (!sfp || CSeqFeatData::RequireLocationIntervalsInBiologicalOrder(sfp->GetData().GetSubtype())) && !circular) {
2400  }
2401  } catch ( const CException& ex) {
2402  string label;
2403  loc.GetLabel(&label);
2405  "Exception caught while validating location " +
2406  label + ". Exception: " + ex.what(), obj);
2407  }
2408 
2409  if (lc.mixed_strand || lc.unmarked_strand || !ordered) {
2410  if (loc_lbl.empty()) {
2411  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2412  }
2413  if (lc.mixed_strand) {
2414  if (IsSmallGenomeSet()) {
2416  prefix + ": Mixed strands in SeqLoc ["
2417  + loc_lbl + "] in small genome set - set trans-splicing exception if appropriate", obj);
2418  } else {
2419  EDiagSev sev = eDiag_Error;
2420  if (IsGeneious() || (sfp && sequence::IsPseudo(*sfp, *m_Scope))) {
2421  sev = eDiag_Warning;
2422  }
2424  prefix + ": Mixed strands in SeqLoc ["
2425  + loc_lbl + "]", obj);
2426  }
2427  } else if (lc.unmarked_strand) {
2429  prefix + ": Mixed plus and unknown strands in SeqLoc ["
2430  + loc_lbl + "]", obj);
2431  }
2432  if (!ordered && !circular_rna) {
2433  if (IsSmallGenomeSet()) {
2435  prefix + ": Intervals out of order in SeqLoc [" +
2436  loc_lbl + "]", obj);
2437  } else {
2439  prefix + ": Intervals out of order in SeqLoc [" +
2440  loc_lbl + "]", obj);
2441  }
2442  }
2443  return;
2444  }
2445 
2446  if ( seq &&
2447  seq.IsSetInst_Repr() &&
2448  seq.GetInst_Repr() != CSeq_inst::eRepr_seg ) {
2449  return;
2450  }
2451 
2452  // Check for intervals out of order on segmented Bioseq
2453  if ( seq && BadSeqLocSortOrder(seq, loc) && !circular_rna ) {
2454  if (loc_lbl.empty()) {
2455  loc.GetLabel(&loc_lbl);
2456  }
2458  prefix + "Intervals out of order in SeqLoc [" +
2459  loc_lbl + "]", obj);
2460  }
2461 
2462  // Check for mixed strand on segmented Bioseq
2463  if ( IsMixedStrands(loc) ) {
2464  if (loc_lbl.empty()) {
2465  loc.GetLabel(&loc_lbl);
2466  }
2468  prefix + ": Mixed strands in SeqLoc [" +
2469  loc_lbl + "]", obj);
2470  }
2471 }
2472 
2473 
2475 {
2476  if (!SeqIsPatent(seq)) {
2477  m_BioseqWithNoSource.push_back(CConstRef<CBioseq>(&seq));
2478  }
2479 }
2480 
2481 
2483 {
2484  if (!SeqIsPatent (seq)) {
2486  "The product name is missing from this protein.", *(seq.GetCompleteBioseq()));
2487  }
2488 }
2489 
2490 
2492 {
2493  bool wgs = false;
2494 
2495  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2496  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2497  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2498  wgs = true;
2499  break;
2500  }
2501  }
2502  if (!wgs) {
2503  return false;
2504  }
2505 
2506  bool is_other = false;
2507  bool has_gi = false;
2508 
2509  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2510  if ((*it)->IsOther()) {
2511  is_other = true;
2512  break;
2513  } else if ((*it)->IsGi()) {
2514  has_gi = true;
2515  break;
2516  }
2517  }
2518  if (!is_other || has_gi) {
2519  return false;
2520  }
2521 
2522  return true;
2523 }
2524 
2525 
2527 {
2528  bool tsa = false;
2529 
2530  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2531  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2532  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
2533  tsa = true;
2534  break;
2535  }
2536  }
2537  if (!tsa) {
2538  return false;
2539  }
2540 
2541  bool is_other = false;
2542  bool has_gi = false;
2543 
2544  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2545  if ((*it)->IsOther()) {
2546  is_other = true;
2547  break;
2548  } else if ((*it)->IsGi()) {
2549  has_gi = true;
2550  break;
2551  }
2552  }
2553  if (!is_other || has_gi) {
2554  return false;
2555  }
2556 
2557  return true;
2558 }
2559 
2560 
2562 {
2563  if (GetContext().PreprocessHugeFile) {
2564  if (m_pEntryInfo->IsNoBioSource() && !GetContext().IsPatent && !GetContext().IsPDB) {
2565  return;
2566  }
2567  }
2568  else if (m_pEntryInfo->IsNoBioSource() && !m_pEntryInfo->IsPatent() && !m_pEntryInfo->IsPDB()) {
2570  "No source information included on this record.", se);
2571 
2572  if (!GetContext().PostprocessHugeFile) {
2573  return;
2574  }
2575  }
2576 
2577  size_t num_no_source = m_BioseqWithNoSource.size();
2578 
2579  for ( size_t i = 0; i < num_no_source; ++i ) {
2581  "No organism name included in the source. Other qualifiers may exist.",
2582  *(m_BioseqWithNoSource[i]));
2583  }
2584 }
2585 
2586 
2588 {
2589  CConstRef<CSeq_feat> feat;
2590 
2592 
2593  if ( bsh ) {
2594  if ( IsNT() && m_TSE ) {
2595  // In case of a NT bioseq limit the search to features packaged on the
2596  // NT (we assume features have been pulled from the segments to the NT).
2598  sel.SetByProduct()
2600  CFeat_CI fi(bsh, sel);
2601  if ( fi ) {
2602  // return the first one (should be the one packaged on the
2603  // nuc-prot set).
2604  feat.Reset(&(fi->GetOriginalFeature()));
2605  }
2606  } else {
2608  sel.SetByProduct();
2609  CFeat_CI fi(bsh, sel);
2610  if ( fi ) {
2611  // return the first one (should be the one packaged on the
2612  // nuc-prot set).
2613  feat.Reset(&(fi->GetOriginalFeature()));
2614  }
2615  }
2616  }
2617 
2618  return feat;
2619 }
2620 
2621 
2623 {
2625  return GetmRNAGivenProduct(bsh);
2626 }
2627 
2628 
2630 {
2631  CConstRef<CSeq_feat> feat;
2632  if ( bsh ) {
2633  // In case of a NT bioseq limit the search to features packaged on the
2634  // NT (we assume features have been pulled from the segments to the NT).
2635  CSeq_entry_Handle limit;
2636  if ( IsNT() && m_TSE ) {
2637  limit = m_Scope->GetSeq_entryHandle(*m_TSE);
2638  }
2639 
2640  if (limit) {
2642  sel.SetByProduct() .SetLimitTSE(limit);
2643  CFeat_CI fi(bsh, sel);
2644  if ( fi ) {
2645  // return the first one (should be the one packaged on the
2646  // nuc-prot set).
2647  feat.Reset(&(fi->GetOriginalFeature()));
2648  }
2649  } else {
2651  sel.SetByProduct();
2652  CFeat_CI fi(bsh, sel);
2653  if ( fi ) {
2654  // return the first one (should be the one packaged on the
2655  // nuc-prot set).
2656  feat.Reset(&(fi->GetOriginalFeature()));
2657  }
2658  }
2659  }
2660 
2661  return feat;
2662 }
2663 
2664 
2666 (const CBioseq& seq,
2667  CBioseq_set::EClass clss)
2668 {
2669  const CSeq_entry* parent = nullptr;
2670  for ( parent = seq.GetParentEntry();
2671  parent;
2672  parent = parent->GetParentEntry() ) {
2673  if ( parent->IsSet() ) {
2674  const CBioseq_set& set = parent->GetSet();
2675  if ( set.IsSetClass() && set.GetClass() == clss ) {
2676  break;
2677  }
2678  }
2679  }
2680  return parent;
2681 }
2682 
2683 
2684 bool CValidError_imp::IsSerialNumberInComment(const string& comment)
2685 {
2686  size_t pos = comment.find('[', 0);
2687  while ( pos != string::npos ) {
2688  ++pos;
2689  bool okay = true;
2690  if ( isdigit((unsigned char) comment[pos]) ) {
2691  // skip if first character after bracket is 0
2692  if (comment[pos] == '0') {
2693  okay = false;
2694  }
2695  while ( isdigit((unsigned char) comment[pos]) ) {
2696  ++pos;
2697  }
2698  if ( comment[pos] == ']' && okay ) {
2699  return true;
2700  }
2701  }
2702 
2703  pos = comment.find('[', pos);
2704  }
2705  return false;
2706 }
2707 
2708 
2710 {
2711  // okay to have far RefSeq product, but only if genomic product set
2712  if ( sid && sid->IsOther() ) {
2713  if ( IsGPS() ) {
2714  return false;
2715  }
2716  }
2717  // or just a bioseq
2718  if ( GetTSE().IsSeq() ) {
2719  return false;
2720  }
2721 
2722  // or in a standalone Seq-annot
2723  if (IsStandaloneAnnot() ) {
2724  return false;
2725  }
2726  return true;
2727 }
2728 
2729 
2731  vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
2732  vector<string>& published_labels, vector<string>& unpublished_labels)
2733 {
2734  FOR_EACH_SEQDESC_ON_SEQENTRY (it, se) {
2735  if ((*it)->IsPub()) {
2736  CCleanup::GetPubdescLabels ((*it)->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2737  }
2738  }
2739 
2740  if (se.IsSet()) {
2741  FOR_EACH_SEQENTRY_ON_SEQSET (it, se.GetSet()) {
2742  s_CollectPubDescriptorLabels (**it, pmids, muids, serials, published_labels, unpublished_labels);
2743  }
2744  }
2745 }
2746 
2747 
2749 {
2750  vector<TEntrezId> pmids;
2751  vector<TEntrezId> muids;
2752  vector<int> serials;
2753  vector<string> published_labels;
2754  vector<string> unpublished_labels;
2755 
2756  // collect labels for pubs on record
2757  s_CollectPubDescriptorLabels (*(seh.GetCompleteSeq_entry()), pmids, muids, serials, published_labels, unpublished_labels);
2758 
2760  while (feat) {
2761  CCleanup::GetPubdescLabels (feat->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2762  ++feat;
2763  }
2764 
2765  // now examine citations to determine whether they match a pub on the record
2766  CFeat_CI f (seh);
2767  while (f) {
2768  if (f->IsSetCit() && f->GetCit().IsPub()) {
2769  ITERATE (CPub_set::TPub, cit_it, f->GetCit().GetPub()) {
2770  bool found = false;
2771 
2772  if ((*cit_it)->IsPmid()) {
2773  vector<TEntrezId>::iterator it = pmids.begin();
2774  while (it != pmids.end() && !found) {
2775  if (*it == (*cit_it)->GetPmid()) {
2776  found = true;
2777  }
2778  ++it;
2779  }
2780  if (!found) {
2782  "Citation on feature refers to uid ["
2783  + NStr::NumericToString((*cit_it)->GetPmid().Get())
2784  + "] not on a publication in the record",
2785  f->GetOriginalFeature());
2786  }
2787  } else if ((*cit_it)->IsMuid()) {
2788  vector<TEntrezId>::iterator it = muids.begin();
2789  while (it != muids.end() && !found) {
2790  if (*it == (*cit_it)->GetMuid()) {
2791  found = true;
2792  }
2793  ++it;
2794  }
2795  if (!found) {
2797  "Citation on feature refers to uid ["
2798  + NStr::NumericToString((*cit_it)->GetMuid())
2799  + "] not on a publication in the record",
2800  f->GetOriginalFeature());
2801  }
2802  } else if ((*cit_it)->IsEquiv()) {
2803  continue;
2804  } else {
2805  string label;
2806  (*cit_it)->GetLabel(&label, CPub::eContent, CPub::fLabel_Unique);
2807 
2808  if (NStr::EndsWith (label, ">")) {
2809  label = label.substr(0, label.length() - 2);
2810  }
2811  if(NStr::EndsWith (label, "|")) {
2812  label = label.substr(0, label.length() - 1);
2813  }
2814  if (NStr::EndsWith (label, " ")) {
2815  label = label.substr(0, label.length() - 1);
2816  }
2817  size_t len = label.length();
2818  vector<string>::iterator unpub_it = unpublished_labels.begin();
2819  while (unpub_it != unpublished_labels.end() && !found) {
2820  size_t it_len =(*unpub_it).length();
2821  if (NStr::EqualNocase (*unpub_it, 0, it_len > len ? len : it_len, label)) {
2822  found = true;
2823  }
2824  ++unpub_it;
2825  }
2826  vector<string>::iterator pub_it = published_labels.begin();
2827 
2828  while (pub_it != published_labels.end() && !found) {
2829  size_t it_len =(*pub_it).length();
2830  if (NStr::EqualNocase (*pub_it, 0, it_len > len ? len : it_len, label)) {
2832  "Citation on feature needs to be updated to published uid",
2833  f->GetOriginalFeature());
2834  found = true;
2835  }
2836  ++pub_it;
2837  }
2838  if (!found) {
2840  "Citation on feature refers to a publication not in the record",
2841  f->GetOriginalFeature());
2842  }
2843  }
2844  }
2845  }
2846  ++f;
2847  }
2848 }
2849 
2850 
2851 // =============================================================================
2852 // Private
2853 // =============================================================================
2854 
2855 
2856 
2858 {
2860  for( ; it; ++it) {
2861  const string& str = *it;
2862  FOR_EACH_CHAR_IN_STRING(c_it, str) {
2863  const char& ch = *c_it;
2864  unsigned char chu = ch;
2865  if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2867  "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in item", obj);
2868  break;
2869  }
2870  }
2871  }
2872 }
2873 
2874 
2876 {
2877  class CScriptTagTextFsm : public CTextFsm<int>
2878  {
2879  public:
2880  CScriptTagTextFsm() {
2881  const char * script_tags[] = {
2882  "<script", "<object", "<applet", "<embed", "<form",
2883  "javascript:", "vbscript:"};
2884  ITERATE_0_IDX(idx, ArraySize(script_tags)) {
2885  AddWord(script_tags[idx], true);
2886  }
2887  Prime();
2888  }
2889 
2890  // Returns true if the given string matches any of the strings
2891  // in the fsm anywhere.
2892  bool DoesStrHaveFsmHits(const string &str) {
2893  int state = GetInitialState();
2894  ITERATE(string, str_it, str) {
2895  state = GetNextState(state, *str_it);
2896  if( IsMatchFound(state) ) {
2897  return true;
2898  }
2899  }
2900 
2901  return false;
2902  }
2903  };
2904  static CScriptTagTextFsm s_ScriptTagFsm;
2905 
2906 
2908  for( ; it; ++it) {
2909  if (s_ScriptTagFsm.DoesStrHaveFsmHits(*it)) {
2911  "Script tag found in item", obj);
2912  return;
2913  }
2914 }
2915 }
2916 
2917 
2918 bool CValidError_imp::IsMixedStrands(const CSeq_loc& loc)
2919 {
2920  if ( SeqLocCheck(loc, m_Scope) == eSeqLocCheck_warning ) {
2921  return false;
2922  }
2923 
2924  CSeq_loc_CI curr(loc);
2925  if ( !curr ) {
2926  return false;
2927  }
2928  CSeq_loc_CI prev = curr;
2929  ++curr;
2930 
2931  while ( curr ) {
2932  ENa_strand curr_strand = curr.GetStrand();
2933  ENa_strand prev_strand = prev.GetStrand();
2934 
2935  if ( (prev_strand == eNa_strand_minus &&
2936  curr_strand != eNa_strand_minus) ||
2937  (prev_strand != eNa_strand_minus &&
2938  curr_strand == eNa_strand_minus) ) {
2939  return true;
2940  }
2941 
2942  prev = curr;
2943  ++curr;
2944  }
2945 
2946  return false;
2947 }
2948 
2949 
2950 static bool s_SeqLocHasGI (const CSeq_loc& loc)
2951 {
2952  bool rval = false;
2953 
2954  for ( CSeq_loc_CI it(loc); it && !rval; ++it ) {
2955  if (it.GetSeq_id().IsGi()) {
2956  rval = true;
2957  }
2958  }
2959  return rval;
2960 }
2961 
2962 
2964 {
2965  m_TSEH = seh;
2967  m_GeneCache.Clear();
2968 }
2969 
2970 
2972 {
2974  return true;
2975  } else {
2976  return false;
2977  }
2978 }
2979 
2980 
2982 {
2983  if (se.IsSeq()) {
2984  return 1;
2985  } else if (!se.IsSet()) {
2986  return 0;
2987  }
2988  if (se.GetSet().IsSetClass()) {
2991  return 1;
2992  }
2993  }
2994  size_t count = 0;
2995  if (se.GetSet().IsSetSeq_set()) {
2996  for (auto it = se.GetSet().GetSeq_set().begin(); it != se.GetSet().GetSeq_set().end(); it++) {
2997  count += s_CountTopSetSiblings(**it);
2998  }
2999  }
3000  return count;
3001 }
3002 
3003 
3005 {
3006  // "Save" the Seq-entry
3007  SetTSE(seh);
3008 
3011 
3012  // If no Pubs/BioSource in CSeq_entry, post only one error
3013  if (GetContext().PreprocessHugeFile) {
3014  x_SetEntryInfo().SetNoPubs(GetContext().NoPubsFound);
3015  x_SetEntryInfo().SetNoCitSubPubs(GetContext().NoCitSubsFound);
3016  x_SetEntryInfo().SetNoBioSource(GetContext().NoBioSource);
3017  } else {
3019  x_SetEntryInfo().SetNoPubs(!pub);
3020  while (pub && !pub->IsSub()) {
3021  ++pub;
3022  }
3026  }
3027 
3028 
3029  // Look for genomic product set
3031  if (si->IsSetClass ()) {
3032  if (si->GetClass () == CBioseq_set::eClass_gen_prod_set) {
3033  x_SetEntryInfo().SetGPS();
3034  }
3035  if (si->GetClass () == CBioseq_set::eClass_small_genome_set) {
3037  }
3038  }
3039  }
3040 
3041  // Examine all Seq-ids on Bioseqs
3042  for (CTypeConstIterator <CBioseq> bi (*m_TSE); bi; ++bi) {
3043  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, *bi) {
3044  const CSeq_id& sid = **sid_itr;
3045  const CTextseq_id* tsid = sid.GetTextseq_Id();
3046  CSeq_id::E_Choice typ = sid.Which();
3047  switch (typ) {
3048  case CSeq_id::e_not_set:
3049  break;
3050  case CSeq_id::e_Local:
3051  break;
3052  case CSeq_id::e_Gibbsq:
3053  break;
3054  case CSeq_id::e_Gibbmt:
3055  break;
3056  case CSeq_id::e_Giim:
3057  break;
3058  case CSeq_id::e_Genbank:
3061  x_SetEntryInfo().SetGED();
3062  break;
3063  case CSeq_id::e_Embl:
3065  x_SetEntryInfo().SetGED();
3066  x_SetEntryInfo().SetEmbl();
3067  break;
3068  case CSeq_id::e_Pir:
3069  break;
3070  case CSeq_id::e_Swissprot:
3071  break;
3072  case CSeq_id::e_Patent:
3074  break;
3075  case CSeq_id::e_Other:
3077  // and do RefSeq subclasses up front as well
3078  if (sid.GetOther().IsSetAccession()) {
3079  string acc = sid.GetOther().GetAccession().substr(0, 3);
3080  if (acc == "NC_") {
3081  m_IsNC = true;
3082  } else if (acc == "NG_") {
3083  m_IsNG = true;
3084  } else if (acc == "NM_") {
3085  m_IsNM = true;
3086  } else if (acc == "NP_") {
3087  m_IsNP = true;
3088  } else if (acc == "NR_") {
3089  m_IsNR = true;
3090  } else if (acc == "NZ_") {
3091  m_IsNZ = true;
3092  } else if (acc == "NS_") {
3093  m_IsNS = true;
3094  } else if (acc == "NT_") {
3095  m_IsNT = true;
3096  } else if (acc == "NW_") {
3097  m_IsNW = true;
3098  } else if (acc == "WP_") {
3099  m_IsWP = true;
3100  } else if (acc == "XR_") {
3101  m_IsXR = true;
3102  }
3103  }
3104  break;
3105  case CSeq_id::e_General:
3106  if ((*bi).IsAa() && !sid.GetGeneral().IsSkippable()) {
3108  }
3109  break;
3110  case CSeq_id::e_Gi:
3111  x_SetEntryInfo().SetGI();
3113  break;
3114  case CSeq_id::e_Ddbj:
3116  x_SetEntryInfo().SetGED();
3117  x_SetEntryInfo().SetDdbj();
3118  break;
3119  case CSeq_id::e_Prf:
3120  break;
3121  case CSeq_id::e_Pdb:
3122  x_SetEntryInfo().SetPDB();
3123  break;
3124  case CSeq_id::e_Tpg:
3126  break;
3127  case CSeq_id::e_Tpe:
3128  x_SetEntryInfo().SetTPE();
3130  break;
3131  case CSeq_id::e_Tpd:
3133  break;
3134  case CSeq_id::e_Gpipe:
3136  break;
3137  default:
3138  break;
3139  }
3140  if ( tsid && tsid->IsSetAccession() && tsid->IsSetVersion() && tsid->GetVersion() >= 1 ) {
3142  }
3143  if (typ != CSeq_id::e_Local && typ != CSeq_id::e_General) {
3145  }
3146  }
3147  }
3148 
3149  // search all source descriptors for genomic source
3150  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_Source);
3151  desc_ci && !m_pEntryInfo->IsGenomic();
3152  ++desc_ci) {
3153  if (desc_ci->GetSource().IsSetGenome()
3154  && desc_ci->GetSource().GetGenome() == CBioSource::eGenome_genomic) {
3156  }
3157  }
3158 
3159  // search genome build and annotation pipeline user object descriptors
3160  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_User);
3161  desc_ci && !m_pEntryInfo->IsGpipe();
3162  ++desc_ci) {
3163  if ( desc_ci->GetUser().IsSetType() ) {
3164  const CUser_object& obj = desc_ci->GetUser();
3165  const CObject_id& oi = obj.GetType();
3166  if ( ! oi.IsStr() ) continue;
3167  if ( NStr::CompareNocase(oi.GetStr(), "GenomeBuild") == 0 ) {
3169  } else if ( NStr::CompareNocase(oi.GetStr(), "StructuredComment") == 0 ) {
3170  ITERATE (CUser_object::TData, field, obj.GetData()) {
3171  if ((*field)->IsSetLabel() && (*field)->GetLabel().IsStr()) {
3172  if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "Annotation Pipeline")) {
3173  if (NStr::EqualNocase((*field)->GetData().GetStr(), "NCBI eukaryotic genome annotation pipeline")) {
3175  }
3176  }
3177  }
3178  }
3179  }
3180  }
3181  }
3182 
3183  // examine features for location gi, product gi, and locus tag
3184  for (CFeat_CI feat_ci (seh);
3186  ++feat_ci) {
3187  if (s_SeqLocHasGI(feat_ci->GetLocation())) {
3189  }
3190  if (feat_ci->IsSetProduct() && s_SeqLocHasGI(feat_ci->GetProduct())) {
3192  }
3193  if (feat_ci->IsSetData() && feat_ci->GetData().IsGene()
3194  && feat_ci->GetData().GetGene().IsSetLocus_tag()
3195  && !NStr::IsBlank (feat_ci->GetData().GetGene().GetLocus_tag())) {
3197  }
3198  }
3199 
3200  if ( m_PrgCallback ) {
3201  m_NumAlign = 0;
3202  for (CTypeConstIterator<CSeq_align> i(*m_TSE); i; ++i) {
3203  m_NumAlign++;
3204  }
3205  m_NumAnnot = 0;
3206  for (CTypeConstIterator<CSeq_annot> i(*m_TSE); i; ++i) {
3207  m_NumAnnot++;
3208  }
3209  m_NumBioseq = 0;
3210  for (CTypeConstIterator<CBioseq> i(*m_TSE); i; ++i) {
3211  m_NumBioseq++;
3212  }
3213  m_NumBioseq_set = 0;
3214  for (CTypeConstIterator<CBioseq_set> i(*m_TSE); i; ++i) {
3215  m_NumBioseq_set++;
3216  }
3217  m_NumDesc = 0;
3218  for (CTypeConstIterator<CSeqdesc> i(*m_TSE); i; ++i) {
3219  m_NumDesc++;
3220  }
3221  m_NumDescr = 0;
3222  for (CTypeConstIterator<CSeq_descr> i(*m_TSE); i; ++i) {
3223  m_NumDescr++;
3224  }
3225  m_NumFeat = 0;
3226  for (CTypeConstIterator<CSeq_feat> i(*m_TSE); i; ++i) {
3227  m_NumFeat++;
3228  }
3229  m_NumGraph = 0;
3230  for (CTypeConstIterator<CSeq_graph> i(*m_TSE); i; ++i) {
3231  m_NumGraph++;
3232  }
3235  m_NumGraph;
3236  }
3237 
3238  if (CNcbiApplication::Instance()->GetProgramDisplayName() == "table2asn") {
3239  m_IsTbl2Asn = true;
3240  }
3241 }
3242 
3243 
3245 {
      // Replace m_Scope with a brand-new CScope built on m_ObjMgr, register
      // `se` in it as a top-level entry, then add the default data loaders.
3246  m_Scope.Reset(new CScope(*m_ObjMgr));
      // NOTE(review): constness of `se` is cast away here; presumably
      // AddTopLevelSeqEntry only accepts a non-const entry - confirm that
      // the entry is not modified through the scope.
3247  m_Scope->AddTopLevelSeqEntry(*const_cast<CSeq_entry*>(&se));
3248  m_Scope->AddDefaults();
3249 }
3250 
3251 
3253 {
3254  m_IsStandaloneAnnot = true;
3255  if (! m_Scope) {
3256  m_Scope.Reset(& sah.GetScope());
3257  }
3259  m_TSE.Reset(new CSeq_entry); // set a dummy Seq-entry
3261 }
3262 
3263 
3265 {
3266  m_Scope.Reset(new CScope(*m_ObjMgr));
3267  CRef<CSeq_entry> tmp_entry(new CSeq_entry());
3268  tmp_entry->SetSeq().Assign(seq);
3269  m_TSE.Reset(tmp_entry);
3271  Setup(m_TSEH);
3272  return m_TSEH;
3273 }
3274 
3275 
3277 (const CSeq_loc& loc,
3278  const CSerialObject& obj)
3279 {
3280  for ( CSeq_loc_CI lit(loc); lit; ++lit ) {
3281  const CSeq_id& id1 = lit.GetSeq_id();
3282  CSeq_loc_CI lit2 = lit;
3283  for ( ++lit2; lit2; ++lit2 ) {
3284  const CSeq_id& id2 = lit2.GetSeq_id();
3285  if ( IsSameBioseq(id1, id2, m_Scope) && !id1.Match(id2) ) {
3288  "Two ids refer to the same bioseq but are of "
3289  "different type", obj);
3290  }
3291  }
3292  if (IsTemporary(id1)) {
3294  "Feature locations should not use Seq-ids that will be stripped during ID load", obj);
3295  }
3296  }
3299  "Feature location intervals should all be on the same sequence", obj);
3300  }
3301 }
3302 
3303 
3305 {
      // Thin wrapper: the decision is made entirely by the free function of
      // the same name in namespace validator.
3306  return validator::IsInOrganelleSmallGenomeSet(id, scope);
3307 }
3308 
3309 
3310 // all ids in a location should point to the same sequence, unless the sequences are
3311 // in an organelle small genome set
3312 bool CValidError_imp::BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope)
3313 {
3314  return validator::BadMultipleSequenceLocation(loc, scope);
3315 }
3316 
3317 
3318 bool CValidError_imp::x_IsFarFetchFailure (const CSeq_loc& loc)
3319 {
3321  && IsFarLocation(loc, GetTSEH())) {
3322  return true;
3323  } else {
3324  return false;
3325  }
3326 }
3327 
3328 
3329 //LCOV_EXCL_START
3330 // not used by asnvalidate, used by external programs
3332 {
3333  bool rval = false;
3334  Setup(se);
3335  CValidError_bioseq bioseq_validator(*this);
3337  while (bi) {
3338  rval |= bioseq_validator.GetTSANStretchErrors(*(bi->GetCompleteBioseq()));
3339  ++bi;
3340  }
3341  return rval;
3342 }
3343 
3344 
3346 {
      // Single-Bioseq variant: Setup(seq) yields the entry handle for `seq`,
      // whose complete Bioseq is then passed to the bioseq validator.
      // The validator's boolean result is returned unchanged.
3347  CSeq_entry_Handle seh = Setup(seq);
3348  CValidError_bioseq bioseq_validator(*this);
3349  return bioseq_validator.GetTSANStretchErrors(*(seh.GetSeq().GetCompleteBioseq()));
3350 }
3351 
3352 
3354 {
      // Runs the feature validator's GetTSACDSOnMinusStrandErrors check on
      // every feature reachable from `se` and ORs the results together.
3355  bool rval = false;
3356  Setup(se);
3357  CValidError_feat feat_validator(*this);
3358  CFeat_CI fi(se);
3359  while (fi) {
      // the check needs the Bioseq the feature is located on; features
      // whose location does not resolve to a Bioseq are skipped
3360  CBioseq_Handle bsh = se.GetScope().GetBioseqHandle(fi->GetLocation());
3361  if (bsh) {
3362  rval |= feat_validator.GetTSACDSOnMinusStrandErrors(*(fi->GetSeq_feat()), *(bsh.GetCompleteBioseq()));
3363  }
3364  ++fi;
3365  }
3366 
3367  return rval;
3368 }
3369 
3370 
3372 {
      // Single-feature variant: checks feature `f` against the complete
      // Bioseq obtained from Setup(seq) and returns the validator's result.
3373  CSeq_entry_Handle seh = Setup(seq);
3374  CValidError_feat feat_validator(*this);
3375  return feat_validator.GetTSACDSOnMinusStrandErrors(f, *(seh.GetSeq().GetCompleteBioseq()));
3376 }
3377 
3378 
3380 {
3381  bool rval = false;
3382  Setup(se);
3383  CValidError_bioseq bioseq_validator(*this);
3385  while (bi) {
3386  rval |= bioseq_validator.GetTSAConflictingBiomolTechErrors(*(bi->GetCompleteBioseq()));
3387  ++bi;
3388  }
3389  return rval;
3390 }
3391 
3392 
3394 {
      // Single-Bioseq variant: Setup(seq) yields the entry handle for `seq`,
      // whose complete Bioseq is then passed to the bioseq validator.
      // The validator's boolean result is returned unchanged.
3395  CSeq_entry_Handle seh = Setup(seq);
3396  CValidError_bioseq bioseq_validator(*this);
3397  return bioseq_validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
3398 }
3399 //LCOV_EXCL_STOP
3400 
// Human-readable labels, one per barcode test. ADD_BARCODE_ERR(TestName)
// token-pastes k##TestName, so each constant name must stay in step with the
// test name used at the macro's call sites.
3401 const string kTooShort = "Too Short";
3402 const string kMissingPrimers = "Missing Primers";
3403 const string kMissingCountry = "Missing Country";
3404 const string kMissingVoucher = "Missing Voucher";
3405 const string kBadCollectionDate = "Bad Collection Date";
3406 const string kTooManyNs = "Too Many Ns";
3407 const string kMissingOrderAssignment = "Missing Order Assignment";
3408 const string kLowTrace = "Low Trace";
3409 const string kFrameShift = "Frame Shift";
3410 const string kStructuredVoucher = "Structured Voucher";
3411 
// Reports one failed barcode test:
//   1. posts a warning of type eErr_GENERIC_Barcode<TestName> whose text is
//      the label constant k<TestName>, attached to `sq`;
//   2. appends that label to the string `msg`, comma-separated from any
//      labels already collected.
// `sq` and `msg` must be in scope at the point of use.
//
// The expansion is a single compound statement so that all steps stay
// together even when the macro is the whole body of an unbraced if/else.
// A do { } while (0) wrapper is deliberately not used: the call sites in this
// file invoke the macro without a trailing semicolon.
#define ADD_BARCODE_ERR(TestName) \
    { \
        PostErr(eDiag_Warning, eErr_GENERIC_Barcode##TestName, k##TestName, sq); \
        if (!msg.empty()) { \
            msg += ","; \
        } \
        msg += k##TestName; \
    }
3418 
3420 {
3421  TBarcodeResults results = GetBarcodeValues(seh);
3422  for (auto r : results) {
3423  const CBioseq& sq = *(r.bsh.GetCompleteBioseq());
3424  if (BarcodeTestFails(r)){
3425  string msg;
3426  if (r.length) {
3427  ADD_BARCODE_ERR(TooShort)
3428  }
3429  if (r.primers) {
3430  ADD_BARCODE_ERR(MissingPrimers)
3431  }
3432  if (r.country) {
3433  ADD_BARCODE_ERR(MissingCountry)
3434  }
3435  if (r.voucher) {
3436  ADD_BARCODE_ERR(MissingVoucher)
3437  }
3438  if (!r.percent_n.empty()) {
3440  if (!msg.empty()) {
3441  msg += ",";
3442  }
3443  msg += kTooManyNs + ":" + r.percent_n;
3444  }
3445  if (r.collection_date) {
3446  ADD_BARCODE_ERR(BadCollectionDate)
3447  }
3448  if (r.order_assignment) {
3449  ADD_BARCODE_ERR(MissingOrderAssignment)
3450  }
3451  if (r.low_trace) {
3452  ADD_BARCODE_ERR(LowTrace)
3453  }
3454  if (r.frame_shift) {
3455  ADD_BARCODE_ERR(FrameShift)
3456  }
3457  if (!r.structured_voucher) {
3458  ADD_BARCODE_ERR(StructuredVoucher)
3459  }
3460  PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestFails, "FAIL (" + msg + ")", sq);
3461  } else {
3463  }
3464  }
3465 }
3466 
3467 
// Read-only state queries. Those written in terms of GetEntryInfo() forward
// to the entry-info object's accessor of the same name; IsNC() through
// IsXR() return the corresponding m_Is* member flag directly.
3471 bool CValidError_imp::IsGPS() const { return GetEntryInfo().IsGPS(); }
3472 bool CValidError_imp::IsGED() const { return GetEntryInfo().IsGED(); }
3473 bool CValidError_imp::IsPDB() const { return GetEntryInfo().IsPDB(); }
3476 bool CValidError_imp::IsEmbl() const { return GetEntryInfo().IsEmbl(); }
3477 bool CValidError_imp::IsDdbj() const { return GetEntryInfo().IsDdbj(); }
3478 bool CValidError_imp::IsTPE() const { return GetEntryInfo().IsTPE(); }
3479 bool CValidError_imp::IsNC() const { return m_IsNC; }
3480 bool CValidError_imp::IsNG() const { return m_IsNG; }
3481 bool CValidError_imp::IsNM() const { return m_IsNM; }
3482 bool CValidError_imp::IsNP() const { return m_IsNP; }
3483 bool CValidError_imp::IsNR() const { return m_IsNR; }
3484 bool CValidError_imp::IsNS() const { return m_IsNS; }
3485 bool CValidError_imp::IsNT() const { return m_IsNT; }
3486 bool CValidError_imp::IsNW() const { return m_IsNW; }
3487 bool CValidError_imp::IsNZ() const { return m_IsNZ; }
3488 bool CValidError_imp::IsWP() const { return m_IsWP; }
3489 bool CValidError_imp::IsXR() const { return m_IsXR; }
3490 bool CValidError_imp::IsGI() const { return GetEntryInfo().IsGI(); }
3492 bool CValidError_imp::IsGpipe() const { return GetEntryInfo().IsGpipe(); }
3505 
3506 
3507 
3508 // =============================================================================
3509 // CValidError_base Implementation
3510 // =============================================================================
3511 
3512 
3514  m_Imp(imp), m_Scope(imp.GetScope())
3515 {
      // Nothing else to do: the initialisers above store `imp` in m_Imp and
      // the result of imp.GetScope() in m_Scope.
3516 }
3517 
3518 
3520 {
3521 }
3522 
3523 
3525 (EDiagSev sv,
3526  EErrType et,
3527  const string& msg,
3528  const CSerialObject& obj)
3529 {
      // Forwards all arguments unchanged to m_Imp's PostErr taking a
      // generic CSerialObject.
3530  m_Imp.PostErr(sv, et, msg, obj);
3531 }
3532 
3533 
3534 //void CValidError_base::PostErr
3535 //(EDiagSev sv,
3536 // EErrType et,
3537 // const string& msg,
3538 // TDesc ds)
3539 //{
3540 // m_Imp.PostErr(sv, et, msg, ds);
3541 //}
3542 
3543 
3545 (EDiagSev sv,
3546  EErrType et,
3547  const string& msg,
3548  const CSeq_feat& ft)
3549 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a feature.
3550  m_Imp.PostErr(sv, et, msg, ft);
3551 }
3552 
3553 
3555 (EDiagSev sv,
3556  EErrType et,
3557  const string& msg,
3558  const CBioseq& sq)
3559 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a Bioseq.
3560  m_Imp.PostErr(sv, et, msg, sq);
3561 }
3562 
3563 
3565 (EDiagSev sv,
3566  EErrType et,
3567  const string& msg,
3568  const CSeq_entry& ctx,
3569  const CSeqdesc& ds)
3570 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a
      // descriptor `ds` together with its Seq-entry argument `ctx`.
3571  m_Imp.PostErr(sv, et, msg, ctx, ds);
3572 }
3573 
3574 
3576 (EDiagSev sv,
3577  EErrType et,
3578  const string& msg,
3579  const CBioseq_set& set)
3580 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a Bioseq-set.
3581  m_Imp.PostErr(sv, et, msg, set);
3582 }
3583 
3584 
3586 (EDiagSev sv,
3587  EErrType et,
3588  const string& msg,
3589  const CSeq_annot& annot)
3590 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a Seq-annot.
3591  m_Imp.PostErr(sv, et, msg, annot);
3592 }
3593 
3595 (EDiagSev sv,
3596  EErrType et,
3597  const string& msg,
3598  const CSeq_graph& graph)
3599 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a Seq-graph.
3600  m_Imp.PostErr(sv, et, msg, graph);
3601 }
3602 
3603 
3605 (EDiagSev sv,
3606  EErrType et,
3607  const string& msg,
3608  const CBioseq& sq,
3609  const CSeq_graph& graph)
3610 {
      // Forwards all arguments unchanged to m_Imp's PostErr taking both a
      // Bioseq and a Seq-graph.
3611  m_Imp.PostErr(sv, et, msg, sq, graph);
3612 }
3613 
3614 
3616 (EDiagSev sv,
3617  EErrType et,
3618  const string& msg,
3619  const CSeq_align& align)
3620 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a Seq-align.
3621  m_Imp.PostErr(sv, et, msg, align);
3622 }
3623 
3624 
3626 (EDiagSev sv,
3627  EErrType et,
3628  const string& msg,
3629  const CSeq_entry& entry)
3630 {
      // Forwards all arguments unchanged to m_Imp's PostErr for a Seq-entry.
3631  m_Imp.PostErr(sv, et, msg, entry);
3632 }
3633 
3634 CCacheImpl&
3636 {
      // The cache is not held here; hand out the one returned by m_Imp.
3637  return m_Imp.GetCache();
3638 }
3639 
3640 
3642 {
3643  CSeq_entry_Handle parent = seh.GetParentEntry();
3644  if (!parent || !parent.IsSet()) {
3645  return false;
3646  }
3648  if (!pset) {
3649  return false;
3650  }
3651  if (pset->IsSetSeq_set() && pset->GetSeq_set().size() > 10) {
3652  return true;
3653  } else {
3654  return s_HasTopSetSiblings(parent);
3655  }
3656 }
3657 
3658 
3660 {
3661  CSeq_entry_Handle appropriate_parent;
3662 
3663  CSeq_entry_Handle np;
3664  CSeq_entry_Handle gps;
3665  if (seh.IsSet() && seh.GetSet().IsSetClass()) {
3666  if (seh.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3667  np = seh;
3668  } else if (s_IsGoodTopSetClass(seh.GetSet().GetClass())) {
3669  gps = seh;
3670  }
3671  } else if (seh.IsSeq()) {
3673  if (p && p.IsSet() && p.GetSet().IsSetClass()) {
3675  np = p;
3676  } else if (s_IsGoodTopSetClass(p.GetSet().GetClass())) {
3677  gps = p;
3678  }
3679  }
3680  }
3681  if (gps) {
3682  appropriate_parent = gps;
3683  } else if (np) {
3685  if (gp && gp.IsSet() && gp.GetSet().IsSetClass() &&
3687  appropriate_parent = gp;
3688  } else {
3689  appropriate_parent = np;
3690  }
3691  } else {
3692  appropriate_parent = seh;
3693  }
3694  return appropriate_parent;
3695 }
3696 
3697 
3700  CConstRef<CPubdesc> pub)
3701 {
      // Looks `pub` up in m_pubdescCache and returns the stored info on a
      // hit. On a miss a new CPubdescInfo is created, filled from *pub,
      // stored in the cache under `pub` and returned.
3702  // first, try to retrieve from cache
3704  m_pubdescCache.find(pub);
3705  if( find_iter != m_pubdescCache.end() ) {
3706  return *find_iter->second;
3707  }
3708 
      // cache miss: build the info object, remember it, then return it
3709  CRef<CPubdescInfo> pInfo(new CPubdescInfo);
3711  *pub, pInfo->m_pmids, pInfo->m_muids,
3712  pInfo->m_serials, pInfo->m_published_labels,
3713  pInfo->m_unpublished_labels);
3714  m_pubdescCache[pub] = pInfo;
3715  return *pInfo;
3716 }
3717 
3718 bool
3720  const SFeatKey & rhs) const
3721 {
      // Lexicographic ordering over three fields, most significant first:
      // feat_type, then feat_subtype, then bioseq_h.
3722  if( feat_type != rhs.feat_type ) {
3723  return feat_type < rhs.feat_type;
3724  } else if( feat_subtype != rhs.feat_subtype ) {
3725  return feat_subtype < rhs.feat_subtype;
3726  } else {
3727  return bioseq_h < rhs.bioseq_h;
3728  }
3729 }
3730 
3731 bool
3733  const SFeatKey & rhs) const
3734 {
      // Equal only when feat_type, feat_subtype and bioseq_h all match.
3735  return (feat_type == rhs.feat_type) &&
3736  (feat_subtype == rhs.feat_subtype) && (bioseq_h == rhs.bioseq_h);
3737 }
3738 
// Returns the cached features for featKey. The first request for a given
// bioseq iterates all of its features once and stores each of them under
// four keys: the exact (type, subtype) pair plus the three combinations in
// which type and/or subtype is replaced by the "any" wildcard.
3739 const CCacheImpl::TFeatValue &
3741  const CCacheImpl::SFeatKey & featKey)
3742 {
3743  // check common case where already in the cache
3744  TFeatCache::iterator find_iter = m_featCache.find(featKey);
3745  if( find_iter != m_featCache.end() ) {
3746  return find_iter->second;
3747  }
3748 
3749  // check if bioseq already processed, but had no entry requested above
3750  SFeatKey bioseq_check_key(
3752  TFeatCache::const_iterator bioseq_find_iter =
3753  m_featCache.find(bioseq_check_key);
3754  if( bioseq_find_iter != m_featCache.end() ) {
3755  // bioseq was already processed,
3756  // it just happened to not have an entry here
3757  return kEmptyFeatValue;
3758  }
3759 
3760  // bioseq never added to cache, so calculate that now
3761 
3762  // to avoid expensive constructions of CFeat_CI's,
3763  // we iterate through all the feats on
3764  // the bioseq and load them into the cache.
3765  CFeat_CI feat_ci(featKey.bioseq_h);
3766  for( ; feat_ci; ++feat_ci ) {
3767  SFeatKey inner_feat_key(
3768  feat_ci->GetFeatType(), feat_ci->GetFeatSubtype(), featKey.bioseq_h);
3769 
3770  m_featCache[inner_feat_key].push_back(*feat_ci);
3771 
3772  // also add "don't care" entries for partial searches
3773  // (e.g. if caller just wants to search on type but not on
3774  // subtype they can set subtype to kAnyFeatSubtype)
3775  SFeatKey any_type_key = inner_feat_key;
3776  any_type_key.feat_type = kAnyFeatType;
3777  m_featCache[any_type_key].push_back(*feat_ci);
3778 
3779  SFeatKey any_subtype_key = inner_feat_key;
3780  any_subtype_key.feat_subtype = kAnyFeatSubtype;
3781  m_featCache[any_subtype_key].push_back(*feat_ci);
3782 
3783  // for when the caller wants all feats on a bioseq
3784  SFeatKey any_type_or_subtype_key = inner_feat_key;
3785  any_type_or_subtype_key.feat_type = kAnyFeatType;
3786  any_type_or_subtype_key.feat_subtype = kAnyFeatSubtype;
3787  m_featCache[any_type_or_subtype_key].push_back(*feat_ci);
3788  }
3789 
3790  // in case a bioseq has no features, we add a dummy key just to
3791  // remember that so we don't use CFeat_CI again on the same bioseq
3792  m_featCache[bioseq_check_key]; // gets default val
3793 
      // NOTE(review): operator[] presumably default-inserts an empty value
      // when featKey matched none of the features loaded above - confirm
      // against the TFeatCache container type.
3794  return m_featCache[featKey];
3795 }
3796 
3799  const vector<SFeatKey> &featKeys)
3800 {
      // Returns the de-duplicated union of GetFeatFromCache() results for
      // every key in featKeys, ordered as in the all-features vector of the
      // bioseq. All keys must refer to the same bioseq, otherwise a
      // runtime_error is thrown. An empty key list yields an empty result.
3801  if( featKeys.empty() ) {
      // NOTE(review): a raw `new` is returned here; presumably the return
      // type is AutoPtr<TFeatValue> (as for `answer` below) and takes
      // ownership - confirm against the declaration.
3802  return new TFeatValue;
3803  }
3804 
3805  // all featKeys must have the same bioseq
3806  const CBioseq_Handle & bioseq_h = featKeys[0].bioseq_h;
3807  ITERATE(vector<SFeatKey>, feat_it, featKeys) {
3808  if( feat_it->bioseq_h != bioseq_h ) {
3809  throw runtime_error("GetFeatFromCacheMulti must be called with only 1 bioseq in its args");
3810  }
3811  }
3812 
3813  // set prevents dups
3814  set<TFeatValue::value_type> set_of_feats;
3815 
3816  // combine the answers from every key into the set
3817  ITERATE(vector<SFeatKey>, key_it, featKeys ) {
3818  const TFeatValue & feat_value = GetFeatFromCache(*key_it);
3819  copy(BEGIN_COMMA_END(feat_value), inserter(
3820  set_of_feats, set_of_feats.begin()));
3821  }
3822 
3823  // go through every feature on the bioseq and remember any that match what's in the set
3824  // (The purpose of this step is to return the feats in the same
3825  // order they were on the original bioseq. In the future, we may
3826  // consider adding a flag to avoid sorting for time purposes).
3827  AutoPtr<TFeatValue> answer(new TFeatValue);
3828  SFeatKey all_feats_key(
3829  kAnyFeatType, kAnyFeatSubtype, bioseq_h);
3830  const TFeatValue & all_feats_vec = GetFeatFromCache(all_feats_key);
3831  ITERATE(TFeatValue, feat_it, all_feats_vec) {
3832  if( set_of_feats.find(*feat_it) != set_of_feats.end() ) {
3833  answer->push_back(*feat_it);
3834  }
3835  }
3836 
3837  return answer;
3838 }
3839 
3840 
3841 //LCOV_EXCL_START
3842 //not used
3843 bool
3845 {
      // Ordering by m_eFeatKeyStr first, then m_bioseq; keys equal on both
      // are ordered by s_QuickStringLess applied to m_feat_str.
3846  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3847  return m_eFeatKeyStr < rhs.m_eFeatKeyStr;
3848  }
3849  if( m_bioseq != rhs.m_bioseq ) {
3850  return m_bioseq < rhs.m_bioseq;
3851  }
3852  return s_QuickStringLess(m_feat_str, rhs.m_feat_str);
3853 }
3854 
3855 
3856 bool
3858 {
      // Equal only when m_eFeatKeyStr, m_bioseq and m_feat_str all match;
      // the two cheaper comparisons are tried before the string comparison.
3859  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3860  return false;
3861  }
3862  if( m_bioseq != rhs.m_bioseq ) {
3863  return false;
3864  }
3865  return (m_feat_str == rhs.m_feat_str);
3866 }
3867 
3868 
3869 const CCacheImpl::TFeatValue &
3871  const SFeatStrKey & feat_str_key, const CTSE_Handle & tse_arg)
3872 {
3873  const CBioseq_Handle & search_bsh = feat_str_key.m_bioseq;
3874 
3875  // caller must give us something to work with
3876  _ASSERT(search_bsh || tse_arg);
3877 
3878  const CTSE_Handle & tse = (tse_arg ? tse_arg : search_bsh.GetTSE_Handle());
3879 
3880  // load cache if empty
3882  // (for now just indexes genes, but more may be added in the future)
3884  AutoPtr<CFeat_CI> p_gene_ci;
3885  // if we have TSE, get all features on it; otherwise, just get
3886  // the features from the bioseq
3887  if( tse ) {
3888  p_gene_ci.reset(new CFeat_CI(tse, sel));
3889  } else {
3890  p_gene_ci.reset(new CFeat_CI(search_bsh, sel));
3891  }
3892  CFeat_CI & gene_ci = *p_gene_ci; // for convenience
3893 
3894  for( ; gene_ci; ++gene_ci ) {
3895  CBioseq_Handle bsh = tse.GetScope().GetBioseqHandle(gene_ci->GetLocation());
3896  string label;
3897  const CGene_ref & gene_ref = gene_ci->GetData().GetGene();
3898 
3899  // for each one, add an entry for using given Bioseq and the
3900  // kAnyBioseq (so users can search on any bioseq)
3901  gene_ref.GetLabel(&label);
3902  SFeatStrKey label_key(eFeatKeyStr_Label, bsh, label);
3903  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
3904  if( bsh ) {
3905  label_key.m_bioseq = kAnyBioseq;
3906  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
3907  }
3908 
3909  const string & locus_tag = (
3910  gene_ref.IsSetLocus_tag() ? gene_ref.GetLocus_tag() :
3911  kEmptyStr);
3912  SFeatStrKey locus_tag_key(eFeatKeyStr_LocusTag, bsh, locus_tag);
3913  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
3914  if( bsh ) {
3915  locus_tag_key.m_bioseq = kAnyBioseq;
3916  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
3917  }
3918  }
3919  }
3920 
3921  // get from cache, if possible
3923  m_featStrKeyToFeatsCache.find(feat_str_key);
3924  if( find_iter != m_featStrKeyToFeatsCache.end() ) {
3925  return find_iter->second;
3926  } else {
3927  // nothing found
3928  return kEmptyFeatValue;
3929  }
3930 }
3931 
3932 
3935  const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,
3936  const CTSE_Handle & tse)
3937 {
3938  // load cache if empty
3939  if( m_featToBioseqCache.empty() ) {
3940  CBioseq_CI bioseq_ci(tse);
3941  for( ; bioseq_ci; ++bioseq_ci ) {
3942  CFeat_CI feat_ci(*bioseq_ci);
3943  for( ; feat_ci; ++feat_ci ) {
3944  m_featToBioseqCache[*feat_ci].insert(*bioseq_ci);
3945  }
3946  }
3947  }
3948 
3949  // we're being given the map to a feature, so we should've loaded
3950  // at least one feature when we loaded the cache
3952 
3953  // load from the cache
3955  m_featToBioseqCache.find(feat_to_bioseq_key);
3956  if( find_iter != m_featToBioseqCache.end() ) {
3957  return find_iter->second;
3958  } else {
3959  const static TFeatToBioseqValue kEmptyFeatToBioseqCache;
3960  return kEmptyFeatToBioseqCache;
3961  }
3962 }
3963 //LCOV_EXCL_STOP
3964 
3968  const CTSE_Handle & tse)
3969 {
3970  _ASSERT(tse);
3971 
3972  // load cache if empty
3973  if( m_IdToBioseqCache.empty() ) {
3974  CBioseq_CI bioseq_ci(tse);
3975  for( ; bioseq_ci; ++bioseq_ci ) {
3976  const CBioseq_Handle::TId & ids = bioseq_ci->GetId();
3977  ITERATE(CBioseq_Handle::TId, id_it, ids) {
3978  m_IdToBioseqCache[id_it->GetSeqId()] = *bioseq_ci;
3979  }
3980  }
3981  }
3982 
3983  // there should be at least one Bioseq otherwise there wouldn't
3984  // be anything to validate.
3986 
3988  if( find_iter != m_IdToBioseqCache.end() ) {
3989  return find_iter->second;
3990  } else {
3991  static const TIdToBioseqValue s_EmptyResult;
3992  return s_EmptyResult;
3993  }
3994 }
3995 
3998  CScope *scope, const CSeq_loc& loc, const CTSE_Handle & tse)
3999 {
      // Resolves `loc` to a Bioseq handle. When `tse` is usable, each Seq-id
      // of the location is tried through GetIdToBioseq() and the first
      // non-empty handle is returned. Otherwise, or when no id resolves,
      // BioseqHandleFromLocation(scope, loc) is used if `scope` is non-null,
      // else kEmptyBioseqHandle is returned.
      // At least one of `scope` and `tse` must be supplied.
4000  _ASSERT(scope || tse);
4001  if( ! tse || (!tse.GetTopLevelEntry().IsSet() && !tse.GetTopLevelEntry().IsSeq())) {
4002  // fall back on old style
4003  return BioseqHandleFromLocation(scope, loc);
4004  }
4005 
4006 
      // first id that resolves through the id-to-bioseq lookup wins
4007  for ( CSeq_loc_CI citer (loc); citer; ++citer) {
4008  CConstRef<CSeq_id> id(&citer.GetSeq_id());
4009  const TIdToBioseqValue & bioseq = GetIdToBioseq(id, tse);
4010  if( bioseq ) {
4011  return bioseq;
4012  }
4013  }
4014 
4015  // nothing found, so fall back on old style if possible
4016  if( scope ) {
4017  return BioseqHandleFromLocation(scope, loc);
4018  } else {
4019  return kEmptyBioseqHandle;
4020  }
4021 }
4022 
4023 
4025 {
4027  m_featCache.clear();
4031 }
4032 
4033 
4034 
4035 
4036 
4037 END_SCOPE(validator)
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
EErrType
@ eErr_SEQ_FEAT_WrongQualOnImpFeat
@ eErr_SEQ_DESCR_ObsoleteSourceQual
@ eErr_SEQ_DESCR_ObsoleteSourceLocation
@ eErr_SEQ_INST_FarFetchFailure
@ eErr_SEQ_FEAT_WholeLocation
@ eErr_SEQ_INST_ShortSeq
@ eErr_GENERIC_MissingPubRequirement
@ eErr_SEQ_FEAT_EcNumberProblem
@ eErr_SEQ_FEAT_DuplicateAnticodonInterval
@ eErr_SEQ_INST_CompleteGenomeHasGaps
@ eErr_SEQ_FEAT_CDShasTooManyXs
@ eErr_SEQ_FEAT_TranslExceptPhase
@ eErr_SEQ_FEAT_MinusStrandProtein
@ eErr_SEQ_INST_CompleteTitleProblem
@ eErr_SEQ_PKG_EmptySet
@ eErr_SEQ_DESCR_UnwantedCompleteFlag
@ eErr_SEQ_FEAT_GeneXrefWithoutLocus
@ eErr_SEQ_FEAT_BadLocation
@ eErr_SEQ_FEAT_GenesInconsistent
@ eErr_SEQ_INST_HighNContentStretch
@ eErr_SEQ_PKG_NoBioseqFound
@ eErr_SEQ_FEAT_PseudoRnaHasProduct
@ eErr_SEQ_DESCR_InconsistentBioSources
@ eErr_GENERIC_PastReleaseDate
@ eErr_SEQ_DESCR_BioSourceDbTagConflict
@ eErr_SEQ_FEAT_UnknownImpFeatQual
@ eErr_SEQ_FEAT_DuplicateExonInterval
@ eErr_GENERIC_UnnecessaryPubEquiv
@ eErr_SEQ_DESCR_BioSourceOnProtein
@ eErr_SEQ_DESCR_LatLonRange
@ eErr_SEQ_FEAT_UnnecessaryTranslExcept
@ eErr_SEQ_GRAPH_GraphBioseqId
@ eErr_SEQ_FEAT_MixedStrand
@ eErr_SEQ_FEAT_BadRRNAcomponentOrder
@ eErr_SEQ_DESCR_DuplicatePCRPrimerSequence
@ eErr_SEQ_FEAT_BadGeneOntologyFormat
@ eErr_SEQ_DESCR_LatLonCountry
@ eErr_SEQ_PKG_NucProtSetHasTitle
@ eErr_SEQ_FEAT_IllegalDbXref
@ eErr_GENERIC_SgmlPresentInText
@ eErr_SEQ_FEAT_BadAnticodonAA
@ eErr_SEQ_FEAT_MissingCDSproduct
@ eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap
@ eErr_SEQ_FEAT_TranslExceptAndRnaEditing
@ eErr_GENERIC_BarcodeTooManyNs
@ eErr_SEQ_PKG_BioseqSetClassNotSet
@ eErr_SEQ_DESCR_NoOrgFound
@ eErr_SEQ_FEAT_MissingProteinName
@ eErr_SEQ_DESCR_BadPCRPrimerSequence
@ eErr_SEQ_FEAT_GeneXrefWithoutGene
@ eErr_SEQ_DESCR_TransgenicProblem
@ eErr_SEQ_PKG_MissingSetTitle
@ eErr_SEQ_FEAT_InvalidQualifierValue
@ eErr_SEQ_FEAT_GeneOntologyTermMissingGOID
@ eErr_SEQ_FEAT_ProtRefHasNoData
@ eErr_SEQ_GRAPH_GraphSeqLocLen
@ eErr_SEQ_DESCR_InvalidForType
@ eErr_SEQ_DESCR_LatLonValue
@ eErr_SEQ_FEAT_TransLen
@ eErr_SEQ_FEAT_FeatureCitationProblem
@ eErr_SEQ_DESCR_IdenticalInstitutionCode
@ eErr_SEQ_PKG_ImproperlyNestedSets
@ eErr_SEQ_INST_UnknownLengthGapNot100
@ eErr_SEQ_FEAT_WrongQualOnFeature
@ eErr_SEQ_FEAT_MultipleProtRefs
@ eErr_SEQ_FEAT_MultipleEquivPublications
@ eErr_SEQ_PKG_SeqSubmitWithWgsSet
@ eErr_SEQ_PKG_InconsistentMoltypeSet
@ eErr_SEQ_INST_ConflictingBiomolTech
@ eErr_SEQ_FEAT_MissingQualOnImpFeat
@ eErr_SEQ_PKG_INSDRefSeqPackaging
@ eErr_SEQ_FEAT_LocusCollidesWithLocusTag
@ eErr_SEQ_PKG_GPSnonGPSPackaging
@ eErr_SEQ_DESCR_BadCollectionDate
@ eErr_SEQ_FEAT_MultipleEquivBioSources
@ eErr_SEQ_FEAT_CDSwithNoMRNAOverlap
@ eErr_SEQ_DESCR_BadInstitutionCode
@ eErr_SEQ_FEAT_PeptideFeatOutOfFrame
@ eErr_SEQ_FEAT_ProteinNameHasPMID
@ eErr_SEQ_FEAT_RepeatRegionNeedsNote
@ eErr_SEQ_DESCR_BadAltitude
@ eErr_SEQ_FEAT_GeneXrefStrandProblem
@ eErr_SEQ_FEAT_MissingTrnaAA
@ eErr_GENERIC_NonAsciiAsn
@ eErr_SEQ_FEAT_CDSwithMultipleMRNAs
@ eErr_SEQ_FEAT_CollidingFeatureIDs
@ eErr_SEQ_DESCR_IncorrectlyFormattedVoucherID
@ eErr_SEQ_FEAT_OrfCdsHasProduct
@ eErr_SEQ_FEAT_ImproperBondLocation
@ eErr_SEQ_PKG_GraphPackagingProblem
@ eErr_SEQ_INST_OverlappingDeltaRange
@ eErr_SEQ_FEAT_BadTranssplicedInterval
@ eErr_SEQ_INST_SeqLocLength
@ eErr_SEQ_DESCR_MultipleTaxonIDs
@ eErr_SEQ_DESCR_BadKeyword
@ eErr_SEQ_FEAT_UnknownImpFeatKey
@ eErr_SEQ_DESCR_Inconsistent
@ eErr_SEQ_PKG_ArchaicFeatureLocation
@ eErr_GENERIC_BadDate
@ eErr_GENERIC_BarcodeTestFails
@ eErr_SEQ_FEAT_NestedSeqLocMix
@ eErr_SEQ_FEAT_ShortIntron
@ eErr_SEQ_FEAT_UnknownFeatureQual
@ eErr_SEQ_DESCR_MultipleChromosomes
@ eErr_SEQ_FEAT_Range
@ eErr_SEQ_FEAT_InconsistentGeneOntologyTermAndId
@ eErr_SEQ_PKG_MisplacedMolInfo
@ eErr_GENERIC_EmbeddedScript
@ eErr_GENERIC_BarcodeTestPasses
@ eErr_SEQ_GRAPH_GraphAbove
@ eErr_SEQ_FEAT_FeatureInsideGap
@ eErr_SEQ_FEAT_DifferntIdTypesInSeqLoc
@ eErr_SEQ_FEAT_BadFullLengthFeature
@ eErr_SEQ_FEAT_RNAtype0
@ eErr_SEQ_FEAT_BadCharInAuthorName
@ eErr_SEQ_FEAT_FarLocation
@ eErr_SEQ_INST_BadHTGSeq
@ eErr_SEQ_FEAT_InvalidFuzz
@ eErr_SEQ_FEAT_InvalidInferenceValue
@ eErr_SEQ_FEAT_GeneXrefNeeded
@ eErr_SEQ_INST_UnexpectedIdentifierChange
@ eErr_SEQ_FEAT_InconsistentRRNAstrands
@ eErr_SEQ_PKG_ArchaicFeatureProduct
@ eErr_SEQ_DESCR_MultipleSourceQualifiers
@ eErr_SEQ_FEAT_BadRRNAcomponentOverlap
@ eErr_SEQ_FEAT_BadTrailingCharacter
@ eErr_SEQ_DESCR_WrongVoucherType
@ eErr_SEQ_INST_ProteinsHaveGeneralID
@ eErr_SEQ_GRAPH_GraphOutOfOrder
@ eErr_SEQ_FEAT_BadInternalCharacter
@ eErr_SEQ_DESCR_NoSourceDescriptor
@ eErr_SEQ_DESCR_BadCollectionCode
@ eErr_SEQ_FEAT_BadProteinName
@ eErr_SEQ_FEAT_FeatureProductInconsistency
@ eErr_GENERIC_PublicationInconsistency
@ eErr_GENERIC_BadSubmissionAuthorName
@ eErr_GENERIC_CollidingSerialNumbers
@ eErr_SEQ_PKG_ComponentMissingTitle
@ eErr_SEQ_DESCR_DBLinkMissingUserObject
@ eErr_SEQ_PKG_InternalGenBankSet
@ eErr_SEQ_DESCR_BioSourceMissing
@ eErr_SEQ_FEAT_BadAnticodonCodon
@ eErr_SEQ_FEAT_BadTrailingHyphen
@ eErr_SEQ_FEAT_OldLocusTagMismtach
@ eErr_SEQ_DESCR_MolInfoConflictsWithBioSource
@ eErr_SEQ_FEAT_UTRdoesNotAbutCDS
@ eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct
@ eErr_SEQ_FEAT_ConflictFlagSet
@ eErr_SEQ_FEAT_StrandOther
@ eErr_SEQ_PKG_FeaturePackagingProblem
@ eErr_SEQ_DESCR_MultipleNames
@ eErr_SEQ_INST_BadSeqIdFormat
@ eErr_SEQ_PKG_GenomicProductPackagingProblem
@ eErr_INTERNAL_Exception
@ eErr_SEQ_FEAT_BadEcNumberFormat
@ eErr_SEQ_FEAT_CDSproductPackagingProblem
@ eErr_SEQ_FEAT_RedundantFields
@ eErr_SEQ_INST_InternalNsInSeqRaw
@ eErr_SEQ_DESCR_BadOrgMod
@ eErr_SEQ_INST_TerminalNs
@ eErr_SEQ_DESCR_BadOrganelleLocation
@ eErr_SEQ_FEAT_NoNameForProtein
@ eErr_SEQ_FEAT_RptUnitRangeProblem
@ eErr_SEQ_FEAT_SeqLocOrder
@ eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem
@ eErr_SEQ_FEAT_CDSmRNAXrefLocationProblem
@ eErr_SEQ_PKG_SingleItemSet
@ eErr_SEQ_DESCR_BioSourceNeedsChromosome
@ eErr_SEQ_FEAT_VectorContamination
@ eErr_SEQ_FEAT_AbuttingIntervals
@ eErr_SEQ_FEAT_CDSrange
@ eErr_SEQ_FEAT_LocusTagProblem
@ eErr_SEQ_DESCR_BioSourceInconsistency
@ eErr_SEQ_FEAT_OnlyGeneXrefs
@ eErr_SEQ_FEAT_TranslExcept
@ eErr_SEQ_INST_InternalGapsInSeqRaw
@ eErr_SEQ_FEAT_GeneRefHasNoData
@ eErr_SEQ_INST_DuplicateSegmentReferences
@ eErr_SEQ_FEAT_TooManyInferenceAccessions
@ eErr_SEQ_FEAT_TerminalXDiscrepancy
@ eErr_SEQ_FEAT_MiscFeatureNeedsNote
@ eErr_SEQ_DESCR_CollidingPublications
@ eErr_SEQ_FEAT_GenomeSetMixedStrand
@ eErr_SEQ_FEAT_BadCharInAuthorLastName
@ eErr_SEQ_FEAT_HypotheticalProteinMismatch
@ eErr_SEQ_INST_TpaAssemblyProblem
@ eErr_SEQ_FEAT_MissingGeneXref
AutoPtr –.
Definition: ncbimisc.hpp:401
CAlign_CI –.
Definition: align_ci.hpp:63
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
CAuthor –.
Definition: Author.hpp:59
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
static void GetPubdescLabels(const CPubdesc &pd, vector< TEntrezId > &pmids, vector< TEntrezId > &muids, vector< int > &serials, vector< string > &published_labels, vector< string > &unpublished_labels)
For Publication Citations Get labels for a pubdesc.
Definition: cleanup.cpp:3140
Definition: Dbtag.hpp:53
bool GetDBFlags(bool &is_refseq, bool &is_src, string &correct_caps) const
Definition: Dbtag.cpp:327
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
CFeat_CI –.
Definition: feat_ci.hpp:64
void Clear()
Definition: gene_cache.hpp:89
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
Definition: gene_cache.cpp:106
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
CGraph_CI –.
Definition: graph_ci.hpp:234
CMappedFeat –.
Definition: mapped_feat.hpp:59
@Name_std.hpp User-defined methods of the data storage class.
Definition: Name_std.hpp:56
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:244
CObjectManager –.
const string & GetDivision(void) const
Definition: Org_ref.cpp:164
bool IsSetDivision(void) const
Definition: Org_ref.cpp:159
@ eContent
Definition: Pub.hpp:66
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
static bool RequireLocationIntervalsInBiologicalOrder(ESubtype subtype)
static bool AllowAdjacentIntervals(ESubtype subtype)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
@ eContent
Definition: Seq_entry.hpp:93
void GetLabel(string *label, ELabelType type) const
Definition: Seq_entry.cpp:274
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
CSubmit_block –.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
Definition: tse_handle.cpp:205
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:604
CScope & GetScope(void) const
Returns scope.
Definition: tse_handle.hpp:325
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Thrown on an attempt to write unassigned data member.
Definition: exception.hpp:84
static string GetFeatureBioseqLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorContent(const CSeqdesc &ds)
static string GetFeatureLocationLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetFeatureProductLocLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorLabel(const CSeqdesc &ds, const CSeq_entry &ctx, CRef< CScope > scope, bool suppress_context)
static string GetFeatureContentLabel(const CSeq_feat &feat, CRef< CScope > scope)
static string GetFeatureIdLabel(const CSeq_feat &ft)
static string GetBioseqSetLabel(const CBioseq_set &st, CRef< CScope > scope, bool suppress_context)
void ValidateSeqAlign(const CSeq_align &align, int order=-1)
void ValidateSeqAnnot(const CSeq_annot_Handle &annot)
CCacheImpl & GetCache()
virtual ~CValidError_base()
static CSeq_entry_Handle GetAppropriateXrefParent(CSeq_entry_Handle seh)
CValidError_imp & m_Imp
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
CValidError_base(CValidError_imp &imp)
void ValidateBioseq(const CBioseq &seq)
bool GetTSAConflictingBiomolTechErrors(const CBioseq &seq)
bool GetTSANStretchErrors(const CBioseq &seq)
void ValidateBioseqSet(const CBioseq_set &seqset)
void ValidateSeqDesc(const CSeqdesc &desc, const CSeq_entry &ctx)
Validate descriptors as stand alone objects (no context)
void SetScope(CScope &scope)
void SetTSE(CSeq_entry_Handle seh)
bool GetTSACDSOnMinusStrandErrors(const CSeq_feat &feat, const CBioseq &seq)
static bool GetPrefixAndAccessionFromInferenceAccession(string inf_accession, string &prefix, string &accession)
void ValidateSeqFeat(const CSeq_feat &feat)
static vector< string > GetAccessionsFromInferenceString(string inference, string &prefix, string &remainder, bool &same_species)
void ValidateSeqGraph(const CSeq_graph &graph)
void x_ReportInvalidFuzz(const CPacked_seqint &packed_int, const CSerialObject &obj)
CRef< CObjectManager > m_ObjMgr
bool IsGED() const
void SetScope(const CSeq_entry &se)
void FindCollidingSerialNumbers(const CSerialObject &obj)
Definition: valid_pub.cpp:1323
const CSeq_entry_Handle & GetTSEH()
static bool BadMultipleSequenceLocation(const CSeq_loc &loc, CScope &scope)
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:358
static bool IsTSAIntermediate(const CBioseq &seq)
void x_CheckPackedInt(const CPacked_seqint &packed_int, SLocCheck &lc, const CSerialObject &obj)
static bool IsInOrganelleSmallGenomeSet(const CSeq_id &id, CScope &scope)
bool IsNC() const
const CBioSourceKind & BioSourceKind() const
SIZE_TYPE m_NumPseudogene
bool IsNS() const
CRef< CScope > m_Scope
bool HasGiOrAccnVer() const
SIZE_TYPE m_NumTpaWithHistory
void SetTSE(const CSeq_entry_Handle &seh)
const SValidatorContext & GetContext() const
Definition: validatorp.cpp:204
CValidator::TProgressCallback m_PrgCallback
bool IsPDB() const
CValidError * m_ErrRepository
CConstRef< CSeq_feat > GetmRNAGivenProduct(const CBioseq &seq)
bool IsValidateAlignments() const
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id)
Definition: validatorp.cpp:248
void ValidateCitations(const CSeq_entry_Handle &seh)
bool DoesAnyFeatLocHaveGI() const
void FindNonAsciiText(const CSerialObject &obj)
void AddBioseqWithNoBiosource(const CBioseq &seq)
void ValidateSeqLocIds(const CSeq_loc &loc, const CSerialObject &obj)
bool GenerateGoldenFile() const
bool IsStandaloneAnnot() const
void x_DoBarcodeTests(CSeq_entry_Handle seh)
CConstRef< CSeq_annot > m_SeqAnnot
bool IsNM() const
bool DoesAnyProductLocHaveGI() const
bool GetTSAConflictingBiomolTechErrors(const CSeq_entry_Handle &se)
void x_AddValidErrItem(EDiagSev sev, EErrType type, const string &msg, const string &desc, const CSerialObject &obj, const string &accession, const int version)
unique_ptr< CValidatorEntryInfo > m_pEntryInfo
SIZE_TYPE m_NumMisplacedGraphs
bool IsNT() const
bool IsGenbank() const
void PostObjErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
bool IsNZ() const
void Setup(const CSeq_entry_Handle &seh)
bool Validate(const CSeq_entry &se, const CCit_sub *cs=nullptr, CScope *scope=nullptr)
SIZE_TYPE m_NumTpaWithoutHistory
static bool IsWGSIntermediate(const CBioseq &seq)
CValidator::CProgressInfo m_PrgInfo
void ValidateDbxref(const CDbtag &xref, const CSerialObject &obj, bool biosource=false, const CSeq_entry *ctx=nullptr)
bool IsSerialNumberInComment(const string &comment)
bool IsGenomic() const
void ValidateTaxonomy(const CSeq_entry &se)
bool IsFarSequence(const CSeq_id &id)
Definition: validatorp.cpp:234
const CTSE_Handle & GetTSE_Handle()
void FindEmbeddedScript(const CSerialObject &obj)
bool IsHugeFileMode() const
Definition: validatorp.cpp:211
SIZE_TYPE m_NumSmallGenomeSetMisplaced
void ValidateCitSub(const CCit_sub &cs, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
Definition: valid_pub.cpp:1078
void SetOptions(Uint4 options)
Definition: validatorp.cpp:270
bool m_ValidateInferenceAccessions
void ValidateSubmitBlock(const CSubmit_block &block, const CSeq_submit &ss)
bool IsNoCitSubPubs() const
void SetErrorRepository(CValidError *errors)
Definition: validatorp.cpp:304
bool IsNP() const
vector< CConstRef< CBioseq > > m_BioseqWithNoSource
void ValidateAffil(const CAffil::TStd &std, const CSerialObject &obj, const CSeq_entry *ctx)
Definition: valid_pub.cpp:988
CConstRef< CSeq_feat > GetCDSGivenProduct(const CBioseq &seq)
CBioseq_Handle GetLocalBioseqHandle(const CSeq_id &id)
Definition: validatorp.cpp:257
bool IsRefSeq() const
bool IsGPS() const
bool IsINSDInSep() const
bool IsNG() const
const CSeq_entry * GetAncestor(const CBioseq &seq, CBioseq_set::EClass clss)
bool IsGeneious() const
SIZE_TYPE m_NumGeneXrefs
bool x_IsFarFetchFailure(const CSeq_loc &loc)
bool IsNoPubs() const
CValidError_imp(CObjectManager &objmgr, shared_ptr< SValidatorContext > pContext, CValidError *errors, Uint4 options=0)
Definition: validatorp.cpp:169
void PostBadDateError(EDiagSev sv, const string &msg, int flags, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void AddProtWithoutFullRef(const CBioseq_Handle &seq)
void ValidateBioSource(const CBioSource &bsrc, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
const CValidatorEntryInfo & GetEntryInfo() const
CSeq_entry_Handle m_TSEH
bool RaiseGenomeSeverity(EErrType et)
Definition: validatorp.cpp:601
SIZE_TYPE m_NumBioseq_set
bool RequireLocalProduct(const CSeq_id *sid) const
bool IsGI() const
bool IsGpipe() const
bool IsFarFetchCDSproducts() const
bool IsPatent() const
bool DoesAnyProteinHaveGeneralID() const
virtual ~CValidError_imp()
Definition: validatorp.cpp:189
void x_Init(Uint4 options)
Definition: validatorp.cpp:180
bool IsWP() const
void ReportMissingPubs(const CSeq_entry &se, const CCit_sub *cs)
Definition: valid_pub.cpp:1277
bool IsNoBioSource() const
bool IsMixedStrands(const CSeq_loc &loc)
bool IsLocalGeneralOnly() const
CBioSourceKind m_biosource_kind
CConstRef< CSeq_entry > m_TSE
CGeneCache m_GeneCache
void x_InitLocCheck(SLocCheck &lc, const string &prefix)
SValidatorContext & SetContext()
Definition: validatorp.cpp:194
bool IsIndexerVersion() const
CGeneCache & GetGeneCache()
bool IsSmallGenomeSet() const
bool IsEmbl() const
void SetProgressCallback(CValidator::TProgressCallback callback, void *user_data)
bool GetTSACDSOnMinusStrandErrors(const CSeq_entry_Handle &se)
void ValidateMultipleTaxIds(const CSeq_entry_Hand