NCBI C++ ToolKit
validatorp.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validatorp.cpp 99998 2023-06-01 19:14:33Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27  *
28  * File Description:
29  * Implementation of private parts of the validator
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
38 
48 
49 #include <serial/iterator.hpp>
50 #include <serial/enumvalues.hpp>
51 
55 
57 
60 
61 #include <objects/seq/Bioseq.hpp>
63 #include <objects/seq/Seqdesc.hpp>
65 #include <objects/seq/Pubdesc.hpp>
66 #include <objects/seq/MolInfo.hpp>
73 
78 
80 
83 
84 #include <objmgr/bioseq_ci.hpp>
85 #include <objmgr/seqdesc_ci.hpp>
86 #include <objmgr/graph_ci.hpp>
87 #include <objmgr/seq_annot_ci.hpp>
88 #include <objmgr/util/feature.hpp>
89 #include <objmgr/util/sequence.hpp>
90 
91 #include <objmgr/feat_ci.hpp>
92 #include <objmgr/align_ci.hpp>
93 #include <objmgr/seq_vector.hpp>
94 #include <objmgr/scope.hpp>
95 
96 #include <objects/pub/Pub.hpp>
98 
110 #include <objects/biblio/Title.hpp>
112 #include <objects/biblio/Affil.hpp>
115 #include <objects/taxon3/taxon3.hpp>
117 
124 
125 #include <objtools/error_codes.hpp>
131 #include <util/sgml_entity.hpp>
132 #include <util/line_reader.hpp>
133 #include <util/util_misc.hpp>
134 #include <util/static_set.hpp>
135 
136 #include <algorithm>
137 
138 
139 #include <serial/iterator.hpp>
140 
141 #define NCBI_USE_ERRCODE_X Objtools_Validator
142 
145 BEGIN_SCOPE(validator)
146 using namespace sequence;
147 
148 namespace {
149  // avoid creating a PQuickStringLess for every comparison
150  PQuickStringLess s_QuickStringLess;
151 };
152 
153 
154 // =============================================================================
155 // CValidError_imp Public
156 // =============================================================================
157 
163 
167 
169 (CObjectManager& objmgr,
170  shared_ptr<SValidatorContext> pContext,
171  CValidError* errs,
172  Uint4 options) :
173  m_ObjMgr{&objmgr},
174  m_ErrRepository{errs},
175  m_pContext{pContext}
176 {
    // The initializer list stores the address of the object manager, the
    // error repository pointer and the shared validator context as given.
    // All option-dependent setup is delegated to x_Init().
    // NOTE(review): the declarator line is missing from this listing;
    // presumably this is the CValidError_imp constructor - confirm.
177  x_Init(options);
178 }
179 
181 {
182  SetOptions(options);
183  Reset();
184 
186 }
187 
188 // Destructor
190 {
191 }
192 
193 
195 {
196  // if (!m_pContext) {
197  // m_pContext = make_shared<SValidatorContext>();
198  // }
200  return *m_pContext;
201 }
202 
203 
205 {
207  return *m_pContext;
208 }
209 
210 
212 {
    // Returns true when either huge-file flag of the validator context
    // (PreprocessHugeFile or PostprocessHugeFile) is set.
    // NOTE(review): signature line is missing from this listing;
    // presumably IsHugeFileMode() - confirm.
213  const auto& context = GetContext();
214  return context.PreprocessHugeFile ||
215  context.PostprocessHugeFile;
216 }
217 
218 
219 bool CValidError_imp::IsHugeSet(const CBioseq_set& bioseqSet) const
220 {
221  if (bioseqSet.IsSetClass()) {
222  return IsHugeSet(bioseqSet.GetClass());
223  }
224  return false;
225 }
226 
227 
229 {
    // Pure forwarder: the decision for `setClass` is made entirely by
    // edit::CHugeAsnReader::IsHugeSet().
230  return edit::CHugeAsnReader::IsHugeSet(setClass);
231 }
232 
233 
234 bool CValidError_imp::IsFarSequence(const CSeq_id& id) // const
235 {
236  if (IsHugeFileMode() && GetContext().IsIdInBlob) {
237  return !GetContext().IsIdInBlob(id);
238  }
239 
240  _ASSERT(m_Scope);
241  if (GetBioseqHandleFromTSE(id)) {
242  return false;
243  }
244  return true;
245 }
246 
247 
249 {
250  if (m_Scope) {
252  }
253  return CBioseq_Handle();
254 }
255 
256 
258 {
    // Resolves `id` to a bioseq handle.
    // Outside huge-file mode the lookup is delegated to
    // GetBioseqHandleFromTSE(). In huge-file mode only ids that
    // IsFarSequence() reports as not far are resolved through m_Scope;
    // for far ids a default-constructed CBioseq_Handle is returned.
    // NOTE(review): signature line is missing from this listing - the
    // function name and constness cannot be confirmed from here.
259  if (!IsHugeFileMode()) {
260  return GetBioseqHandleFromTSE(id);
261  }
262  // Huge-file mode
263  if (!IsFarSequence(id)) {
    // m_Scope is dereferenced without a null check on this path.
264  return m_Scope->GetBioseqHandle(id);
265  }
266  return CBioseq_Handle();
267 }
268 
269 
271 {
272  m_NonASCII = (options & CValidator::eVal_non_ascii) != 0;
275  m_ValidateExons = (options & CValidator::eVal_val_exons) != 0;
276  m_OvlPepErr = (options & CValidator::eVal_ovl_pep_err) != 0;
279  m_RemoteFetch = (options & CValidator::eVal_remote_fetch) != 0;
285  m_UseEntrez = (options & CValidator::eVal_use_entrez) != 0;
299 }
300 
301 
302 //LCOV_EXCL_START
303 //not used by asnvalidate
305 {
    // Stores `errors` in m_ErrRepository, the object on which the error
    // posting code in this file calls AddValidErrItem().
306  m_ErrRepository = errors;
307 }
308 //LCOV_EXCL_STOP
309 
310 
312 {
313  m_Scope = nullptr;
314  m_TSE = nullptr;
315  m_IsStandaloneAnnot = false;
316  m_SeqAnnot.Reset();
317 
318  m_pEntryInfo.reset(new CValidatorEntryInfo());
319 
320  m_IsNC = false;
321  m_IsNG = false;
322  m_IsNM = false;
323  m_IsNP = false;
324  m_IsNR = false;
325  m_IsNZ = false;
326  m_IsNS = false;
327  m_IsNT = false;
328  m_IsNW = false;
329  m_IsWP = false;
330  m_IsXR = false;
331 
332  m_PrgCallback = nullptr;
333  m_NumAlign = 0;
334  m_NumAnnot = 0;
335  m_NumBioseq = 0;
336  m_NumBioseq_set = 0;
338  m_NumDesc = 0;
339  m_NumDescr = 0;
340  m_NumFeat = 0;
341  m_NumGraph = 0;
345  m_NumGenes = 0;
346  m_NumGeneXrefs = 0;
349  m_NumPseudo = 0;
350  m_NumPseudogene = 0;
351  m_FarFetchFailure = false;
352  m_IsTbl2Asn = false;
353 }
354 
355 
356 // Error post methods
358 (EDiagSev sv,
359  EErrType et,
360  const string& msg,
361  const CSerialObject& obj)
362 {
    // Generic entry point: inspects the runtime type info of `obj` and
    // forwards to the PostErr overload for that concrete type.
    // Each dynamic_cast below follows a matching type-info comparison, and
    // its result is dereferenced without a separate null check.
363  const CTypeInfo* type_info = obj.GetThisTypeInfo();
364  if (type_info == CSeqdesc::GetTypeInfo()) {
    // A descriptor has no entry context on this path: a warning is logged
    // and the result of GetTSE() is passed as the context.
365  const CSeqdesc* desc = dynamic_cast < const CSeqdesc* > (&obj);
366  ERR_POST_X(1, Warning << "Seqdesc validation error using default context.");
367  PostErr (sv, et, msg, GetTSE(), *desc);
368  } else if (type_info == CSeq_feat::GetTypeInfo()) {
369  const CSeq_feat* feat = dynamic_cast < const CSeq_feat* > (&obj);
370  PostErr (sv, et, msg, *feat);
371  } else if (type_info == CBioseq::GetTypeInfo()) {
372  const CBioseq* seq = dynamic_cast < const CBioseq* > (&obj);
373  PostErr (sv, et, msg, *seq);
374  } else if (type_info == CBioseq_set::GetTypeInfo()) {
375  const CBioseq_set* set = dynamic_cast < const CBioseq_set* > (&obj);
376  PostErr (sv, et, msg, *set);
377  } else if (type_info == CSeq_annot::GetTypeInfo()) {
378  const CSeq_annot* annot = dynamic_cast < const CSeq_annot* > (&obj);
379  PostErr (sv, et, msg, *annot);
380  } else if (type_info == CSeq_graph::GetTypeInfo()) {
381  const CSeq_graph* graph = dynamic_cast < const CSeq_graph* > (&obj);
382  PostErr (sv, et, msg, *graph);
383  } else if (type_info == CSeq_align::GetTypeInfo()) {
384  const CSeq_align* align = dynamic_cast < const CSeq_align* > (&obj);
385  PostErr (sv, et, msg, *align);
386  } else if (type_info == CSeq_entry::GetTypeInfo()) {
387  const CSeq_entry* entry = dynamic_cast < const CSeq_entry* > (&obj);
388  PostErr (sv, et, msg, *entry);
389  } else if (type_info == CBioSource::GetTypeInfo()) {
390  const CBioSource* src = dynamic_cast < const CBioSource* > (&obj);
391  PostErr (sv, et, msg, *src);
392  } else if (type_info == COrg_ref::GetTypeInfo()) {
393  const COrg_ref* org = dynamic_cast < const COrg_ref* > (&obj);
394  PostErr (sv, et, msg, *org);
395  } else if (type_info == CPubdesc::GetTypeInfo()) {
396  const CPubdesc* pd = dynamic_cast < const CPubdesc* > (&obj);
397  PostErr (sv, et, msg, *pd);
398  } else if (type_info == CSeq_submit::GetTypeInfo()) {
399  const CSeq_submit* ss = dynamic_cast < const CSeq_submit* > (&obj);
400  PostErr (sv, et, msg, *ss);
401  } else {
    // Unrecognised object type: only a diagnostic warning is emitted here;
    // no PostErr overload is called.
402  ERR_POST_X(1, Warning << "Unknown data type in PostErr.");
403  }
404 }
405 
406 
407 /*
408 void CValidError_imp::PostErr
409 (EDiagSev sv,
410  EErrType et,
411  const string& msg,
412  TDesc ds)
413 {
414  // Append Descriptor label
415  string desc = "DESCRIPTOR: ";
416  ds.GetLabel (&desc, CSeqdesc::eBoth);
417  desc += ", NO Descriptor Context";
418  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
419 }
420 */
421 
422 static const EErrType sc_ValidGenomeRaise[] = {
580 };
581 
583 
589 };
590 
592 
593 
596 };
597 
599 
600 
602  EErrType et
603 )
604
605 {
    // Answers whether error type `et` qualifies for a severity raise, by
    // consulting three lookup tables in order.
    // NOTE(review): the tables' contents and this function's name line are
    // missing from this listing; callers in this file use the name
    // RaiseGenomeSeverity(et) - confirm this is that function.

    // Table 1: raise, except for EMBL, DDBJ or RefSeq records.
606  if (sc_GenomeRaiseExceptEmblDdbjRefSeqArray.find(et) != sc_GenomeRaiseExceptEmblDdbjRefSeqArray.end()) {
607  if (IsEmbl() || IsDdbj() || IsRefSeq()) {
608  return false;
609  } else {
610  return true;
611  }
612  }
    // Table 2: raise, except for EMBL or DDBJ records.
613  if (sc_GenomeRaiseExceptEmblDdbjArray.find(et) != sc_GenomeRaiseExceptEmblDdbjArray.end()) {
614  if (IsEmbl() || IsDdbj()) {
615  return false;
616  } else {
617  return true;
618  }
619  }
    // Table 3: raise unconditionally.
620  if (sc_GenomeRaiseArray.find (et) != sc_GenomeRaiseArray.end()) {
621  return true;
622  }
    // Not listed in any table: no raise.
623  return false;
624 }
625 
627 (EDiagSev sv,
628  EErrType et,
629  const string& msg,
630  TFeat ft)
631 {
633 
634  // Adjust severity
636  sv = eDiag_Error;
637  }
638 
639  item->SetSev(sv);
640  item->SetErrIndex(et);
641  item->SetMsg(msg);
642  item->SetObject(ft);
643 
644  if (GenerateGoldenFile()) {
646  return;
647  }
648 
649  string content_label = CValidErrorFormat::GetFeatureContentLabel(ft, m_Scope);
650  item->SetObj_content(content_label);
651 
652  string feature_id = CValidErrorFormat::GetFeatureIdLabel(ft);
653  if (!NStr::IsBlank(feature_id)) {
654  item->SetFeatureId(feature_id);
655  }
656 
658  if (!NStr::IsBlank(bioseq_label)) {
659  item->SetBioseq(bioseq_label);
660  }
661 
662  // Calculate sequence offset
663  TSeqPos offset = 0;
664  string location;
665  if (ft.IsSetLocation()) {
668  if (!NStr::IsBlank(loc_label)) {
669  item->SetLocation(loc_label);
670  }
671  item->SetSeqOffset(offset);
672  }
673 
674 
676  if (!NStr::IsBlank(product_label)) {
677  item->SetProduct_loc(product_label);
678  }
679 
680  int version = 0;
681  string accession;
682  if (m_Scope) {
683  accession = GetAccessionFromObjects(&ft, nullptr, *m_Scope, &version);
684  }
685  item->SetAccession(accession);
686  if (version > 0) {
687  item->SetAccnver(accession + "." + NStr::IntToString(version));
688  item->SetVersion(version);
689  } else {
690  item->SetAccnver(accession);
691  }
692 
693  if (ft.IsSetData()) {
694  if (ft.GetData().IsGene()) {
695  if (ft.GetData().GetGene().IsSetLocus_tag() &&
697  item->SetLocus_tag(ft.GetData().GetGene().GetLocus_tag());
698  }
699  } else {
700  if (m_CollectLocusTags) {
701  // TODO: this should be part of post-processing
703  if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
704  !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
705  item->SetLocus_tag(gene->GetData().GetGene().GetLocus_tag());
706  }
707  }
708  }
709  }
710 
711  item->SetFeatureObjDescFromFields();
713 }
714 
715 
717 (EDiagSev sv,
718  EErrType et,
719  const string& msg,
720  TBioseq sq)
721 {
722  // Adjust severity
724  sv = eDiag_Error;
725  }
726 
727  if (GenerateGoldenFile()) {
728  m_ErrRepository->AddValidErrItem(sv, et, msg);
729  return;
730  }
731 
732  // Append bioseq label
733  string desc;
735  int version = 0;
736  const string& accession = GetAccessionFromBioseq(sq, &version);
737  // GetAccessionFromObjects(&sq, nullptr, *m_Scope, &version);
738  x_AddValidErrItem(sv, et, msg, desc, sq, accession, version);
739 }
740 
741 
743 (EDiagSev sv,
744  EErrType et,
745  const string& msg,
746  TSet st)
747 {
748  // Adjust severity
750  sv = eDiag_Error;
751  }
752 
753  if (GenerateGoldenFile()) {
754  m_ErrRepository->AddValidErrItem(sv, et, msg);
755  return;
756  }
757 
758  // Append Bioseq_set label
759 
760  const auto isSetClass = st.IsSetClass();
761 
762  if (isSetClass && GetContext().PreprocessHugeFile) {
763  if (auto setClass = st.GetClass(); IsHugeSet(setClass)) {
764  string desc =
766  x_AddValidErrItem(sv, et, msg, desc, st, GetContext().HugeSetId, 0);
767  return;
768  }
769  }
770 
771  int version = 0;
772  const string& accession = GetAccessionFromBioseqSet(st, &version);
773  //string desc = CValidErrorFormat::GetBioseqSetLabel(st, m_SuppressContext);
774  string desc = CValidErrorFormat::GetBioseqSetLabel(accession,
775  isSetClass ? st.GetClass() : CBioseq_set::eClass_not_set,
776  isSetClass ? m_SuppressContext : true);
777  x_AddValidErrItem(sv, et, msg, desc, st, accession, version);
778 }
779 
780 
782 (EDiagSev sv,
783  EErrType et,
784  const string& msg,
785  TEntry ctx,
786  TDesc ds)
787 {
788  // Adjust severity
790  sv = eDiag_Error;
791  }
792 
793  if (GenerateGoldenFile()) {
794  m_ErrRepository->AddValidErrItem(sv, et, msg);
795  return;
796  }
797 
798 
799  if (GetContext().PreprocessHugeFile &&
800  ctx.IsSet() && ctx.GetSet().IsSetClass()) {
801  if (auto setClass = ctx.GetSet().GetClass(); IsHugeSet(setClass)) {
802  string desc{"DESCRIPTOR: "};
803  desc += CValidErrorFormat::GetDescriptorContent(ds) + " ";
804  desc += "BIOSEQ-SET: ";
805  if (!m_SuppressContext) {
806  if (setClass == CBioseq_set::eClass_genbank) {
807  desc += "genbank: ";
808  }
809  else {
810  desc += "wgs-set: ";
811  }
812  }
813  desc += GetContext().HugeSetId;
814  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, GetContext().HugeSetId, 0);
815  return;
816  }
817  }
818 
819  // Append Descriptor label
821  int version = 0;
822  const string& accession = GetAccessionFromObjects(&ds, &ctx, *m_Scope, &version);
823  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, ctx, accession, version);
824 }
825 
826 
827 //void CValidError_imp::PostErr
828 //(EDiagSev sv,
829 // EErrType et,
830 // const string& msg,
831 // TBioseq sq,
832 // TDesc ds)
833 //{
834 // // Append Descriptor label
835 // string desc("DESCRIPTOR: ");
836 // ds.GetLabel(&desc, CSeqdesc::eBoth);
837 //
838 // s_AppendBioseqLabel(desc, sq, m_SuppressContext);
839 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
840 // //PostErr(sv, et, msg, sq);
841 //}
842 
843 
844 //void CValidError_imp::PostErr
845 //(EDiagSev sv,
846 // EErrType et,
847 // const string& msg,
848 // TSet st,
849 // TDesc ds)
850 //{
851 // // Append Descriptor label
852 // string desc = " DESCRIPTOR: ";
853 // ds.GetLabel(&desc, CSeqdesc::eBoth);
854 // s_AppendSetLabel(desc, st, m_SuppressContext);
855 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, *m_Scope);
856 //
857 //}
858 
859 
861 (EDiagSev sv,
862  EErrType et,
863  const string& msg,
864  TAnnot an)
865 {
866  // Adjust severity
868  sv = eDiag_Error;
869  }
870 
871  if (GenerateGoldenFile()) {
872  m_ErrRepository->AddValidErrItem(sv, et, msg);
873  return;
874  }
875 
876  // Append Annotation label
877  string desc = "ANNOTATION: ";
878 
879  // !!! need to decide on the message
880 
881  int version = 0;
882  const string& accession = GetAccessionFromObjects(&an, nullptr, *m_Scope, &version);
883  x_AddValidErrItem(sv, et, msg, desc, an, accession, version);
884 }
885 
886 
888 (EDiagSev sv,
889  EErrType et,
890  const string& msg,
891  TGraph graph)
892 {
893  // Adjust severity
895  sv = eDiag_Error;
896  }
897 
898  if (GenerateGoldenFile()) {
899  m_ErrRepository->AddValidErrItem(sv, et, msg);
900  return;
901  }
902 
903  // Append Graph label
904  string desc = "GRAPH: ";
905  if (graph.IsSetTitle()) {
906  desc += graph.GetTitle();
907  } else {
908  desc += "<Unnamed>";
909  }
910  desc += " ";
911  graph.GetLoc().GetLabel(&desc);
912 
913  int version = 0;
914  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
915  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
916 }
917 
918 
920 (EDiagSev sv,
921  EErrType et,
922  const string& msg,
923  TBioseq sq,
924  TGraph graph)
925 {
926  // Adjust severity
928  sv = eDiag_Error;
929  }
930 
931  if (GenerateGoldenFile()) {
932  m_ErrRepository->AddValidErrItem(sv, et, msg);
933  return;
934  }
935 
936  // Append Graph label
937  string desc("GRAPH: ");
938  if ( graph.IsSetTitle() ) {
939  desc += graph.GetTitle();
940  } else {
941  desc += "<Unnamed>";
942  }
943  desc += " ";
944  graph.GetLoc().GetLabel(&desc);
946  int version = 0;
947  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
948  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
949 }
950 
951 
953 (EDiagSev sv,
954  EErrType et,
955  const string& msg,
956  TAlign align)
957 {
958  // Adjust severity
960  sv = eDiag_Error;
961  }
962 
963  if (GenerateGoldenFile()) {
964  m_ErrRepository->AddValidErrItem(sv, et, msg);
965  return;
966  }
967 
969  if (id) {
971  if (bsh) {
972  PostErr(sv, et, msg, *(bsh.GetCompleteBioseq()));
973  return;
974  }
975  }
976 
977  // Can't get bioseq for reporting, use other Alignment label
978  string desc = "ALIGNMENT: ";
979  if (align.IsSetType()) {
980  desc += align.ENUM_METHOD_NAME(EType)()->FindName(align.GetType(), true);
981  }
982  try {
983  CSeq_align::TDim dim = align.GetDim();
984  desc += ", dim=" + NStr::NumericToString(dim);
985  } catch ( const CUnassignedMember& ) {
986  desc += ", dim=UNASSIGNED";
987  }
988 
989  if (align.IsSetSegs()) {
990  desc += " SEGS: ";
991  desc += align.GetSegs().SelectionName(align.GetSegs().Which());
992  }
993 
994  int version = 0;
995  const string& accession = GetAccessionFromObjects(&align, nullptr, *m_Scope, &version);
996  x_AddValidErrItem(sv, et, msg, desc, align, accession, version);
997 }
998 
999 
1001 (EDiagSev sv,
1002  EErrType et,
1003  const string& msg,
1004  TEntry entry)
1005 {
    // Posts an error whose subject is a Seq-entry.
1006  // Adjust severity
    // Severities below eDiag_Error are bumped to eDiag_Error when
    // m_genomeSubmission is set and RaiseGenomeSeverity(et) says so.
1007  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1008  sv = eDiag_Error;
1009  }
1010
    // Golden-file mode: only severity, error type and message are recorded.
1011  if (GenerateGoldenFile()) {
1012  m_ErrRepository->AddValidErrItem(sv, et, msg);
1013  return;
1014  }
1015
    // A Seq or Set entry is reported through the PostErr overload for the
    // contained object; anything else gets a "SEQ-ENTRY: " label of its own.
1016  if (entry.IsSeq()) {
1017  PostErr(sv, et, msg, entry.GetSeq());
1018  } else if (entry.IsSet()) {
1019  PostErr(sv, et, msg, entry.GetSet());
1020  } else {
1021  string desc = "SEQ-ENTRY: ";
1022  entry.GetLabel(&desc, CSeq_entry::eContent);
1023
    // m_Scope is dereferenced here without a null check.
1024  int version = 0;
1025  const string& accession = GetAccessionFromObjects(&entry, nullptr, *m_Scope, &version);
1026  x_AddValidErrItem(sv, et, msg, desc, entry, accession, version);
1027  }
1028 }
1029 
1030 
1032 (EDiagSev sv,
1033  EErrType et,
1034  const string& msg,
1035  const CBioSource& src)
1036 {
    // Posts an error whose subject is a BioSource. The item is labelled
    // "BioSource: " and carries an empty accession with version 0.
1037  // Adjust severity
    // Severities below eDiag_Error are bumped to eDiag_Error when
    // m_genomeSubmission is set and RaiseGenomeSeverity(et) says so.
1038  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1039  sv = eDiag_Error;
1040  }
1041
    // Golden-file mode: only severity, error type and message are recorded.
1042  if (GenerateGoldenFile()) {
1043  m_ErrRepository->AddValidErrItem(sv, et, msg);
1044  return;
1045  }
1046
1047  string desc = "BioSource: ";
1048  x_AddValidErrItem(sv, et, msg, desc, src, "", 0);
1049 }
1050 
1051 
1053 (EDiagSev sv,
1054  EErrType et,
1055  const string& msg,
1056  const COrg_ref& org)
1057 {
    // Posts an error whose subject is an Org-ref. The item is labelled
    // "Org-ref: " and carries an empty accession with version 0.
1058  // Adjust severity
    // Severities below eDiag_Error are bumped to eDiag_Error when
    // m_genomeSubmission is set and RaiseGenomeSeverity(et) says so.
1059  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1060  sv = eDiag_Error;
1061  }
1062
    // Golden-file mode: only severity, error type and message are recorded.
1063  if (GenerateGoldenFile()) {
1064  m_ErrRepository->AddValidErrItem(sv, et, msg);
1065  return;
1066  }
1067
1068  string desc = "Org-ref: ";
1069  x_AddValidErrItem(sv, et, msg, desc, org, "", 0);
1070 }
1071 
1072 
1074 (EDiagSev sv,
1075  EErrType et,
1076  const string& msg,
1077  const CPubdesc& pd)
1078 {
    // Posts an error whose subject is a Pubdesc. The item is labelled
    // "Pubdesc: " and carries an empty accession with version 0.
1079  // Adjust severity
    // Severities below eDiag_Error are bumped to eDiag_Error when
    // m_genomeSubmission is set and RaiseGenomeSeverity(et) says so.
1080  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1081  sv = eDiag_Error;
1082  }
1083
    // Golden-file mode: only severity, error type and message are recorded.
1084  if (GenerateGoldenFile()) {
1085  m_ErrRepository->AddValidErrItem(sv, et, msg);
1086  return;
1087  }
1088
1089  string desc = "Pubdesc: ";
1090  x_AddValidErrItem(sv, et, msg, desc, pd, "", 0);
1091 }
1092 
1093 
1095 (EDiagSev sv,
1096  EErrType et,
1097  const string& msg,
1098  const CSeq_submit& ss)
1099 {
    // Posts an error whose subject is a Seq-submit. The item is labelled
    // "Seq-submit: " and carries an empty accession with version 0.
1100  // Adjust severity
    // Severities below eDiag_Error are bumped to eDiag_Error when
    // m_genomeSubmission is set and RaiseGenomeSeverity(et) says so.
1101  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1102  sv = eDiag_Error;
1103  }
1104
    // Golden-file mode: only severity, error type and message are recorded.
1105  if (GenerateGoldenFile()) {
1106  m_ErrRepository->AddValidErrItem(sv, et, msg);
1107  return;
1108  }
1109
1110  string desc = "Seq-submit: ";
1111  x_AddValidErrItem(sv, et, msg, desc, ss, "", 0);
1112 }
1113 
1114 
1116  EDiagSev sev,
1117  EErrType type,
1118  const string& msg,
1119  const string& desc,
1120  const CSerialObject& obj,
1121  const string& accession,
1122  const int version)
1123 {
    // Single funnel through which labelled error items reach the repository.
    // In huge-file mode the item is added WITHOUT `obj` (description,
    // accession and version only); otherwise `obj` is passed along as well.
    // NOTE(review): the reason for dropping `obj` in huge-file mode is not
    // visible in this listing - confirm against the repository API.
1124  if (IsHugeFileMode()) {
1125  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, accession, version);
1126  return;
1127  }
1128  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, obj, accession, version);
1129 }
1130 
1131 
1133 (EDiagSev sv,
1134  EErrType et,
1135  const string& msg,
1136  const CSerialObject& obj,
1137  const CSeq_entry *ctx)
1138 {
    // Posts an error for `obj`, using the entry context `ctx` only when it
    // is available AND `obj` is a Seqdesc. In every other case (no context,
    // or a non-descriptor object) the generic PostErr(sv, et, msg, obj) is
    // used and `ctx` is ignored.
1139  if (!ctx) {
1140  PostErr (sv, et, msg, obj);
1141  } else if (obj.GetThisTypeInfo() == CSeqdesc::GetTypeInfo()) {
    // The type-info comparison above precedes this cast; its result is
    // dereferenced without a separate null check.
1142  PostErr(sv, et, msg, *ctx, *(dynamic_cast <const CSeqdesc*> (&obj)));
1143  } else {
1144  PostErr(sv, et, msg, obj);
1145  }
1146
1147 }
1148 
1149 
1151 (EDiagSev sv,
1152  const string& msg,
1153  int flags,
1154  const CSerialObject& obj,
1155  const CSeq_entry *ctx)
1156 {
    // Reports a bad-date problem as eErr_GENERIC_BadDate through PostObjErr().
    // The posted text is "<msg> - <reasons>", where <reasons> is the
    // space-trimmed result of GetDateErrorDescription(flags).
    // NOTE(review): function name line is missing from this listing.
1157  string reasons = GetDateErrorDescription(flags);
1158
1159  NStr::TruncateSpacesInPlace (reasons);
1160  reasons = msg + " - " + reasons;
1161
1162  PostObjErr (sv, eErr_GENERIC_BadDate, reasons, obj, ctx);
1163 }
1164 
1165 
1167 (const CSeq_entry& se,
1168  const CCit_sub* cs,
1169  CScope* scope)
1170 {
    // Obtains a Seq-entry handle for `se` in `scope` and hands it, together
    // with `cs`, to the handle-based Validate(); that call's result is
    // returned. Returns false if no handle can be obtained.
    // `scope` is dereferenced without a null check.
1171  CSeq_entry_Handle seh;
1172  try {
1173  seh = scope->GetSeq_entryHandle(se);
    // A CException from the lookup is deliberately swallowed; the empty
    // handle then triggers the fallback below.
1174  } catch (const CException& ) { ; }
1175  if (! seh) {
    // Fallback: add `se` to the scope as a top-level entry.
1176  seh = scope->AddTopLevelSeqEntry(se);
1177  if (!seh) {
1178  return false;
1179  }
1180  }
1181
1182  return Validate(seh, cs);
1183 }
1184 
1185 static bool s_IsPhage(const COrg_ref& org)
1186 {
1187  if (org.IsSetDivision() && NStr::Equal(org.GetDivision(), "PHG")) {
1188  return true;
1189  } else {
1190  return false;
1191  }
1192 }
1193 
1194 
1196 {
1197  bool has_mult = false;
1198  int first_id = 0;
1199  int phage_id = 0;
1200 
1201  for (CBioseq_CI bi(seh); bi; ++bi) {
1202  for (CSeqdesc_CI desc_ci(*bi, CSeqdesc::e_Source);
1203  desc_ci && !has_mult;
1204  ++desc_ci) {
1205  if (desc_ci->GetSource().IsSetOrg()) {
1206  const COrg_ref& org = desc_ci->GetSource().GetOrg();
1207  if (org.IsSetDb()) {
1208  ITERATE(COrg_ref::TDb, it, org.GetDb()) {
1209  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "taxon") &&
1210  (*it)->IsSetTag() && (*it)->GetTag().IsId()) {
1211  int this_id = (*it)->GetTag().GetId();
1212  if (this_id > 0) {
1213  if (s_IsPhage(org)) {
1214  phage_id = this_id;
1215  } else if (first_id == 0) {
1216  first_id = this_id;
1217  } else if (first_id != this_id) {
1218  has_mult = true;
1219  }
1220  }
1221  }
1222  }
1223  }
1224  }
1225  }
1226  }
1227  if (has_mult || (phage_id > 0 && first_id > 0)) {
1229  "There are multiple taxonIDs in this RefSeq record.",
1230  *m_TSE);
1231  }
1232 }
1233 
1234 
1236 {
    // Returns the entry-info object by reference. m_pEntryInfo is
    // dereferenced directly; there is no null check on this path.
    // NOTE(review): signature line is missing from this listing.
1237  return *m_pEntryInfo;
1238 }
1239 
1240 
1242 {
    // Returns the entry-info object by reference, creating a fresh
    // CValidatorEntryInfo first if m_pEntryInfo is currently null.
    // NOTE(review): signature line is missing from this listing.
1243  if (!m_pEntryInfo) {
1244  m_pEntryInfo.reset(new CValidatorEntryInfo());
1245  }
1246
1247  return *m_pEntryInfo;
1248 }
1249 
1250 
1252 (const CSeq_entry_Handle& seh,
1253  const CCit_sub* cs)
1254 {
1255  _ASSERT(seh);
1256 
1257  if ( m_PrgCallback ) {
1259  if ( m_PrgCallback(&m_PrgInfo) ) {
1260  return false;
1261  }
1262  }
1263 
1264  // Check that CSeq_entry has data
1265  if (seh.Which() == CSeq_entry::e_not_set) {
1266  ERR_POST_X(2, Warning << "Seq_entry not set");
1267  return false;
1268  }
1269 
1270  Setup(seh);
1271 
1272  // Seq-submit has a submission citation
1273  if (cs) {
1274  x_SetEntryInfo().SetNoPubs(false);
1276  }
1277 
1278  // Get first CBioseq object pointer for PostErr below.
1280  if (!seq) {
1282  "No Bioseqs in this entire record.", seh.GetCompleteSeq_entry()->GetSet());
1283  return true;
1284  }
1285 
1286  // If m_NonASCII is true, then this flag was set by the caller
1287  // of validate to indicate that a non ascii character had been
1288  // read from a file being used to create a CSeq_entry, that the
1289  // error had been corrected, but that the error needs to be reported
1290  // by Validate. Note, Validate is not doing anything other than
1291  // reporting an error if m_NonASCII is true;
1292  if (m_NonASCII) {
1294  "Non-ascii chars in input ASN.1 strings", *seq);
1295  // Only report the error once
1296  m_NonASCII = false;
1297  }
1298 
1299  // Iterate thru components of record and validate each
1300 
1301  // also want to know if we have gi
1302  bool has_gi = false;
1303  // also want to know if there are any nucleotide sequences
1304  bool has_nucleotide_sequence = false;
1305 
1307  bi && (!IsINSDInSep() || !has_gi || !has_nucleotide_sequence);
1308  ++bi) {
1309  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1310  if ((*it)->IsGi()) {
1311  has_gi = true;
1312  }
1313  }
1314  if (bi->IsSetInst_Mol() && bi->IsNa()) {
1315  has_nucleotide_sequence = true;
1316  }
1317  }
1318 
1319  if (IsINSDInSep() && m_pEntryInfo->IsRefSeq()) {
1320  // NOTE: We use m_pEntryInfo->IsRefSeq() to indicate the actual presence of RefSeq IDs in
1321  // the record, rather than IsRefSeq(), which indicates *either* RefSeq IDs are
1322  // present *OR* the refseq flag has been used
1324  "INSD and RefSeq records should not be present in the same set", *m_TSE);
1325  }
1326 
1327 #if 0
1328  // disabled for now
1329  // look for long IDs that would collide if truncated at 30 characters
1330  vector<string> id_strings;
1332  bi;
1333  ++bi) {
1334  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1335  if (!IsNCBIFILESeqId(**it)) {
1336  string label;
1337  (*it)->GetLabel(&label);
1338  id_strings.push_back(label);
1339  }
1340  }
1341  }
1342  stable_sort (id_strings.begin(), id_strings.end());
1343  for (vector<string>::iterator id_str_it = id_strings.begin();
1344  id_str_it != id_strings.end();
1345  ++id_str_it) {
1346  string pattern = (*id_str_it).substr(0, 30);
1347  string first_id = *id_str_it;
1348  vector<string>::iterator cmp_it = id_str_it;
1349  ++cmp_it;
1350  while (cmp_it != id_strings.end() && NStr::StartsWith(*cmp_it, pattern)) {
1351  CRef<CSeq_id> id(new CSeq_id(*cmp_it));
1354  "First 30 characters of " + first_id + " and " +
1355  *cmp_it + " are identical", *(bsh.GetCompleteBioseq()));
1356  ++id_str_it;
1357  ++cmp_it;
1358  }
1359  }
1360 #endif
1361 
1362  // look for colliding feature IDs
1363  vector < int > feature_ids;
1364  for (CFeat_CI fi(GetTSEH()); fi; ++fi) {
1365  const CSeq_feat& sf = fi->GetOriginalFeature();
1366  if (sf.IsSetId() && sf.GetId().IsLocal() && sf.GetId().GetLocal().IsId()) {
1367  feature_ids.push_back(sf.GetId().GetLocal().GetId());
1368  }
1369  }
1370 
1371  if (feature_ids.size() > 0) {
1372  const CTSE_Handle& tse = seh.GetTSE_Handle ();
1373  stable_sort (feature_ids.begin(), feature_ids.end());
1374  vector <int>::iterator it = feature_ids.begin();
1375  int id = *it;
1376  ++it;
1377  while (it != feature_ids.end()) {
1378  if (*it == id) {
1379  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, id);
1380  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1382  "Colliding feature ID " + NStr::NumericToString (id), *(feat_it->GetSeq_feat()));
1383  }
1384  while (it != feature_ids.end() && *it == id) {
1385  ++it;
1386  }
1387  if (it != feature_ids.end()) {
1388  id = *it;
1389  ++it;
1390  }
1391  } else {
1392  id = *it;
1393  ++it;
1394  }
1395  }
1396  }
1397 
1398  // look for mixed gps and non-gps sets
1399  bool has_nongps = false;
1400  bool has_gps = false;
1401 
1402  for (CTypeConstIterator<CBioseq_set> si(*m_TSE); si && (!has_nongps || !has_gps); ++si) {
1403  if (si->IsSetClass()) {
1404  if (si->GetClass() == CBioseq_set::eClass_mut_set
1405  || si->GetClass() == CBioseq_set::eClass_pop_set
1406  || si->GetClass() == CBioseq_set::eClass_phy_set
1407  || si->GetClass() == CBioseq_set::eClass_eco_set
1408  || si->GetClass() == CBioseq_set::eClass_wgs_set
1409  || si->GetClass() == CBioseq_set::eClass_small_genome_set) {
1410  has_nongps = true;
1411  } else if (si->GetClass() == CBioseq_set::eClass_gen_prod_set) {
1412  has_gps = true;
1413  }
1414  }
1415  }
1416 
1417  if (has_nongps && has_gps) {
1419  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set",
1420  *m_TSE);
1421  }
1422 
1423  // count inference accessions - if there are too many, temporarily disable inference checking
1424  bool old_inference_acc_check = m_ValidateInferenceAccessions;
1426  size_t num_inferences = 0, num_accessions = 0;
1427  CFeat_CI feat_inf(seh);
1428  while (feat_inf) {
1429  FOR_EACH_GBQUAL_ON_FEATURE (qual, *feat_inf) {
1430  if ((*qual)->IsSetQual() && (*qual)->IsSetVal() && NStr::Equal((*qual)->GetQual(), "inference")) {
1431  num_inferences++;
1432  string prefix, remainder;
1433  bool same_species;
1434  vector<string> accessions = CValidError_feat::GetAccessionsFromInferenceString ((*qual)->GetVal(), prefix, remainder, same_species);
1435  for (size_t i = 0; i < accessions.size(); i++) {
1436  NStr::TruncateSpacesInPlace (accessions[i]);
1437  string acc_prefix, accession;
1438  if (CValidError_feat::GetPrefixAndAccessionFromInferenceAccession (remainder, acc_prefix, accession)) {
1439  if (NStr::EqualNocase (acc_prefix, "INSD") || NStr::EqualNocase (acc_prefix, "RefSeq")) {
1440  num_accessions++;
1441  }
1442  }
1443  }
1444  }
1445  }
1446  ++feat_inf;
1447  }
1448  if (/* num_inferences > 1000 || */ num_accessions > 1000) {
1449  // warn about too many inferences
1451  "Skipping validation of " + NStr::SizetToString (num_inferences) + " /inference qualifiers with "
1452  + NStr::SizetToString (num_accessions) + " accessions",
1453  *m_TSE);
1454 
1455  // disable inference checking
1457  }
1458  }
1459 
1460  // validate the main data
1461  if (seh.IsSeq()) {
1462  const CBioseq& seq = seh.GetCompleteSeq_entry()->GetSeq();
1463  CValidError_bioseq bioseq_validator(*this);
1464  try {
1465  bioseq_validator.ValidateBioseq(seq);
1466  } catch ( const exception& e ) {
1468  string("Exception while validating bioseq. EXCEPTION: ") +
1469  e.what(), seq);
1470  return true;
1471  }
1472  } else if (seh.IsSet()) {
1473  const CBioseq_set& set = seh.GetCompleteSeq_entry()->GetSet();
1474  CValidError_bioseqset bioseqset_validator(*this);
1475 
1476  try {
1477  bioseqset_validator.ValidateBioseqSet(set);
1478 
1479  } catch ( const exception& e ) {
1481  string("Exception while validating bioseq set. EXCEPTION: ") +
1482  e.what(), set);
1483  return true;
1484  }
1485  }
1486 
1487  // put flag for validating inference accessions back to original value
1488  m_ValidateInferenceAccessions = old_inference_acc_check;
1489 
1490  // validation from data collected during previous step
1491 
1492  if ( m_NumTpaWithHistory > 0 &&
1493  m_NumTpaWithoutHistory > 0 ) {
1495  "There are " +
1497  " TPAs with history and " +
1499  " without history in this record.", *seq);
1500  }
1501  if ( m_NumTpaWithoutHistory > 0 && has_gi) {
1503  "There are " +
1505  " TPAs without history in this record, but the record has a gi number assignment.", *m_TSE);
1506  }
1507  if (IsIndexerVersion() && DoesAnyProteinHaveGeneralID() && !IsRefSeq() && has_nucleotide_sequence) {
1508  call_once(SetContext().ProteinHaveGeneralIDOnceFlag,
1509  [] (CValidError_imp* imp, CSeq_entry_Handle seh) {
1511  "INDEXER_ONLY - Protein bioseqs have general seq-id.",
1512  *(seh.GetCompleteSeq_entry()));
1513  }, this, seh);
1514  }
1515 
1516  ReportMissingPubs(*m_TSE, cs);
1518 
1519  if (m_NumMisplacedFeatures > 1) {
1521  "There are " + NStr::SizetToString (m_NumMisplacedFeatures) + " mispackaged features in this record.",
1522  *(seh.GetCompleteSeq_entry()));
1523  } else if (m_NumMisplacedFeatures == 1) {
1525  "There is 1 mispackaged feature in this record.",
1526  *(seh.GetCompleteSeq_entry()));
1527  }
1528  if (m_NumSmallGenomeSetMisplaced > 1) {
1530  "There are " + NStr::SizetToString (m_NumSmallGenomeSetMisplaced) + " mispackaged features in this small genome set record.",
1531  *(seh.GetCompleteSeq_entry()));
1532  } else if (m_NumSmallGenomeSetMisplaced == 1) {
1534  "There is 1 mispackaged feature in this small genome set record.",
1535  *(seh.GetCompleteSeq_entry()));
1536  }
1537  if ( m_NumGenes == 0 &&
1538  m_NumGeneXrefs > 0 ) {
1540  "There are " + NStr::SizetToString(m_NumGeneXrefs) +
1541  " gene xrefs and no gene features in this record.", *m_TSE);
1542  }
1543  ValidateCitations (seh);
1544 
1545 
1546  if ( m_NumMisplacedGraphs > 0 ) {
1549  string("There ") + ((m_NumMisplacedGraphs > 1) ? "are " : "is ") + num +
1550  " mispackaged graph" + ((m_NumMisplacedGraphs > 1) ? "s" : "") + " in this record.",
1551  *m_TSE);
1552  }
1553 
1554  if ( IsRefSeq() && ! IsWP() ) {
1556  }
1557 
1558 
1561  if (!GetContext().PreprocessHugeFile) {
1563  }
1564 
1565  if (m_FarFetchFailure) {
1567  "Far fetch failures caused some validator tests to be bypassed",
1568  *m_TSE);
1569  }
1570 
1571  if (m_DoTaxLookup) {
1573  }
1574 
1575  // validate cit-sub
1576  if (cs) {
1578  }
1579 
1580  // optional barcode tests
1581  if (m_DoBarcodeTests) {
1582  x_DoBarcodeTests(seh);
1583  }
1584  return true;
1585 }
1586 
1587 
1589 {
      // NOTE(review): the signature line (listing line 1588) is missing from
      // this extract. Call sites in this file invoke
      // ValidateSubmitBlock(ss.GetSub(), ss), so `block` is presumably the
      // submit block and `ss` the owning Seq-submit -- confirm in the header.
      //
      // Check 1: the Hup flag is set and true, a release date is present, and
      // IsDateInPast() reports that the date has already passed.
1590  if (block.IsSetHup() && block.GetHup() && block.IsSetReldate() &&
1591  IsDateInPast(block.GetReldate())) {
      // NOTE(review): listing line 1592 (the opening of the call that takes
      // the message below) is missing from this extract.
1593  "Record release date has already passed", ss);
1594  }
1595 
      // Check 2: when the contact has an affiliation in Std form, pass it to
      // ValidateAffil() with `ss` as the object and a null last argument.
1596  if (block.IsSetContact() && block.GetContact().IsSetContact()
1597  && block.GetContact().GetContact().IsSetAffil()
1598  && block.GetContact().GetContact().GetAffil().IsStd()) {
1599  ValidateAffil(block.GetContact().GetContact().GetAffil().GetStd(), ss, nullptr);
1600  }
1601 }
1602 
1603 
1605  const CSeq_submit& ss, CScope* scope)
1606 {
      // Validates a Seq-submit: checks the submit block (if set), then runs
      // Validate(se, cs, scope) on every Seq-entry it carries.
      //
      // NOTE(review): the first signature line (listing line 1604) and listing
      // lines 1612, 1627, 1639, 1645 and 1653 are missing from this extract,
      // so some statements below appear without their opening line.
1607  // Check that ss is type e_Entrys
1608  if ( ss.GetData().Which() != CSeq_submit::C_Data::e_Entrys ) {
1609  return;
1610  }
1611 
1613  if (ss.IsSetSub()) {
      // In huge-file mode the submit block is validated at most once, guarded
      // by SubmitBlockOnceFlag on the context; otherwise on every call.
1614  if (IsHugeFileMode()) {
1615  call_once(SetContext().SubmitBlockOnceFlag,
1616  [this, &ss](){ ValidateSubmitBlock(ss.GetSub(), ss); });
1617  }
1618  else {
1619  ValidateSubmitBlock(ss.GetSub(), ss);
1620  }
1621  }
1622 
1623  // Get CCit_sub pointer
      // NOTE(review): GetSub() is called here without the IsSetSub() guard
      // used above and below -- confirm this is safe when Sub is unset.
1624  const CCit_sub* cs = &ss.GetSub().GetCit();
1625 
      // The body of this branch (listing line 1627) is missing from the
      // extract; it runs when the submitting tool name starts with "Geneious".
1626  if (ss.IsSetSub() && ss.GetSub().IsSetTool() && NStr::StartsWith(ss.GetSub().GetTool(), "Geneious")) {
1628  }
1629 
1630  // Just loop thru CSeq_entrys
1631  FOR_EACH_SEQENTRY_ON_SEQSUBMIT (se_itr, ss) {
1632  const CSeq_entry& se = **se_itr;
1633  if(se.IsSet())
1634  {
1635  const CBioseq_set &set = se.GetSet();
1636  if(set.IsSetClass() &&
1637  set.GetClass() == CBioseq_set::eClass_wgs_set)
1638  {
      // A wgs-set inside a Seq-submit is reported with the message below. The
      // first branch reports through call_once on WgsSetInSeqSubmitOnceFlag;
      // the else branch reports directly. The condition selecting between
      // them is on missing listing line 1639.
1640  CSeq_entry_Handle seh;
1641  seh = scope->GetSeq_entryHandle(se);
1642  Setup(seh);
1643  call_once(SetContext().WgsSetInSeqSubmitOnceFlag,
1644  [this, seh]() {
1646  "File was created as a wgs-set, but should be a batch submission instead.",
1647  seh.GetCompleteSeq_entry()->GetSet());
1648  });
1649  } else {
1650  CSeq_entry_Handle seh;
1651  seh = scope->GetSeq_entryHandle(se);
1652  Setup(seh);
1654  "File was created as a wgs-set, but should be a batch submission instead.",
1655  seh.GetCompleteSeq_entry()->GetSet());
1656  }
1657  }
1658  }
      // Every entry, wgs-set or not, goes through the entry-level overload
      // with the Cit-sub pointer taken from the submit block.
1659  Validate (se, cs, scope);
1660  }
1661 }
1662 
1663 
1665  const CSeq_annot_Handle& sah)
1666 {
      // Validates a Seq-annot handle: runs the annot-level validator, then
      // dispatches on sah.Which() to validate each feature, alignment or graph.
      //
      // NOTE(review): the first signature line (listing line 1664), the three
      // `case` labels (listing lines 1675, 1685, 1698) and listing lines
      // 1711-1713 are missing from this extract. Which case guards which
      // block is inferred from the iterator used inside it (CFeat_CI,
      // CAlign_CI, CGraph_CI).
1667  Setup(sah);
1668 
1669  // Iterate thru components of record and validate each
1670 
1671  CValidError_annot annot_validator(*this);
1672  annot_validator.ValidateSeqAnnot(sah);
1673 
1674  switch (sah.Which()) {
1676  {
      // Features: validate the original (unmapped) feature of each hit.
1677  CValidError_feat feat_validator(*this);
1678  for (CFeat_CI fi (sah); fi; ++fi) {
1679  const CSeq_feat& sf = fi->GetOriginalFeature();
1680  feat_validator.ValidateSeqFeat(sf);
1681  }
1682  }
1683  break;
1684 
1686  {
      // Alignments are only checked when IsValidateAlignments() is true;
      // `order` is a 1-based running index passed to ValidateSeqAlign().
1687  if (IsValidateAlignments()) {
1688  CValidError_align align_validator(*this);
1689  int order = 1;
1690  for (CAlign_CI ai(sah); ai; ++ai) {
1691  const CSeq_align& sa = ai.GetOriginalSeq_align();
1692  align_validator.ValidateSeqAlign(sa, order++);
1693  }
1694  }
1695  }
1696  break;
1697 
1699  {
      // Graphs: validate the original graph of each hit.
1700  CValidError_graph graph_validator(*this);
1701  // for (CTypeConstIterator <CSeq_graph> gi (sa); gi; ++gi) {
1702  for (CGraph_CI gi(sah); gi; ++gi) {
1703  const CSeq_graph& sg = gi->GetOriginalGraph();
1704  graph_validator.ValidateSeqGraph(sg);
1705  }
1706  }
1707  break;
1708  default:
1709  break;
1710  }
1714 }
1715 
1716 
// Validates a single feature outside of any record context.
//
// feat  - feature to validate.
// scope - optional scope; when non-null it replaces m_Scope for this call.
//
// NOTE(review): listing lines 1733, 1739 and 1744 are missing from this
// extract (the declaration of `empty`, the body of the IsSetOrg() branch and
// the last statement before the closing brace).
1717 void CValidError_imp::Validate(const CSeq_feat& feat, CScope* scope)
1718 {
1719  // automatically restores m_Scope to its old value when we leave
1720  // the function
1721  CScopeRestorer scopeRestorer( m_Scope );
1722 
1723  if( scope ) {
1724  m_Scope.Reset(scope);
1725  }
1726  if (!m_Scope) {
1727  // set up a temporary local scope if there is no scope set already
1728  m_Scope.Reset(new CScope(*m_ObjMgr));
1729  }
1730 
      // The feature validator is pointed at the active scope and at `empty`
      // as its TSE before the feature is checked.
1731  CValidError_feat feat_validator(*this);
1732  feat_validator.SetScope(*m_Scope);
1734  feat_validator.SetTSE(empty);
1735  feat_validator.ValidateSeqFeat(feat);
      // BioSource features with an organism get an extra step whose statement
      // is on the missing listing line 1739.
1736  if (feat.IsSetData() && feat.GetData().IsBiosrc()) {
1737  const CBioSource& src = feat.GetData().GetBiosrc();
1738  if (src.IsSetOrg()) {
1740  }
1741  }
1742  FindEmbeddedScript(feat);
1743  FindNonAsciiText(feat);
1745 }
1746 
1747 
1749 {
      // Validates a stand-alone BioSource (`src`); `scope`, when non-null,
      // replaces m_Scope for the duration of the call.
      //
      // NOTE(review): the signature line (listing line 1748), the body of the
      // IsSetOrg() branch (1764) and the final statement (1768) are missing
      // from this extract.
1750  // automatically restores m_Scope to its old value when we leave
1751  // the function
1752  CScopeRestorer scopeRestorer( m_Scope );
1753 
1754  if( scope ) {
1755  m_Scope.Reset(scope);
1756  }
1757  if (!m_Scope) {
1758  // set up a temporary local scope if there is no scope set already
1759  m_Scope.Reset(new CScope(*m_ObjMgr));
1760  }
1761 
      // `src` is passed twice: once as the data to check and once as the
      // object the check is given alongside it.
1762  ValidateBioSource(src, src);
1763  if (src.IsSetOrg()) {
1765  }
1766  FindEmbeddedScript(src);
1767  FindNonAsciiText(src);
1769 }
1770 
1771 
1772 void CValidError_imp::Validate(const CPubdesc& pubdesc, CScope* scope)
1773 {
1774  // automatically restores m_Scope to its old value when we leave
1775  // the function
1776  CScopeRestorer scopeRestorer( m_Scope );
1777 
1778  if( scope ) {
1779  m_Scope.Reset(scope);
1780  }
1781  if (!m_Scope) {
1782  // set up a temporary local scope if there is no scope set already
1783  m_Scope.Reset(new CScope(*m_ObjMgr));
1784  }
1785 
1786  ValidatePubdesc(pubdesc, pubdesc);
1787  FindEmbeddedScript(pubdesc);
1788  FindNonAsciiText(pubdesc);
1789  FindCollidingSerialNumbers(pubdesc);
1790 }
1791 
1793 {
      // Validates a single descriptor `desc` in the context `ctx` using a
      // descriptor validator and a freshly created scope.
      //
      // NOTE(review): the signature line (listing line 1792) and listing line
      // 1796 are missing from this extract. Unlike the sibling overloads, no
      // CScopeRestorer is visible here, so m_Scope appears to stay replaced
      // after the call -- confirm against the full source.
1794  CValidError_desc seqdesc_validator(*this);
1795  m_Scope.Reset(new CScope(*m_ObjMgr));
1797  seqdesc_validator.ValidateSeqDesc(desc,ctx);
1798 }
1799 
1800 
1803  void* user_data)
1804 {
      // Stores the progress callback and the opaque user pointer that goes
      // with it.
      //
      // NOTE(review): the first signature lines (listing lines 1801-1802) are
      // missing from this extract; the function name and the type of
      // `callback` are not visible here.
1805  m_PrgCallback = callback;
1806  m_PrgInfo.m_UserData = user_data;
1807 }
1808 
1809 
1811 (const CDbtag& xref,
1812  const CSerialObject& obj,
1813  bool biosource,
1814  const CSeq_entry *ctx)
1815 {
      // Validates one db_xref (`xref`) attached to `obj`, reporting problems
      // against `obj` / `ctx`. `biosource` is a caller-supplied flag; its use
      // is on lines not visible here.
      //
      // NOTE(review): this extract is missing many lines of this function
      // (the name line 1810, the declaration of `flags` at 1817, and the
      // opening line of every reporting call and several conditions). The
      // comments below describe only what the surviving lines show.
1816  bool refseq_or_gps = IsRefSeq() || IsGPS();
1818  refseq_or_gps);
1819 
      // An unset database name is treated as the empty string.
1820  const string& db = xref.IsSetDb() ? xref.GetDb() : kEmptyStr;
1821 
1824  "dbxref value " + xref.GetTag().GetStr() + " has SGML",
1825  obj, ctx);
1826  }
1829  "dbxref value " + xref.GetTag().GetStr() + " contains space character",
1830  obj, ctx);
1831  }
1832  if (flags & CValidator::eDbHasSgml) {
1834  "dbxref database " + db + " has SGML",
1835  obj, ctx);
1836  }
1837 
      // `dbv` is the tag rendered as text; `isStr` records that the tag was
      // stored as a string rather than an integer id.
1838  bool isStr = false;
1839  string dbv;
1840  if (xref.IsSetTag() && xref.GetTag().IsStr()) {
1841  dbv = xref.GetTag().GetStr();
1842  isStr = true;
1843  } else if (xref.IsSetTag() && xref.GetTag().IsId()) {
1844  dbv = NStr::NumericToString(xref.GetTag().GetId());
1845  }
1846 
1849  "Illegal db_xref type " + db + " (" + dbv + ")", obj, ctx);
1850  }
1852  // capitalization is bad
1853  bool refseq_db = false, src_db = false;
1854  string correct_caps;
1855  xref.GetDBFlags(refseq_db, src_db, correct_caps);
1856  string message = "Illegal db_xref type " + db + " (" + dbv + "), legal capitalization is " + correct_caps;
1858  message += ", but should not be used on an OrgRef";
1859  } else if (flags & CValidator::eOnlyForSource) {
1860  message += ", but should only be used on an OrgRef";
1861  }
1862 
1864  } else {
      // Remaining cases: the database name is acceptable but used in the
      // wrong place (RefSeq-only, not-for-source, only-for-source). The
      // conditions choosing between the first messages are on missing lines.
1868  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on a non-RefSeq OrgRef",
1869  obj, ctx);
1870  } else {
1872  "db_xref type " + db + " (" + dbv + ") is only legal for RefSeq",
1873  obj, ctx);
1874  }
1875  } else if (flags & CValidator::eNotForSource) {
1878  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1879  obj, ctx);
1880  } else {
1882  "db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
1883  obj, ctx);
1884  }
1885  } else if (flags & CValidator::eOnlyForSource) {
1887  "db_xref type " + db + " (" + dbv + ") should only be used on an OrgRef",
1888  obj, ctx);
1889  }
1890  }
1891 
      // A GeneID whose tag was stored as a string is reported: it must be an
      // integer.
1892  if (isStr && db == "GeneID") {
1894  "db_xref type " + db + " (" + dbv + ") is required to be an integer",
1895  obj, ctx);
1896  }
1897 }
1898 
1899 
1901 (TDbtags& xref_list,
1902  const CSerialObject& obj,
1903  bool biosource,
1904  const CSeq_entry *ctx)
1905 {
      // Validates every db_xref in `xref_list` via the single-xref
      // ValidateDbxref(). For BioSource lists it also reports a database name
      // that repeats (case-insensitively) in two consecutive entries.
      //
      // NOTE(review): the name line (listing line 1900) and the opening of the
      // reporting call (1913) are missing from this extract. Only adjacent
      // duplicates are detected, since `last_db` holds just the previous
      // entry's name -- presumably the list is sorted by db; verify.
1906  string last_db;
1907 
1908  ITERATE( TDbtags, xref, xref_list) {
1909  if (biosource
1910  && (*xref)->IsSetDb()) {
1911  if (!NStr::IsBlank(last_db)
1912  && NStr::EqualNocase((*xref)->GetDb(), last_db)) {
1914  "BioSource uses db " + last_db + " multiple times",
1915  obj, ctx);
1916  }
1917  last_db = (*xref)->GetDb();
1918  }
1919  ValidateDbxref(**xref, obj, biosource, ctx);
1920  }
1921 }
1922 
1923 
1925 (const CPacked_seqint& packed_int,
1926  SLocCheck& lc,
1927  const CSerialObject& obj)
1928 {
      // Walks the intervals of a packed-int location, updating the running
      // location-check state `lc` for each one.
      //
      // NOTE(review): the name line (listing line 1924) and listing line 1933
      // are missing from this extract.
1929  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
1930  lc.int_cur = (*it);
      // `&=` keeps lc.chk false once any interval has failed the check.
1931  lc.chk &= x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur, obj);
1932 
1934 
      // The current interval becomes "previous" for the next iteration.
1935  lc.id_prv = lc.id_cur;
1936  lc.strand_prv = lc.strand_cur;
1937  lc.int_prv = lc.int_cur;
1938  }
1939 }
1940 
1941 
1943 (CConstRef<CSeq_id>& id_cur,
1944  const CSeq_interval * int_cur,
1945  ENa_strand& strand_cur,
1946  const CSerialObject& obj)
1947 {
      // Extracts strand and id from the interval `int_cur` into the output
      // parameters `strand_cur` / `id_cur` and returns IsValid(*int_cur,
      // m_Scope). An unset strand is reported as eNa_strand_unknown.
      // `int_cur` is dereferenced unconditionally, so it must be non-null.
      // `obj` is not used in the visible body.
      //
      // NOTE(review): the name line (listing line 1942) is missing from this
      // extract.
1948  strand_cur = int_cur->IsSetStrand() ?
1949  int_cur->GetStrand() : eNa_strand_unknown;
1950  id_cur = &int_cur->GetId();
1951  bool chk = IsValid(*int_cur, m_Scope);
1952  return chk;
1953 }
1954 
1955 
1957 {
      // Applies the single-interval x_ReportInvalidFuzz() to every interval
      // in `packed_int`, reporting against `obj`.
      //
      // NOTE(review): the signature line (listing line 1956) is missing from
      // this extract.
1958  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
1959  x_ReportInvalidFuzz(**it, obj);
1960  }
1961 }
1962 
1963 
// Message texts for invalid-fuzz problems. The first two are referenced by
// name in the visible x_ReportInvalidFuzz code; the "circle" pair is
// presumably used by the interval overload on lines not visible in this
// extract -- verify.
1964 static const string kSpaceLeftFirst = "Should not specify 'space to left' at first position of non-circular sequence";
1965 static const string kSpaceRightLast = "Should not specify 'space to right' at last position of non-circular sequence";
1966 
1967 static const string kSpaceLeftCircle = "Should not specify 'circle to left' except at first position of circular sequence";
1968 static const string kSpaceRightCircle = "Should not specify 'circle to right' except at last position of circular sequence";
1969 
1971 {
      // Reports invalid limit-type fuzz on a single interval (`interval`),
      // against `obj`.
      //
      // NOTE(review): the signature (listing line 1970), the declarations of
      // `fuzz_from` / `fuzz_to` (1972-1973), the declaration of `top` (2013)
      // and the opening line of every reporting call are missing from this
      // extract.
1974  bool has_fuzz_from = false;
1975  bool has_fuzz_to = false;
1976 
      // Only fuzz of the "lim" kind is considered.
1977  if (interval.IsSetFuzz_from() && interval.GetFuzz_from().IsLim()) {
1978  fuzz_from = interval.GetFuzz_from().GetLim();
1979  has_fuzz_from = true;
1980  }
1981  if (interval.IsSetFuzz_to() && interval.GetFuzz_to().IsLim()) {
1982  fuzz_to = interval.GetFuzz_to().GetLim();
1983  has_fuzz_to = true;
1984  }
1985  if (! has_fuzz_from && ! has_fuzz_to) {
1986  return;
1987  }
1988 
1989  // check for invalid fuzz on both ends of Interval
1990  if (has_fuzz_from && has_fuzz_to && fuzz_from == fuzz_to) {
1991  if (fuzz_from == CInt_fuzz::eLim_tl) {
1994  "Should not specify 'space to left' for both ends of interval", obj);
1995  }
1996  else if (fuzz_from == CInt_fuzz::eLim_tr) {
1999  "Should not specify 'space to right' for both ends of interval", obj);
2000  }
2001  else if (fuzz_from == CInt_fuzz::eLim_circle) {
2004  "Should not specify 'origin of circle' for both ends of interval", obj);
2005  }
2006  }
2007 
      // The remaining checks need the sequence itself (topology and length);
      // they are skipped when the id cannot be resolved in m_Scope.
2008  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(interval.GetId());
2009  if (! bsh) {
2010  return;
2011  }
2012 
2014  if (bsh.IsSetInst_Topology()) {
2015  top = bsh.GetInst_Topology();
2016  }
2017 
2018  if (top != CSeq_inst::eTopology_circular) {
2019 
2020  // VR-15
2021  // look for space to left at beginning of sequence or space to right at end
      // NOTE(review): fuzz_from / fuzz_to are compared here without checking
      // has_fuzz_from / has_fuzz_to; whether they are initialised when unset
      // depends on the missing declarations -- confirm.
2022  if (fuzz_from == CInt_fuzz::eLim_tl && interval.IsSetFrom() && interval.GetFrom() == 0) {
2024  }
2025  if (fuzz_to == CInt_fuzz::eLim_tr && interval.IsSetTo() && interval.GetTo() == bsh.GetBioseqLength() - 1) {
2027  }
2028 
2029  } else if (fuzz_from == CInt_fuzz::eLim_circle || fuzz_to == CInt_fuzz::eLim_circle) {
2030 
      // A feature whose exception text mentions "ribosomal slippage"
      // (case-insensitive) is exempt from the circle-fuzz position checks.
2031  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2032  const CSeq_feat* sfp = dynamic_cast<const CSeq_feat*>(&obj);
2033  if (sfp && sfp->IsSetExcept() && sfp->CanGetExcept_text() && NStr::FindNoCase(sfp->GetExcept_text(), "ribosomal slippage") != NPOS) {
2034  return;
2035  }
2036  }
2037 
2038  // VR-832
      // Circle fuzz is only accepted at position 0 (from) or at the last
      // position (to) of the circular sequence.
2039  if (fuzz_from == CInt_fuzz::eLim_circle && interval.IsSetFrom() && interval.GetFrom() != 0) {
2041  }
2042  if (fuzz_to == CInt_fuzz::eLim_circle && interval.IsSetTo() && interval.GetTo() != bsh.GetBioseqLength() - 1) {
2044  }
2045  }
2046 }
2047 
2048 
2050 {
      // Reports invalid limit-type fuzz on a single point (`point`), against
      // `obj`.
      //
      // NOTE(review): the signature (listing line 2049), the condition at
      // listing line 2061 and the reporting calls (2065, 2068) are missing
      // from this extract.
2051  // VR-15
      // Nothing to check unless the point has tl/tr limit fuzz and both an id
      // and a position.
2052  if (!point.IsSetFuzz() || !point.GetFuzz().IsLim() ||
2053  (point.GetFuzz().GetLim() != CInt_fuzz::eLim_tl && point.GetFuzz().GetLim() != CInt_fuzz::eLim_tr) ||
2054  !point.IsSetId() || !point.IsSetPoint()) {
2055  return;
2056  }
2057  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(point.GetId());
2058  if (!bsh) {
2059  return;
2060  }
2062  return;
2063  }
2064  if (point.GetPoint() == 0 && point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl) {
2066  }
      // NOTE(review): unlike the first-position test above, this test does
      // not look at the fuzz limit, so a point at the last position with
      // eLim_tl also enters this branch -- confirm whether an eLim_tr check
      // is intended.
2067  if (point.GetPoint() == bsh.GetBioseqLength() - 1) {
2069  }
2070 }
2071 
2072 
// Walks the parts of `loc` and forwards each interval, packed-interval or
// point part to the matching x_ReportInvalidFuzz() overload; other part
// kinds are ignored.
//
// NOTE(review): the declaration of `lit` (listing line 2075) and one `case`
// label (2082, presumably the packed-int case given the call below it) are
// missing from this extract.
2073 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj)
2074 {
2076  for (; lit; ++lit) {
2077  CSeq_loc::E_Choice loc_choice = lit->Which();
2078  switch (loc_choice) {
2079  case CSeq_loc::e_Int:
2080  x_ReportInvalidFuzz(lit->GetInt(), obj);
2081  break;
2083  x_ReportInvalidFuzz(lit->GetPacked_int(), obj);
2084  break;
2085  case CSeq_loc::e_Pnt:
2086  x_ReportInvalidFuzz(lit->GetPnt(), obj);
2087  break;
2088  default:
2089  break;
2090  }
2091  }
2092 }
2093 
2094 
// Returns how many elements visited by the iterator `lit` over `loc` report
// IsMix().
//
// NOTE(review): the declaration of `lit` (listing line 2098) is missing from
// this extract, so the iterator type and traversal options are not visible.
2095 unsigned int s_CountMix(const CSeq_loc& loc)
2096 {
2097  unsigned int num_mix = 0;
2099  for (; lit; ++lit) {
2100  if (lit->IsMix()) {
2101  num_mix++;
2102  }
2103  }
2104  return num_mix;
2105 }
2107 
2109 {
      // Resets every field of the location-check state `lc` to its starting
      // value and stores `prefix`, which later becomes the leading text of
      // messages built from lc.prefix.
      //
      // NOTE(review): the signature line (listing line 2108) is missing from
      // this extract; the function name is not visible here.
      // chk starts as true so that later `&=` updates can only clear it.
2110  lc.chk = true;
2111  lc.unmarked_strand = false;
2112  lc.mixed_strand = false;
2113  lc.has_other = false;
2114  lc.has_not_other = false;
2115  lc.id_cur = nullptr;
2116  lc.id_prv = nullptr;
2117  lc.int_cur = nullptr;
2118  lc.int_prv = nullptr;
2119  lc.strand_cur = eNa_strand_unknown;
2120  lc.strand_prv = eNa_strand_unknown;
2121  lc.prefix = prefix;
2122 }
2123 
2125 {
      // Compares the current and previous strand recorded in `lc` and updates
      // the strand flags:
      //  - unmarked_strand: plus next to unknown (either order);
      //  - mixed_strand:    any other strand difference;
      //  - has_other / has_not_other: the current strand is "other", or is
      //    plus/minus.
      // The comparison is made only when neither strand is "other" and both
      // ids are set and refer to the same bioseq per IsSameBioseq().
      //
      // NOTE(review): the signature line (listing line 2124) is missing from
      // this extract; the function name is not visible here.
2126  if (lc.strand_prv != eNa_strand_other &&
2127  lc.strand_cur != eNa_strand_other) {
2128  if (lc.id_cur && lc.id_prv &&
2129  IsSameBioseq(*lc.id_cur, *lc.id_prv, m_Scope)) {
2130  if (lc.strand_prv != lc.strand_cur) {
2131  if ((lc.strand_prv == eNa_strand_plus &&
2132  lc.strand_cur == eNa_strand_unknown) ||
2133  (lc.strand_prv == eNa_strand_unknown &&
2134  lc.strand_cur == eNa_strand_plus)) {
2135  lc.unmarked_strand = true;
2136  } else {
2137  lc.mixed_strand = true;
2138  }
2139  }
2140  }
2141  }
2142  if (lc.strand_cur == eNa_strand_other) {
2143  lc.has_other = true;
2144  } else if (lc.strand_cur == eNa_strand_minus || lc.strand_cur == eNa_strand_plus) {
2145  lc.has_not_other = true;
2146  }
2147 
2148 }
2149 
// Checks one location (recursing into mixes), updating the running state
// `lc` and reporting an out-of-range location against `obj`.
//
// loc      - location (or sub-location) to check.
// obj      - object that problems are reported against.
// lc       - running check state (current/previous id, strand, interval).
// lowerSev - when true, an out-of-range report uses eDiag_Error instead of
//            eDiag_Critical; the e_Int case may turn it back off.
//
// Any std::exception thrown while checking is caught and reported, and the
// current state is reset.
//
// NOTE(review): two `case` labels (listing lines 2182 and 2192, presumably
// the packed-point and packed-int cases given the accessors used below them),
// listing lines 2200 and 2220, and the openings of the reporting calls (2215,
// 2227) are missing from this extract.
2150 void CValidError_imp::x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev)
2151 {
2152  try {
2153  switch (loc.Which()) {
2154  case CSeq_loc::e_Int:
2155  lc.int_cur = &loc.GetInt();
2156  lc.chk = x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur, obj);
2157  if (lc.strand_cur == eNa_strand_other) {
2158  lc.has_other = true;
2159  }
      // The lowered severity is kept only when the interval starts inside the
      // sequence and ends at or beyond its length; any other failure goes
      // back to the default severity.
2160  if ((!lc.chk) && lowerSev) {
2161  TSeqPos length = GetLength(loc.GetInt().GetId(), m_Scope);
2162  TSeqPos fr = loc.GetInt().GetFrom();
2163  TSeqPos to = loc.GetInt().GetTo();
2164  if (fr < length && to >= length) {
2165  // RefSeq variation feature with dbSNP xref and interval flanking the length is ERROR
2166  } else {
2167  // otherwise keep severity at REJECT
2168  lowerSev = false;
2169  }
2170  }
2171  break;
2172  case CSeq_loc::e_Pnt:
2173  lc.strand_cur = loc.GetPnt().IsSetStrand() ?
2174  loc.GetPnt().GetStrand() : eNa_strand_unknown;
2175  if (lc.strand_cur == eNa_strand_other) {
2176  lc.has_other = true;
2177  }
2178  lc.id_cur = &loc.GetPnt().GetId();
2179  lc.chk = IsValid(loc.GetPnt(), m_Scope);
2180  lc.int_prv = nullptr;
2181  break;
2183  lc.strand_cur = loc.GetPacked_pnt().IsSetStrand() ?
2184  loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
2185  if (lc.strand_cur == eNa_strand_other) {
2186  lc.has_other = true;
2187  }
2188  lc.id_cur = &loc.GetPacked_pnt().GetId();
2189  lc.chk = IsValid(loc.GetPacked_pnt(), m_Scope);
2190  lc.int_prv = nullptr;
2191  break;
2193  x_CheckPackedInt(loc.GetPacked_int(), lc, obj);
2194  break;
2195  case CSeq_loc::e_Null:
2196  break;
2197  case CSeq_loc::e_Mix:
      // Each part of a mix is checked recursively with the same state.
2198  for (auto l : loc.GetMix().Get()) {
2199  x_CheckLoc(*l, obj, lc, lowerSev);
2201  }
2202  break;
2203  default:
      // Any other location kind is recorded as strand "other" with no id.
2204  lc.strand_cur = eNa_strand_other;
2205  lc.id_cur = nullptr;
2206  lc.int_prv = nullptr;
2207  break;
2208  }
2209  if (!lc.chk) {
2210  string lbl = GetValidatorLocationLabel (loc, *m_Scope);
2211  EDiagSev sev = eDiag_Critical;
2212  if (lowerSev) {
2213  sev = eDiag_Error;
2214  }
2216  lc.prefix + ": SeqLoc [" + lbl + "] out of range", obj);
2217  }
2218 
      // NULL locations do not advance the "previous" strand/id.
2219  if (loc.Which() != CSeq_loc::e_Null) {
2221 
2222  lc.strand_prv = lc.strand_cur;
2223  lc.id_prv = lc.id_cur;
2224  }
2225  } catch( const exception& e ) {
2226  string label = GetValidatorLocationLabel(loc, *m_Scope);
2228  "Exception caught while validating location " +
2229  label + ". Exception: " + e.what(), obj);
2230 
2231  lc.strand_cur = eNa_strand_other;
2232  lc.id_cur = nullptr;
2233  lc.int_prv = nullptr;
2234  }
2235 }
2236 
2238 (const CSeq_loc& loc,
2239  const CBioseq_Handle& seq,
2240  bool report_abutting,
2241  const string& prefix,
2242  const CSerialObject& obj,
2243  bool lowerSev)
2244 {
      // Validates a whole location `loc` and reports problems against `obj`.
      //
      // loc             - location to validate.
      // seq             - bioseq handle used for topology and representation
      //                   checks; tested for validity before each use.
      // report_abutting - enables the "Adjacent intervals" report.
      // prefix          - leading text of most messages; the values "Location"
      //                   and "Product" are compared against below.
      // obj             - object reported against; when it is a CSeq_feat its
      //                   exception text and subtype influence the checks.
      // lowerSev        - forwarded to x_CheckLoc().
      //
      // NOTE(review): the name line (listing line 2237), listing line 2247
      // (presumably the initialisation of `lc`), several conditions (2263-2265,
      // 2290, 2310, 2332, 2337) and the opening line of every reporting call
      // are missing from this extract.
2245  SLocCheck lc;
2246 
2248 
2249  x_CheckLoc(loc, obj, lc, lowerSev);
2250 
2251  if (lc.has_other && lc.has_not_other) {
2252  string label = GetValidatorLocationLabel(loc, *m_Scope);
2254  prefix + ": Inconsistent use of other strand SeqLoc [" + label + "]", obj);
2255  } else if (lc.has_other && NStr::Equal(prefix, "Location")) {
2258  "Strand 'other' in location", obj);
2259  }
2260 
2261  x_ReportInvalidFuzz(loc, obj);
2262 
2266  "Duplicate exons in location", obj);
2267  }
2268 
2269  if (s_CountMix(loc) > 1) {
2270  string label;
2271  loc.GetLabel(&label);
2273  prefix + ": SeqLoc [" + label + "] has nested SEQLOC_MIX elements",
2274  obj);
2275  }
2276 
2277  // Warn if different parts of a seq-loc refer to the same bioseq using
2278  // different id types (i.e. gi and accession)
2279  ValidateSeqLocIds(loc, obj);
2280 
2281  bool trans_splice = false;
2282  bool circular_rna = false;
2283  bool exception = false;
2284  const CSeq_feat* sfp = nullptr;
      // `sfp` is non-null only when `obj` is exactly a CSeq_feat.
2285  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2286  sfp = dynamic_cast<const CSeq_feat*>(&obj);
2287  }
2288  if (sfp) {
2289  // primer_bind intervals MAY be in on opposite strands
2291  lc.mixed_strand = false;
2292  lc.unmarked_strand = false;
2293  }
2294 
2295  exception = sfp->IsSetExcept() ? sfp->GetExcept() : false;
2296  if (exception && sfp->CanGetExcept_text()) {
2297  if (NStr::FindNoCase(sfp->GetExcept_text(), "trans-splicing") != NPOS) {
2298  // trans splicing exception turns off both mixed_strand and
2299  // out_of_order messages
2300  trans_splice = true;
2301  } else if (NStr::FindNoCase(sfp->GetExcept_text(), "circular RNA") != NPOS) {
2302  // circular RNA exception turns off out_of_order message
2303  circular_rna = true;
2304  }
2305  }
2306  }
2307 
      // loc_lbl is computed lazily and reused by the later reports.
2308  string loc_lbl;
2309  if (report_abutting && (!sfp || !CSeqFeatData::AllowAdjacentIntervals(sfp->GetData().GetSubtype())) &&
2311  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2312 
      // An exception on the feature lowers this report to a warning.
2313  EDiagSev sev = exception ? eDiag_Warning : eDiag_Error;
2315  prefix + ": Adjacent intervals in SeqLoc [" +
2316  loc_lbl + "]", obj);
2317  }
2318 
      // Trans-spliced features (except for the "Product" prefix) skip every
      // strand/order check below; the only check is that the location has
      // more than one interval.
2319  if (trans_splice && !NStr::Equal(prefix, "Product")) {
2320  CSeq_loc_CI li(loc);
2321  ++li;
2322  if (!li) {
2323  PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadTranssplicedInterval, "Trans-spliced feature should have multiple intervals", obj);
2324  }
2325  return;
2326  }
2327 
2328  bool ordered = true;
2329  bool circular = false;
2330  if ( seq &&
2331  seq.IsSetInst() && seq.GetInst().IsSetTopology() &&
2333  circular = true;
2334  }
      // The statement that updates `ordered` is on the missing listing line
      // 2337; it runs only for non-circular sequences and for features whose
      // subtype requires biological interval order.
2335  try {
2336  if (m_Scope && (!sfp || CSeqFeatData::RequireLocationIntervalsInBiologicalOrder(sfp->GetData().GetSubtype())) && !circular) {
2338  }
2339  } catch ( const CException& ex) {
2340  string label;
2341  loc.GetLabel(&label);
2343  "Exception caught while validating location " +
2344  label + ". Exception: " + ex.what(), obj);
2345  }
2346 
2347  if (lc.mixed_strand || lc.unmarked_strand || !ordered) {
2348  if (loc_lbl.empty()) {
2349  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2350  }
2351  if (lc.mixed_strand) {
2352  if (IsSmallGenomeSet()) {
2354  prefix + ": Mixed strands in SeqLoc ["
2355  + loc_lbl + "] in small genome set - set trans-splicing exception if appropriate", obj);
2356  } else {
      // Geneious submissions and pseudo features get a warning rather than
      // an error.
2357  EDiagSev sev = eDiag_Error;
2358  if (IsGeneious() || (sfp && sequence::IsPseudo(*sfp, *m_Scope))) {
2359  sev = eDiag_Warning;
2360  }
2362  prefix + ": Mixed strands in SeqLoc ["
2363  + loc_lbl + "]", obj);
2364  }
2365  } else if (lc.unmarked_strand) {
2367  prefix + ": Mixed plus and unknown strands in SeqLoc ["
2368  + loc_lbl + "]", obj);
2369  }
2370  if (!ordered && !circular_rna) {
2371  if (IsSmallGenomeSet()) {
2373  prefix + ": Intervals out of order in SeqLoc [" +
2374  loc_lbl + "]", obj);
2375  } else {
2377  prefix + ": Intervals out of order in SeqLoc [" +
2378  loc_lbl + "]", obj);
2379  }
2380  }
2381  return;
2382  }
2383 
      // The remaining checks apply only to segmented bioseqs (or when the
      // representation / handle is not available).
2384  if ( seq &&
2385  seq.IsSetInst_Repr() &&
2386  seq.GetInst_Repr() != CSeq_inst::eRepr_seg ) {
2387  return;
2388  }
2389 
2390  // Check for intervals out of order on segmented Bioseq
2391  if ( seq && BadSeqLocSortOrder(seq, loc) && !circular_rna ) {
2392  if (loc_lbl.empty()) {
2393  loc.GetLabel(&loc_lbl);
2394  }
      // NOTE(review): this message concatenates `prefix` directly with
      // "Intervals", without the ": " separator used by every other message
      // in this function -- confirm whether that is intended.
2396  prefix + "Intervals out of order in SeqLoc [" +
2397  loc_lbl + "]", obj);
2398  }
2399 
2400  // Check for mixed strand on segmented Bioseq
2401  if ( IsMixedStrands(loc) ) {
2402  if (loc_lbl.empty()) {
2403  loc.GetLabel(&loc_lbl);
2404  }
2406  prefix + ": Mixed strands in SeqLoc [" +
2407  loc_lbl + "]", obj);
2408  }
2409 }
2410 
2411 
2413 {
      // Remembers `seq` in m_BioseqWithNoSource unless SeqIsPatent(seq) is
      // true. The list is read later when missing-source problems are
      // reported.
      //
      // NOTE(review): the signature line (listing line 2412) is missing from
      // this extract; the function name is not visible here.
2414  if (!SeqIsPatent(seq)) {
2415  m_BioseqWithNoSource.push_back(CConstRef<CBioseq>(&seq));
2416  }
2417 }
2418 
2419 
2421 {
      // Reports a protein with a missing product name against the complete
      // bioseq of `seq`, unless SeqIsPatent(seq) is true.
      //
      // NOTE(review): the signature line (listing line 2420) and the opening
      // of the reporting call (2423) are missing from this extract; the
      // function name is not visible here.
2422  if (!SeqIsPatent (seq)) {
2424  "The product name is missing from this protein.", *(seq.GetCompleteBioseq()));
2425  }
2426 }
2427 
2428 
2430 {
      // Returns true when `seq` has a MolInfo descriptor with tech ==
      // eTech_wgs and the first "other" or gi Seq-id found on it is an
      // "other" id.
      //
      // NOTE(review): the signature line (listing line 2429) is missing from
      // this extract; the function name is not visible here.
2431  bool wgs = false;
2432 
2433  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2434  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2435  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2436  wgs = true;
2437  break;
2438  }
2439  }
2440  if (!wgs) {
2441  return false;
2442  }
2443 
2444  bool is_other = false;
2445  bool has_gi = false;
2446 
      // The scan stops at the first "other" or gi id, so at most one of the
      // two flags is ever set; a gi listed after an "other" id is not seen.
2447  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2448  if ((*it)->IsOther()) {
2449  is_other = true;
2450  break;
2451  } else if ((*it)->IsGi()) {
2452  has_gi = true;
2453  break;
2454  }
2455  }
2456  if (!is_other || has_gi) {
2457  return false;
2458  }
2459 
2460  return true;
2461 }
2462 
2463 
2465 {
      // Returns true when `seq` has a MolInfo descriptor with tech ==
      // eTech_tsa and the first "other" or gi Seq-id found on it is an
      // "other" id. Same structure as the wgs variant above it in the file.
      //
      // NOTE(review): the signature line (listing line 2464) is missing from
      // this extract; the function name is not visible here.
2466  bool tsa = false;
2467 
2468  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2469  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2470  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
2471  tsa = true;
2472  break;
2473  }
2474  }
2475  if (!tsa) {
2476  return false;
2477  }
2478 
2479  bool is_other = false;
2480  bool has_gi = false;
2481 
      // The scan stops at the first "other" or gi id, so at most one of the
      // two flags is ever set; a gi listed after an "other" id is not seen.
2482  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2483  if ((*it)->IsOther()) {
2484  is_other = true;
2485  break;
2486  } else if ((*it)->IsGi()) {
2487  has_gi = true;
2488  break;
2489  }
2490  }
2491  if (!is_other || has_gi) {
2492  return false;
2493  }
2494 
2495  return true;
2496 }
2497 
2498 
2500 {
      // Reports missing source information for the entry `se`, then reports
      // each bioseq collected in m_BioseqWithNoSource.
      //
      // NOTE(review): the signature line (listing line 2499) and the openings
      // of both reporting calls (2507, 2518) are missing from this extract;
      // the function name is not visible here.
      //
      // While pre-processing a huge file nothing is reported for an entry
      // without any BioSource (patent and PDB records excepted, judged from
      // the context flags).
2501  if (GetContext().PreprocessHugeFile) {
2502  if (m_pEntryInfo->IsNoBioSource() && !GetContext().IsPatent && !GetContext().IsPDB) {
2503  return;
2504  }
2505  }
2506  else if (m_pEntryInfo->IsNoBioSource() && !m_pEntryInfo->IsPatent() && !m_pEntryInfo->IsPDB()) {
2508  "No source information included on this record.", se);
2509 
      // Outside huge-file post-processing the record-level report is enough.
2510  if (!GetContext().PostprocessHugeFile) {
2511  return;
2512  }
2513  }
2514 
2515  size_t num_no_source = m_BioseqWithNoSource.size();
2516 
2517  for ( size_t i = 0; i < num_no_source; ++i ) {
2519  "No organism name included in the source. Other qualifiers may exist.",
2520  *(m_BioseqWithNoSource[i]));
2521  }
2522 }
2523 
2524 
2526 {
      // Returns the first feature found by product for the bioseq handle
      // `bsh`, or an empty reference when the handle is invalid or nothing is
      // found.
      //
      // NOTE(review): the signature (listing line 2525), the declaration of
      // `bsh` (2529) and the lines that construct the annotation selector
      // (2535, 2537, 2545) are missing from this extract, so the selected
      // feature type and the function name are not visible here.
2527  CConstRef<CSeq_feat> feat;
2528 
2530 
2531  if ( bsh ) {
2532  if ( IsNT() && m_TSE ) {
2533  // In case of a NT bioseq limit the search to features packaged on the
2534  // NT (we assume features have been pulled from the segments to the NT).
2536  sel.SetByProduct()
2538  CFeat_CI fi(bsh, sel);
2539  if ( fi ) {
2540  // return the first one (should be the one packaged on the
2541  // nuc-prot set).
2542  feat.Reset(&(fi->GetOriginalFeature()));
2543  }
2544  } else {
2546  sel.SetByProduct();
2547  CFeat_CI fi(bsh, sel);
2548  if ( fi ) {
2549  // return the first one (should be the one packaged on the
2550  // nuc-prot set).
2551  feat.Reset(&(fi->GetOriginalFeature()));
2552  }
2553  }
2554  }
2555 
2556  return feat;
2557 }
2558 
2559 
2561 {
      // Thin wrapper: forwards to the handle-based GetmRNAGivenProduct().
      //
      // NOTE(review): the signature (listing line 2560) and the declaration
      // of `bsh` (2562) are missing from this extract.
2563  return GetmRNAGivenProduct(bsh);
2564 }
2565 
2566 
2568 {
      // Returns the first feature found by product for the bioseq handle
      // `bsh`, or an empty reference when the handle is invalid or nothing is
      // found. For NT records with a TSE the search is limited to that TSE.
      //
      // NOTE(review): the signature (listing line 2567) and the selector
      // declarations (2579, 2588) are missing from this extract, so the
      // selected feature type is not visible here.
2569  CConstRef<CSeq_feat> feat;
2570  if ( bsh ) {
2571  // In case of a NT bioseq limit the search to features packaged on the
2572  // NT (we assume features have been pulled from the segments to the NT).
2573  CSeq_entry_Handle limit;
2574  if ( IsNT() && m_TSE ) {
2575  limit = m_Scope->GetSeq_entryHandle(*m_TSE);
2576  }
2577 
2578  if (limit) {
2580  sel.SetByProduct() .SetLimitTSE(limit);
2581  CFeat_CI fi(bsh, sel);
2582  if ( fi ) {
2583  // return the first one (should be the one packaged on the
2584  // nuc-prot set).
2585  feat.Reset(&(fi->GetOriginalFeature()));
2586  }
2587  } else {
2589  sel.SetByProduct();
2590  CFeat_CI fi(bsh, sel);
2591  if ( fi ) {
2592  // return the first one (should be the one packaged on the
2593  // nuc-prot set).
2594  feat.Reset(&(fi->GetOriginalFeature()));
2595  }
2596  }
2597  }
2598 
2599  return feat;
2600 }
2601 
2602 
2604 (const CBioseq& seq,
2605  CBioseq_set::EClass clss)
2606 {
      // Walks up the parent entries of `seq` and returns the nearest one that
      // is a set whose class equals `clss`; returns nullptr when no such
      // ancestor exists (the loop ends with `parent` null).
      //
      // NOTE(review): the name line (listing line 2603) is missing from this
      // extract; the function name is not visible here.
2607  const CSeq_entry* parent = nullptr;
2608  for ( parent = seq.GetParentEntry();
2609  parent;
2610  parent = parent->GetParentEntry() ) {
2611  if ( parent->IsSet() ) {
2612  const CBioseq_set& set = parent->GetSet();
2613  if ( set.IsSetClass() && set.GetClass() == clss ) {
2614  break;
2615  }
2616  }
2617  }
2618  return parent;
2619 }
2620 
2621 
2622 bool CValidError_imp::IsSerialNumberInComment(const string& comment)
2623 {
2624  size_t pos = comment.find('[', 0);
2625  while ( pos != string::npos ) {
2626  ++pos;
2627  bool okay = true;
2628  if ( isdigit((unsigned char) comment[pos]) ) {
2629  // skip if first character after bracket is 0
2630  if (comment[pos] == '0') {
2631  okay = false;
2632  }
2633  while ( isdigit((unsigned char) comment[pos]) ) {
2634  ++pos;
2635  }
2636  if ( comment[pos] == ']' && okay ) {
2637  return true;
2638  }
2639  }
2640 
2641  pos = comment.find('[', pos);
2642  }
2643  return false;
2644 }
2645 
2646 
2648 {
      // Returns false in each of the three situations tested below (an
      // "other" id inside a GPS record, a TSE that is a single bioseq, a
      // stand-alone Seq-annot) and true otherwise.
      //
      // NOTE(review): the signature line (listing line 2647) is missing from
      // this extract; the function name is not visible here. `sid` may be
      // null: it is tested before use.
2649  // okay to have far RefSeq product, but only if genomic product set
2650  if ( sid && sid->IsOther() ) {
2651  if ( IsGPS() ) {
2652  return false;
2653  }
2654  }
2655  // or just a bioseq
2656  if ( GetTSE().IsSeq() ) {
2657  return false;
2658  }
2659 
2660  // or in a standalone Seq-annot
2661  if (IsStandaloneAnnot() ) {
2662  return false;
2663  }
2664  return true;
2665 }
2666 
2667 
2669  vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
2670  vector<string>& published_labels, vector<string>& unpublished_labels)
2671 {
      // Collects publication identifiers and labels from every Pub descriptor
      // on the entry `se` and, recursively, on all entries nested in it. The
      // five vectors are output parameters filled through
      // CCleanup::GetPubdescLabels().
      //
      // NOTE(review): the first signature line (listing line 2668) is missing
      // from this extract; the name is taken from the recursive call below.
2672  FOR_EACH_SEQDESC_ON_SEQENTRY (it, se) {
2673  if ((*it)->IsPub()) {
2674  CCleanup::GetPubdescLabels ((*it)->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2675  }
2676  }
2677 
2678  if (se.IsSet()) {
2679  FOR_EACH_SEQENTRY_ON_SEQSET (it, se.GetSet()) {
2680  s_CollectPubDescriptorLabels (**it, pmids, muids, serials, published_labels, unpublished_labels);
2681  }
2682  }
2683 }
2684 
2685 
2687 {
      // Checks that every citation on a feature in `seh` refers to a
      // publication present in the record (as a Pub descriptor or a Pub
      // feature).
      //
      // NOTE(review): the signature (listing line 2686), the declaration of
      // the `feat` iterator (2697) and the opening line of every reporting
      // call are missing from this extract.
2688  vector<TEntrezId> pmids;
2689  vector<TEntrezId> muids;
2690  vector<int> serials;
2691  vector<string> published_labels;
2692  vector<string> unpublished_labels;
2693 
2694  // collect labels for pubs on record
2695  s_CollectPubDescriptorLabels (*(seh.GetCompleteSeq_entry()), pmids, muids, serials, published_labels, unpublished_labels);
2696 
      // Publications carried by features are added to the same lists.
2698  while (feat) {
2699  CCleanup::GetPubdescLabels (feat->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2700  ++feat;
2701  }
2702 
2703  // now examine citations to determine whether they match a pub on the record
2704  CFeat_CI f (seh);
2705  while (f) {
2706  if (f->IsSetCit() && f->GetCit().IsPub()) {
2707  ITERATE (CPub_set::TPub, cit_it, f->GetCit().GetPub()) {
2708  bool found = false;
2709 
      // Citation by PubMed id: must equal one of the collected pmids.
2710  if ((*cit_it)->IsPmid()) {
2711  vector<TEntrezId>::iterator it = pmids.begin();
2712  while (it != pmids.end() && !found) {
2713  if (*it == (*cit_it)->GetPmid()) {
2714  found = true;
2715  }
2716  ++it;
2717  }
2718  if (!found) {
2720  "Citation on feature refers to uid ["
2721  + NStr::NumericToString((*cit_it)->GetPmid().Get())
2722  + "] not on a publication in the record",
2723  f->GetOriginalFeature());
2724  }
      // Citation by Medline id: must equal one of the collected muids.
2725  } else if ((*cit_it)->IsMuid()) {
2726  vector<TEntrezId>::iterator it = muids.begin();
2727  while (it != muids.end() && !found) {
2728  if (*it == (*cit_it)->GetMuid()) {
2729  found = true;
2730  }
2731  ++it;
2732  }
2733  if (!found) {
2735  "Citation on feature refers to uid ["
2736  + NStr::NumericToString((*cit_it)->GetMuid())
2737  + "] not on a publication in the record",
2738  f->GetOriginalFeature());
2739  }
      // Pub-equiv citations are not checked.
2740  } else if ((*cit_it)->IsEquiv()) {
2741  continue;
2742  } else {
      // Any other citation is matched by label: trailing ">", "|" and " " are
      // trimmed, then the label is compared case-insensitively against a
      // prefix of each collected label.
2743  string label;
2744  (*cit_it)->GetLabel(&label, CPub::eContent, true);
2745 
      // NOTE(review): two characters are removed for a trailing ">" (one for
      // the others); for a one-character label `length() - 2` wraps around
      // and substr() returns the whole string -- confirm this is intended.
2746  if (NStr::EndsWith (label, ">")) {
2747  label = label.substr(0, label.length() - 2);
2748  }
2749  if(NStr::EndsWith (label, "|")) {
2750  label = label.substr(0, label.length() - 1);
2751  }
2752  if (NStr::EndsWith (label, " ")) {
2753  label = label.substr(0, label.length() - 1);
2754  }
2755  size_t len = label.length();
2756  vector<string>::iterator unpub_it = unpublished_labels.begin();
2757  while (unpub_it != unpublished_labels.end() && !found) {
2758  size_t it_len =(*unpub_it).length();
2759  if (NStr::EqualNocase (*unpub_it, 0, it_len > len ? len : it_len, label)) {
2760  found = true;
2761  }
2762  ++unpub_it;
2763  }
2764  vector<string>::iterator pub_it = published_labels.begin();
2765 
      // A match against a published label counts as found, but is itself
      // reported: the citation should use the published uid instead.
2766  while (pub_it != published_labels.end() && !found) {
2767  size_t it_len =(*pub_it).length();
2768  if (NStr::EqualNocase (*pub_it, 0, it_len > len ? len : it_len, label)) {
2770  "Citation on feature needs to be updated to published uid",
2771  f->GetOriginalFeature());
2772  found = true;
2773  }
2774  ++pub_it;
2775  }
2776  if (!found) {
2778  "Citation on feature refers to a publication not in the record",
2779  f->GetOriginalFeature());
2780  }
2781  }
2782  }
2783  }
2784  ++f;
2785  }
2786 }
2787 
2788 
2789 // =============================================================================
2790 // Private
2791 // =============================================================================
2792 
2793 
2794 
// Scans every string produced by the iterator `it` and reports a
// "Non-ASCII character" problem against `obj` for the first offending
// character of each string (the `break` stops the scan of that string only;
// later strings are still examined).
// A character is offending when it is above 127, or below 32 and not one of
// tab, carriage return or line feed.
// NOTE(review): the signature, the declaration of `it`, and the head of the
// reporting call are not visible in this listing - confirm against the
// original file.
2796 {
2798  for( ; it; ++it) {
2799  const string& str = *it;
2800  FOR_EACH_CHAR_IN_STRING(c_it, str) {
2801  const char& ch = *c_it;
     // unsigned copy, used only to print the byte value in the message
2802  unsigned char chu = ch;
     // NOTE(review): `ch` is a plain `char`. Where `char` is signed, bytes
     // >= 0x80 are negative, so `ch > 127` is never true for them and they
     // are caught by the `ch < 32` test instead.
2803  if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2805  "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in item", obj);
2806  break;
2807  }
2808  }
2809  }
2810 }
2811 
2812 
// Scans every string produced by the iterator `it` for script-like markup
// and reports "Script tag found in item" against `obj` at the first hit,
// then returns (at most one report per call).
// NOTE(review): the signature, the declaration of `it`, and the head of the
// reporting call are not visible in this listing - confirm against the
// original file.
2814 {
     // Local matcher built on CTextFsm<int>; the constructor registers the
     // fixed list of tag / URI-scheme prefixes below.
2815  class CScriptTagTextFsm : public CTextFsm<int>
2816  {
2817  public:
2818  CScriptTagTextFsm() {
2819  const char * script_tags[] = {
2820  "<script", "<object", "<applet", "<embed", "<form",
2821  "javascript:", "vbscript:"};
2822  ITERATE_0_IDX(idx, ArraySize(script_tags)) {
2823  AddWord(script_tags[idx], true);
2824  }
2825  Prime();
2826  }
2827 
2828  // Returns true if the given string matches any of the strings
2829  // in the fsm anywhere.
     // Feeds the string one character at a time through the state machine
     // and stops at the first state that reports a match.
2830  bool DoesStrHaveFsmHits(const string &str) {
2831  int state = GetInitialState();
2832  ITERATE(string, str_it, str) {
2833  state = GetNextState(state, *str_it);
2834  if( IsMatchFound(state) ) {
2835  return true;
2836  }
2837  }
2838 
2839  return false;
2840  }
2841  };
     // function-local static: the matcher is constructed once and reused
     // by every later call
2842  static CScriptTagTextFsm s_ScriptTagFsm;
2843 
2844 
2846  for( ; it; ++it) {
2847  if (s_ScriptTagFsm.DoesStrHaveFsmHits(*it)) {
2849  "Script tag found in item", obj);
     // one report is enough; skip the remaining strings
2850  return;
2851  }
2852 }
2853 }
2854 
2855 
2856 bool CValidError_imp::IsMixedStrands(const CSeq_loc& loc)
2857 {
2858  if ( SeqLocCheck(loc, m_Scope) == eSeqLocCheck_warning ) {
2859  return false;
2860  }
2861 
2862  CSeq_loc_CI curr(loc);
2863  if ( !curr ) {
2864  return false;
2865  }
2866  CSeq_loc_CI prev = curr;
2867  ++curr;
2868 
2869  while ( curr ) {
2870  ENa_strand curr_strand = curr.GetStrand();
2871  ENa_strand prev_strand = prev.GetStrand();
2872 
2873  if ( (prev_strand == eNa_strand_minus &&
2874  curr_strand != eNa_strand_minus) ||
2875  (prev_strand != eNa_strand_minus &&
2876  curr_strand == eNa_strand_minus) ) {
2877  return true;
2878  }
2879 
2880  prev = curr;
2881  ++curr;
2882  }
2883 
2884  return false;
2885 }
2886 
2887 
2888 static bool s_SeqLocHasGI (const CSeq_loc& loc)
2889 {
2890  bool rval = false;
2891 
2892  for ( CSeq_loc_CI it(loc); it && !rval; ++it ) {
2893  if (it.GetSeq_id().IsGi()) {
2894  rval = true;
2895  }
2896  }
2897  return rval;
2898 }
2899 
2900 
2902 {
2903  m_TSEH = seh;
2905  m_GeneCache.Clear();
2906 }
2907 
2908 
2910 {
2912  return true;
2913  } else {
2914  return false;
2915  }
2916 }
2917 
2918 
// Recursively counts entries below `se`:
//   - a Seq entry counts as 1;
//   - an entry that is neither a Seq nor a Set counts as 0;
//   - a Set whose class passes the test below counts as 1 without descending;
//   - any other Set contributes the sum of the counts of its members.
// NOTE(review): the signature and the class test inside the IsSetClass()
// branch are not visible in this listing - confirm which set classes are
// counted as a single unit.
2920 {
2921  if (se.IsSeq()) {
2922  return 1;
2923  } else if (!se.IsSet()) {
2924  return 0;
2925  }
2926  if (se.GetSet().IsSetClass()) {
2929  return 1;
2930  }
2931  }
     // set without a qualifying class: add up the members recursively
2932  size_t count = 0;
2933  if (se.GetSet().IsSetSeq_set()) {
2934  for (auto it = se.GetSet().GetSeq_set().begin(); it != se.GetSet().GetSeq_set().end(); it++) {
2935  count += s_CountTopSetSiblings(**it);
2936  }
2937  }
2938  return count;
2939 }
2940 
2941 
2943 {
2944  // "Save" the Seq-entry
2945  SetTSE(seh);
2946 
2949 
2950  // If no Pubs/BioSource in CSeq_entry, post only one error
2951  if (GetContext().PreprocessHugeFile) {
2952  x_SetEntryInfo().SetNoPubs(GetContext().NoPubsFound);
2953  x_SetEntryInfo().SetNoCitSubPubs(GetContext().NoCitSubsFound);
2954  x_SetEntryInfo().SetNoBioSource(GetContext().NoBioSource);
2955  } else {
2957  x_SetEntryInfo().SetNoPubs(!pub);
2958  while (pub && !pub->IsSub()) {
2959  ++pub;
2960  }
2964  }
2965 
2966 
2967  // Look for genomic product set
2969  if (si->IsSetClass ()) {
2970  if (si->GetClass () == CBioseq_set::eClass_gen_prod_set) {
2971  x_SetEntryInfo().SetGPS();
2972  }
2973  if (si->GetClass () == CBioseq_set::eClass_small_genome_set) {
2975  }
2976  }
2977  }
2978 
2979  // Examine all Seq-ids on Bioseqs
2980  for (CTypeConstIterator <CBioseq> bi (*m_TSE); bi; ++bi) {
2981  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, *bi) {
2982  const CSeq_id& sid = **sid_itr;
2983  const CTextseq_id* tsid = sid.GetTextseq_Id();
2984  CSeq_id::E_Choice typ = sid.Which();
2985  switch (typ) {
2986  case CSeq_id::e_not_set:
2987  break;
2988  case CSeq_id::e_Local:
2989  break;
2990  case CSeq_id::e_Gibbsq:
2991  break;
2992  case CSeq_id::e_Gibbmt:
2993  break;
2994  case CSeq_id::e_Giim:
2995  break;
2996  case CSeq_id::e_Genbank:
2999  x_SetEntryInfo().SetGED();
3000  break;
3001  case CSeq_id::e_Embl:
3003  x_SetEntryInfo().SetGED();
3004  x_SetEntryInfo().SetEmbl();
3005  break;
3006  case CSeq_id::e_Pir:
3007  break;
3008  case CSeq_id::e_Swissprot:
3009  break;
3010  case CSeq_id::e_Patent:
3012  break;
3013  case CSeq_id::e_Other:
3015  // and do RefSeq subclasses up front as well
3016  if (sid.GetOther().IsSetAccession()) {
3017  string acc = sid.GetOther().GetAccession().substr(0, 3);
3018  if (acc == "NC_") {
3019  m_IsNC = true;
3020  } else if (acc == "NG_") {
3021  m_IsNG = true;
3022  } else if (acc == "NM_") {
3023  m_IsNM = true;
3024  } else if (acc == "NP_") {
3025  m_IsNP = true;
3026  } else if (acc == "NR_") {
3027  m_IsNR = true;
3028  } else if (acc == "NZ_") {
3029  m_IsNZ = true;
3030  } else if (acc == "NS_") {
3031  m_IsNS = true;
3032  } else if (acc == "NT_") {
3033  m_IsNT = true;
3034  } else if (acc == "NW_") {
3035  m_IsNW = true;
3036  } else if (acc == "WP_") {
3037  m_IsWP = true;
3038  } else if (acc == "XR_") {
3039  m_IsXR = true;
3040  }
3041  }
3042  break;
3043  case CSeq_id::e_General:
3044  if ((*bi).IsAa() && !sid.GetGeneral().IsSkippable()) {
3046  }
3047  break;
3048  case CSeq_id::e_Gi:
3049  x_SetEntryInfo().SetGI();
3051  break;
3052  case CSeq_id::e_Ddbj:
3054  x_SetEntryInfo().SetGED();
3055  x_SetEntryInfo().SetDdbj();
3056  break;
3057  case CSeq_id::e_Prf:
3058  break;
3059  case CSeq_id::e_Pdb:
3060  x_SetEntryInfo().SetPDB();
3061  break;
3062  case CSeq_id::e_Tpg:
3064  break;
3065  case CSeq_id::e_Tpe:
3066  x_SetEntryInfo().SetTPE();
3068  break;
3069  case CSeq_id::e_Tpd:
3071  break;
3072  case CSeq_id::e_Gpipe:
3074  break;
3075  default:
3076  break;
3077  }
3078  if ( tsid && tsid->IsSetAccession() && tsid->IsSetVersion() && tsid->GetVersion() >= 1 ) {
3080  }
3081  if (typ != CSeq_id::e_Local && typ != CSeq_id::e_General) {
3083  }
3084  }
3085  }
3086 
3087  // search all source descriptors for genomic source
3088  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_Source);
3089  desc_ci && !m_pEntryInfo->IsGenomic();
3090  ++desc_ci) {
3091  if (desc_ci->GetSource().IsSetGenome()
3092  && desc_ci->GetSource().GetGenome() == CBioSource::eGenome_genomic) {
3094  }
3095  }
3096 
3097  // search genome build and annotation pipeline user object descriptors
3098  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_User);
3099  desc_ci && !m_pEntryInfo->IsGpipe();
3100  ++desc_ci) {
3101  if ( desc_ci->GetUser().IsSetType() ) {
3102  const CUser_object& obj = desc_ci->GetUser();
3103  const CObject_id& oi = obj.GetType();
3104  if ( ! oi.IsStr() ) continue;
3105  if ( NStr::CompareNocase(oi.GetStr(), "GenomeBuild") == 0 ) {
3107  } else if ( NStr::CompareNocase(oi.GetStr(), "StructuredComment") == 0 ) {
3108  ITERATE (CUser_object::TData, field, obj.GetData()) {
3109  if ((*field)->IsSetLabel() && (*field)->GetLabel().IsStr()) {
3110  if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "Annotation Pipeline")) {
3111  if (NStr::EqualNocase((*field)->GetData().GetStr(), "NCBI eukaryotic genome annotation pipeline")) {
3113  }
3114  }
3115  }
3116  }
3117  }
3118  }
3119  }
3120 
3121  // examine features for location gi, product gi, and locus tag
3122  for (CFeat_CI feat_ci (seh);
3124  ++feat_ci) {
3125  if (s_SeqLocHasGI(feat_ci->GetLocation())) {
3127  }
3128  if (feat_ci->IsSetProduct() && s_SeqLocHasGI(feat_ci->GetProduct())) {
3130  }
3131  if (feat_ci->IsSetData() && feat_ci->GetData().IsGene()
3132  && feat_ci->GetData().GetGene().IsSetLocus_tag()
3133  && !NStr::IsBlank (feat_ci->GetData().GetGene().GetLocus_tag())) {
3135  }
3136  }
3137 
3138  if ( m_PrgCallback ) {
3139  m_NumAlign = 0;
3140  for (CTypeConstIterator<CSeq_align> i(*m_TSE); i; ++i) {
3141  m_NumAlign++;
3142  }
3143  m_NumAnnot = 0;
3144  for (CTypeConstIterator<CSeq_annot> i(*m_TSE); i; ++i) {
3145  m_NumAnnot++;
3146  }
3147  m_NumBioseq = 0;
3148  for (CTypeConstIterator<CBioseq> i(*m_TSE); i; ++i) {
3149  m_NumBioseq++;
3150  }
3151  m_NumBioseq_set = 0;
3152  for (CTypeConstIterator<CBioseq_set> i(*m_TSE); i; ++i) {
3153  m_NumBioseq_set++;
3154  }
3155  m_NumDesc = 0;
3156  for (CTypeConstIterator<CSeqdesc> i(*m_TSE); i; ++i) {
3157  m_NumDesc++;
3158  }
3159  m_NumDescr = 0;
3160  for (CTypeConstIterator<CSeq_descr> i(*m_TSE); i; ++i) {
3161  m_NumDescr++;
3162  }
3163  m_NumFeat = 0;
3164  for (CTypeConstIterator<CSeq_feat> i(*m_TSE); i; ++i) {
3165  m_NumFeat++;
3166  }
3167  m_NumGraph = 0;
3168  for (CTypeConstIterator<CSeq_graph> i(*m_TSE); i; ++i) {
3169  m_NumGraph++;
3170  }
3173  m_NumGraph;
3174  }
3175 
3176  if (CNcbiApplication::Instance()->GetProgramDisplayName() == "table2asn") {
3177  m_IsTbl2Asn = true;
3178  }
3179 }
3180 
3181 
3183 {
3184  m_Scope.Reset(new CScope(*m_ObjMgr));
3185  m_Scope->AddTopLevelSeqEntry(*const_cast<CSeq_entry*>(&se));
3186  m_Scope->AddDefaults();
3187 }
3188 
3189 
3191 {
3192  m_IsStandaloneAnnot = true;
3193  if (! m_Scope) {
3194  m_Scope.Reset(& sah.GetScope());
3195  }
3197  m_TSE.Reset(new CSeq_entry); // set a dummy Seq-entry
3199 }
3200 
3201 
3203 {
3204  m_Scope.Reset(new CScope(*m_ObjMgr));
3205  CRef<CSeq_entry> tmp_entry(new CSeq_entry());
3206  tmp_entry->SetSeq().Assign(seq);
3207  m_TSE.Reset(tmp_entry);
3209  Setup(m_TSEH);
3210  return m_TSEH;
3211 }
3212 
3213 
3215 (const CSeq_loc& loc,
3216  const CSerialObject& obj)
3217 {
3218  for ( CSeq_loc_CI lit(loc); lit; ++lit ) {
3219  const CSeq_id& id1 = lit.GetSeq_id();
3220  CSeq_loc_CI lit2 = lit;
3221  for ( ++lit2; lit2; ++lit2 ) {
3222  const CSeq_id& id2 = lit2.GetSeq_id();
3223  if ( IsSameBioseq(id1, id2, m_Scope) && !id1.Match(id2) ) {
3226  "Two ids refer to the same bioseq but are of "
3227  "different type", obj);
3228  }
3229  }
3230  if (IsTemporary(id1)) {
3232  "Feature locations should not use Seq-ids that will be stripped during ID load", obj);
3233  }
3234  }
3237  "Feature location intervals should all be on the same sequence", obj);
3238  }
3239 }
3240 
3241 
3243 {
3244  return validator::IsInOrganelleSmallGenomeSet(id, scope);
3245 }
3246 
3247 
3248 // All ids in a location should point to the same sequence, unless the sequences are
3249 // in an organelle small genome set.
     // This member only forwards to the free function
     // validator::BadMultipleSequenceLocation() with the same arguments and
     // returns its result unchanged.
3250 bool CValidError_imp::BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope)
3251 {
3252  return validator::BadMultipleSequenceLocation(loc, scope);
3253 }
3254 
3255 
3256 bool CValidError_imp::x_IsFarFetchFailure (const CSeq_loc& loc)
3257 {
3259  && IsFarLocation(loc, GetTSEH())) {
3260  return true;
3261  } else {
3262  return false;
3263  }
3264 }
3265 
3266 
3267 //LCOV_EXCL_START
3268 // not used by asnvalidate, used by external programs
3270 {
3271  bool rval = false;
3272  Setup(se);
3273  CValidError_bioseq bioseq_validator(*this);
3275  while (bi) {
3276  rval |= bioseq_validator.GetTSANStretchErrors(*(bi->GetCompleteBioseq()));
3277  ++bi;
3278  }
3279  return rval;
3280 }
3281 
3282 
3284 {
3285  CSeq_entry_Handle seh = Setup(seq);
3286  CValidError_bioseq bioseq_validator(*this);
3287  return bioseq_validator.GetTSANStretchErrors(*(seh.GetSeq().GetCompleteBioseq()));
3288 }
3289 
3290 
3292 {
3293  bool rval = false;
3294  Setup(se);
3295  CValidError_feat feat_validator(*this);
3296  CFeat_CI fi(se);
3297  while (fi) {
3298  CBioseq_Handle bsh = se.GetScope().GetBioseqHandle(fi->GetLocation());
3299  if (bsh) {
3300  rval |= feat_validator.GetTSACDSOnMinusStrandErrors(*(fi->GetSeq_feat()), *(bsh.GetCompleteBioseq()));
3301  }
3302  ++fi;
3303  }
3304 
3305  return rval;
3306 }
3307 
3308 
3310 {
3311  CSeq_entry_Handle seh = Setup(seq);
3312  CValidError_feat feat_validator(*this);
3313  return feat_validator.GetTSACDSOnMinusStrandErrors(f, *(seh.GetSeq().GetCompleteBioseq()));
3314 }
3315 
3316 
3318 {
3319  bool rval = false;
3320  Setup(se);
3321  CValidError_bioseq bioseq_validator(*this);
3323  while (bi) {
3324  rval |= bioseq_validator.GetTSAConflictingBiomolTechErrors(*(bi->GetCompleteBioseq()));
3325  ++bi;
3326  }
3327  return rval;
3328 }
3329 
3330 
3332 {
3333  CSeq_entry_Handle seh = Setup(seq);
3334  CValidError_bioseq bioseq_validator(*this);
3335  return bioseq_validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
3336 }
3337 //LCOV_EXCL_STOP
3338 
// Display names of the individual barcode tests. ADD_BARCODE_ERR pastes a
// test name onto the `k` prefix to pick one of these, both as the text of
// the posted warning and as an item of the comma-separated failure summary.
static const string kTooShort("Too Short");
static const string kMissingPrimers("Missing Primers");
static const string kMissingCountry("Missing Country");
static const string kMissingVoucher("Missing Voucher");
static const string kBadCollectionDate("Bad Collection Date");
static const string kTooManyNs("Too Many Ns");
static const string kMissingOrderAssignment("Missing Order Assignment");
static const string kLowTrace("Low Trace");
static const string kFrameShift("Frame Shift");
static const string kStructuredVoucher("Structured Voucher");
3349 
// Records one failed barcode test.
//
// Expects the following names in the expansion scope:
//   PostErr  - error-posting function/member
//   sq       - object the warning is posted against
//   msg      - std::string accumulating the comma-separated failure summary
// and, for TestName == Foo, the identifiers eErr_GENERIC_BarcodeFoo and kFoo.
//
// The expansion is a single compound statement, so the macro is guarded as a
// whole when it is the body of an unbraced `if`/`else`/loop. Call sites
// invoke it without a trailing semicolon; that form remains valid.
#define ADD_BARCODE_ERR(TestName) \
    { \
        PostErr(eDiag_Warning, eErr_GENERIC_Barcode##TestName, k##TestName, sq); \
        if (!msg.empty()) { \
            msg += ","; \
        } \
        msg += k##TestName; \
    }
3356 
3358 {
3359  TBarcodeResults results = GetBarcodeValues(seh);
3360  for (auto r : results) {
3361  const CBioseq& sq = *(r.bsh.GetCompleteBioseq());
3362  if (BarcodeTestFails(r)){
3363  string msg;
3364  if (r.length) {
3365  ADD_BARCODE_ERR(TooShort)
3366  }
3367  if (r.primers) {
3368  ADD_BARCODE_ERR(MissingPrimers)
3369  }
3370  if (r.country) {
3371  ADD_BARCODE_ERR(MissingCountry)
3372  }
3373  if (r.voucher) {
3374  ADD_BARCODE_ERR(MissingVoucher)
3375  }
3376  if (!r.percent_n.empty()) {
3378  if (!msg.empty()) {
3379  msg += ",";
3380  }
3381  msg += kTooManyNs + ":" + r.percent_n;
3382  }
3383  if (r.collection_date) {
3384  ADD_BARCODE_ERR(BadCollectionDate)
3385  }
3386  if (r.order_assignment) {
3387  ADD_BARCODE_ERR(MissingOrderAssignment)
3388  }
3389  if (r.low_trace) {
3390  ADD_BARCODE_ERR(LowTrace)
3391  }
3392  if (r.frame_shift) {
3393  ADD_BARCODE_ERR(FrameShift)
3394  }
3395  if (!r.structured_voucher) {
3396  ADD_BARCODE_ERR(StructuredVoucher)
3397  }
3398  PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestFails, "FAIL (" + msg + ")", sq);
3399  } else {
3401  }
3402  }
3403 }
3404 
3405 
// Read-only flag accessors.
// The accessors that call GetEntryInfo() forward to the flag of the same
// name on the entry-info object.
3409 bool CValidError_imp::IsGPS() const { return GetEntryInfo().IsGPS(); }
3410 bool CValidError_imp::IsGED() const { return GetEntryInfo().IsGED(); }
3411 bool CValidError_imp::IsPDB() const { return GetEntryInfo().IsPDB(); }
3414 bool CValidError_imp::IsEmbl() const { return GetEntryInfo().IsEmbl(); }
3415 bool CValidError_imp::IsDdbj() const { return GetEntryInfo().IsDdbj(); }
3416 bool CValidError_imp::IsTPE() const { return GetEntryInfo().IsTPE(); }
     // The accessors below return member flags directly. The Seq-id scan
     // earlier in this file sets each one when an "other" Seq-id accession
     // starts with the matching three-character prefix ("NC_", "NG_", ...).
3417 bool CValidError_imp::IsNC() const { return m_IsNC; }
3418 bool CValidError_imp::IsNG() const { return m_IsNG; }
3419 bool CValidError_imp::IsNM() const { return m_IsNM; }
3420 bool CValidError_imp::IsNP() const { return m_IsNP; }
3421 bool CValidError_imp::IsNR() const { return m_IsNR; }
3422 bool CValidError_imp::IsNS() const { return m_IsNS; }
3423 bool CValidError_imp::IsNT() const { return m_IsNT; }
3424 bool CValidError_imp::IsNW() const { return m_IsNW; }
3425 bool CValidError_imp::IsNZ() const { return m_IsNZ; }
3426 bool CValidError_imp::IsWP() const { return m_IsWP; }
3427 bool CValidError_imp::IsXR() const { return m_IsXR; }
3428 bool CValidError_imp::IsGI() const { return GetEntryInfo().IsGI(); }
3430 bool CValidError_imp::IsGpipe() const { return GetEntryInfo().IsGpipe(); }
3443 
3444 
3445 
3446 // =============================================================================
3447 // CValidError_base Implementation
3448 // =============================================================================
3449 
3450 
3452  m_Imp(imp), m_Scope(imp.GetScope())
3453 {
3454 }
3455 
3456 
3458 {
3459 }
3460 
3461 
3463 (EDiagSev sv,
3464  EErrType et,
3465  const string& msg,
3466  const CSerialObject& obj)
3467 {
3468  m_Imp.PostErr(sv, et, msg, obj);
3469 }
3470 
3471 
3472 //void CValidError_base::PostErr
3473 //(EDiagSev sv,
3474 // EErrType et,
3475 // const string& msg,
3476 // TDesc ds)
3477 //{
3478 // m_Imp.PostErr(sv, et, msg, ds);
3479 //}
3480 
3481 
3483 (EDiagSev sv,
3484  EErrType et,
3485  const string& msg,
3486  const CSeq_feat& ft)
3487 {
3488  m_Imp.PostErr(sv, et, msg, ft);
3489 }
3490 
3491 
3493 (EDiagSev sv,
3494  EErrType et,
3495  const string& msg,
3496  const CBioseq& sq)
3497 {
3498  m_Imp.PostErr(sv, et, msg, sq);
3499 }
3500 
3501 
3503 (EDiagSev sv,
3504  EErrType et,
3505  const string& msg,
3506  const CSeq_entry& ctx,
3507  const CSeqdesc& ds)
3508 {
3509  m_Imp.PostErr(sv, et, msg, ctx, ds);
3510 }
3511 
3512 
3514 (EDiagSev sv,
3515  EErrType et,
3516  const string& msg,
3517  const CBioseq_set& set)
3518 {
3519  m_Imp.PostErr(sv, et, msg, set);
3520 }
3521 
3522 
3524 (EDiagSev sv,
3525  EErrType et,
3526  const string& msg,
3527  const CSeq_annot& annot)
3528 {
3529  m_Imp.PostErr(sv, et, msg, annot);
3530 }
3531 
3533 (EDiagSev sv,
3534  EErrType et,
3535  const string& msg,
3536  const CSeq_graph& graph)
3537 {
3538  m_Imp.PostErr(sv, et, msg, graph);
3539 }
3540 
3541 
3543 (EDiagSev sv,
3544  EErrType et,
3545  const string& msg,
3546  const CBioseq& sq,
3547  const CSeq_graph& graph)
3548 {
3549  m_Imp.PostErr(sv, et, msg, sq, graph);
3550 }
3551 
3552 
3554 (EDiagSev sv,
3555  EErrType et,
3556  const string& msg,
3557  const CSeq_align& align)
3558 {
3559  m_Imp.PostErr(sv, et, msg, align);
3560 }
3561 
3562 
3564 (EDiagSev sv,
3565  EErrType et,
3566  const string& msg,
3567  const CSeq_entry& entry)
3568 {
3569  m_Imp.PostErr(sv, et, msg, entry);
3570 }
3571 
3572 CCacheImpl&
3574 {
3575  return m_Imp.GetCache();
3576 }
3577 
3578 
// Walks up the parent chain of `seh` and returns true as soon as an
// enclosing set holds more than 10 entries. Returns false when an entry
// with no parent, or with a parent that is not a set, is reached, or when
// `pset` is null.
// NOTE(review): the signature and the declaration of `pset` are not visible
// in this listing - presumably `pset` is obtained from `parent`; confirm.
3580 {
3581  CSeq_entry_Handle parent = seh.GetParentEntry();
3582  if (!parent || !parent.IsSet()) {
3583  return false;
3584  }
3586  if (!pset) {
3587  return false;
3588  }
3589  if (pset->IsSetSeq_set() && pset->GetSeq_set().size() > 10) {
3590  return true;
3591  } else {
     // this level is small enough; repeat the test one level further up
3592  return s_HasTopSetSiblings(parent);
3593  }
3594 }
3595 
3596 
3598 {
3599  CSeq_entry_Handle appropriate_parent;
3600 
3601  CSeq_entry_Handle np;
3602  CSeq_entry_Handle gps;
3603  if (seh.IsSet() && seh.GetSet().IsSetClass()) {
3604  if (seh.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3605  np = seh;
3606  } else if (s_IsGoodTopSetClass(seh.GetSet().GetClass())) {
3607  gps = seh;
3608  }
3609  } else if (seh.IsSeq()) {
3611  if (p && p.IsSet() && p.GetSet().IsSetClass()) {
3613  np = p;
3614  } else if (s_IsGoodTopSetClass(p.GetSet().GetClass())) {
3615  gps = p;
3616  }
3617  }
3618  }
3619  if (gps) {
3620  appropriate_parent = gps;
3621  } else if (np) {
3623  if (gp && gp.IsSet() && gp.GetSet().IsSetClass() &&
3625  appropriate_parent = gp;
3626  } else {
3627  appropriate_parent = np;
3628  }
3629  } else {
3630  appropriate_parent = seh;
3631  }
3632  return appropriate_parent;
3633 }
3634 
3635 
3638  CConstRef<CPubdesc> pub)
3639 {
3640  // first, try to receive from cache
3642  m_pubdescCache.find(pub);
3643  if( find_iter != m_pubdescCache.end() ) {
3644  return *find_iter->second;
3645  }
3646 
3647  CRef<CPubdescInfo> pInfo(new CPubdescInfo);
3649  *pub, pInfo->m_pmids, pInfo->m_muids,
3650  pInfo->m_serials, pInfo->m_published_labels,
3651  pInfo->m_unpublished_labels);
3652  m_pubdescCache[pub] = pInfo;
3653  return *pInfo;
3654 }
3655 
// Ordering for feature-cache keys: compares feat_type first, then
// feat_subtype, then bioseq_h, returning the result of the first field
// that differs (or of the bioseq_h comparison when the first two are equal).
// NOTE(review): the line naming this member is not visible in this listing;
// the returned `<` comparisons suggest a less-than operator - confirm.
3656 bool
3658  const SFeatKey & rhs) const
3659 {
3660  if( feat_type != rhs.feat_type ) {
3661  return feat_type < rhs.feat_type;
3662  } else if( feat_subtype != rhs.feat_subtype ) {
3663  return feat_subtype < rhs.feat_subtype;
3664  } else {
3665  return bioseq_h < rhs.bioseq_h;
3666  }
3667 }
3668 
// Two feature-cache keys are equal when feat_type, feat_subtype and
// bioseq_h are all equal.
// NOTE(review): the line naming this member is not visible in this listing;
// the field-wise `==` suggests an equality operator - confirm.
3669 bool
3671  const SFeatKey & rhs) const
3672 {
3673  return (feat_type == rhs.feat_type) &&
3674  (feat_subtype == rhs.feat_subtype) && (bioseq_h == rhs.bioseq_h);
3675 }
3676 
// Returns the cached list of features matching `featKey` (feature type,
// feature subtype and Bioseq handle).
//
// On the first request for a Bioseq, every feature found by CFeat_CI on
// that Bioseq is loaded into m_featCache under four keys (exact type and
// subtype, any type, any subtype, any type and any subtype), and a marker
// entry is stored so the Bioseq is not iterated again.
//
// The returned reference points either into m_featCache or at a
// function-local static empty value.
// NOTE(review): the line naming this member and the arguments of
// `bioseq_check_key` are not visible in this listing.
3677 const CCacheImpl::TFeatValue &
3679  const CCacheImpl::SFeatKey & featKey)
3680 {
3681  // check common case where already in the cache
3682  TFeatCache::iterator find_iter = m_featCache.find(featKey);
3683  if( find_iter != m_featCache.end() ) {
3684  return find_iter->second;
3685  }
3686 
3687  // check if bioseq already processed, but had no entry requested above
3688  SFeatKey bioseq_check_key(
3690  TFeatCache::const_iterator bioseq_find_iter =
3691  m_featCache.find(bioseq_check_key);
3692  if( bioseq_find_iter != m_featCache.end() ) {
3693  const static TFeatValue kEmptyFeatValue;
3694  // bioseq was already processed,
3695  // it just happened to not have an entry here
3696  return kEmptyFeatValue;
3697  }
3698 
3699  // bioseq never added to cache, so calculate that now
3700 
3701  // to avoid expensive constructions of CFeat_CI's,
3702  // we iterate through all the features on
3703  // the bioseq and load them into the cache.
3704  CFeat_CI feat_ci(featKey.bioseq_h);
3705  for( ; feat_ci; ++feat_ci ) {
3706  SFeatKey inner_feat_key(
3707  feat_ci->GetFeatType(), feat_ci->GetFeatSubtype(), featKey.bioseq_h);
3708 
3709  m_featCache[inner_feat_key].push_back(*feat_ci);
3710 
3711  // also add "don't care" entries for partial searches
3712  // (e.g. if caller just wants to search on type but not on
3713  // subtype they can set subtype to kAnyFeatSubtype)
3714  SFeatKey any_type_key = inner_feat_key;
3715  any_type_key.feat_type = kAnyFeatType;
3716  m_featCache[any_type_key].push_back(*feat_ci);
3717 
3718  SFeatKey any_subtype_key = inner_feat_key;
3719  any_subtype_key.feat_subtype = kAnyFeatSubtype;
3720  m_featCache[any_subtype_key].push_back(*feat_ci);
3721 
3722  // for when the caller wants all feats on a bioseq
3723  SFeatKey any_type_or_subtype_key = inner_feat_key;
3724  any_type_or_subtype_key.feat_type = kAnyFeatType;
3725  any_type_or_subtype_key.feat_subtype = kAnyFeatSubtype;
3726  m_featCache[any_type_or_subtype_key].push_back(*feat_ci);
3727  }
3728 
3729  // in case a bioseq has no features, we add a dummy key just to
3730  // remember that so we don't use CFeat_CI again on the same bioseq
3731  m_featCache[bioseq_check_key]; // gets default val
3732 
     // NOTE(review): operator[] is used here, so if the load above produced
     // no entry for featKey an empty one is presumably inserted and
     // returned - confirm against the TFeatCache container type.
3733  return m_featCache[featKey];
3734 }
3735 
3738  const vector<SFeatKey> &featKeys)
3739 {
     // Returns a newly allocated list holding the union of the features
     // cached for each key in `featKeys`, without duplicates, in the order
     // in which they appear in the "any type / any subtype" list of the
     // Bioseq.
     // All keys must refer to the same Bioseq handle; otherwise a
     // runtime_error is thrown. An empty key list yields an empty result.
     // NOTE(review): the return type and the line naming this member are
     // not visible in this listing; the name is taken from the error text
     // below.
3740  if( featKeys.empty() ) {
3741  return new TFeatValue;
3742  }
3743 
3744  // all featKeys must have the same bioseq
3745  const CBioseq_Handle & bioseq_h = featKeys[0].bioseq_h;
3746  ITERATE(vector<SFeatKey>, feat_it, featKeys) {
3747  if( feat_it->bioseq_h != bioseq_h ) {
3748  throw runtime_error("GetFeatFromCacheMulti must be called with only 1 bioseq in its args");
3749  }
3750  }
3751 
3752  // set prevents dups
3753  set<TFeatValue::value_type> set_of_feats;
3754 
3755  // combine the answers from every key into the set
3756  ITERATE(vector<SFeatKey>, key_it, featKeys ) {
3757  const TFeatValue & feat_value = GetFeatFromCache(*key_it);
3758  copy(BEGIN_COMMA_END(feat_value), inserter(
3759  set_of_feats, set_of_feats.begin()));
3760  }
3761 
3762  // go through every feature on the bioseq and remember any that match what's in the set
3763  // (The purpose of this step is to return the feats in the same
3764  // order they were on the original bioseq. In the future, we may
3765  // consider adding a flag to avoid sorting for time purposes).
3766  AutoPtr<TFeatValue> answer(new TFeatValue);
3767  SFeatKey all_feats_key(
3768  kAnyFeatType, kAnyFeatSubtype, bioseq_h);
3769  const TFeatValue & all_feats_vec = GetFeatFromCache(all_feats_key);
3770  ITERATE(TFeatValue, feat_it, all_feats_vec) {
3771  if( set_of_feats.find(*feat_it) != set_of_feats.end() ) {
3772  answer->push_back(*feat_it);
3773  }
3774  }
3775 
3776  return answer;
3777 }
3778 
3779 
3780 //LCOV_EXCL_START
3781 //not used
3782 bool
3784 {
3785  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3786  return m_eFeatKeyStr < rhs.m_eFeatKeyStr;
3787  }
3788  if( m_bioseq != rhs.m_bioseq ) {
3789  return m_bioseq < rhs.m_bioseq;
3790  }
3791  return s_QuickStringLess(m_feat_str, rhs.m_feat_str);
3792 }
3793 
3794 
3795 bool
3797 {
3798  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3799  return false;
3800  }
3801  if( m_bioseq != rhs.m_bioseq ) {
3802  return false;
3803  }
3804  return (m_feat_str == rhs.m_feat_str);
3805 }
3806 
3807 
3808 const CCacheImpl::TFeatValue &
3810  const SFeatStrKey & feat_str_key, const CTSE_Handle & tse_arg)
3811 {
3812  const CBioseq_Handle & search_bsh = feat_str_key.m_bioseq;
3813 
3814  // caller must give us something to work with
3815  _ASSERT(search_bsh || tse_arg);
3816 
3817  const CTSE_Handle & tse = (tse_arg ? tse_arg : search_bsh.GetTSE_Handle());
3818 
3819  // load cache if empty
3821  // (for now just indexes genes, but more may be added in the future)
3823  AutoPtr<CFeat_CI> p_gene_ci;
3824  // if we have TSE, get all features on it; otherwise, just get
3825  // the features from the bioseq
3826  if( tse ) {
3827  p_gene_ci.reset(new CFeat_CI(tse, sel));
3828  } else {
3829  p_gene_ci.reset(new CFeat_CI(search_bsh, sel));
3830  }
3831  CFeat_CI & gene_ci = *p_gene_ci; // for convenience
3832 
3833  for( ; gene_ci; ++gene_ci ) {
3834  CBioseq_Handle bsh = tse.GetScope().GetBioseqHandle(gene_ci->GetLocation());
3835  string label;
3836  const CGene_ref & gene_ref = gene_ci->GetData().GetGene();
3837 
3838  // for each one, add an entry for using given Bioseq and the
3839  // kAnyBioseq (so users can search on any bioseq)
3840  gene_ref.GetLabel(&label);
3841  SFeatStrKey label_key(eFeatKeyStr_Label, bsh, label);
3842  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
3843  if( bsh ) {
3844  label_key.m_bioseq = kAnyBioseq;
3845  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
3846  }
3847 
3848  const string & locus_tag = (
3849  gene_ref.IsSetLocus_tag() ? gene_ref.GetLocus_tag() :
3850  kEmptyStr);
3851  SFeatStrKey locus_tag_key(eFeatKeyStr_LocusTag, bsh, locus_tag);
3852  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
3853  if( bsh ) {
3854  locus_tag_key.m_bioseq = kAnyBioseq;
3855  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
3856  }
3857  }
3858  }
3859 
3860  // get from cache, if possible
3862  m_featStrKeyToFeatsCache.find(feat_str_key);
3863  if( find_iter != m_featStrKeyToFeatsCache.end() ) {
3864  return find_iter->second;
3865  } else {
3866  // nothing found
3867  return kEmptyFeatValue;
3868  }
3869 }
3870 
3871 
3874  const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,
3875  const CTSE_Handle & tse)
3876 {
3877  // load cache if empty
3878  if( m_featToBioseqCache.empty() ) {
3879  CBioseq_CI bioseq_ci(tse);
3880  for( ; bioseq_ci; ++bioseq_ci ) {
3881  CFeat_CI feat_ci(*bioseq_ci);
3882  for( ; feat_ci; ++feat_ci ) {
3883  m_featToBioseqCache[*feat_ci].insert(*bioseq_ci);
3884  }
3885  }
3886  }
3887 
3888  // we're being given the map to a feature, so we should've loaded
3889  // at least one feature when we loaded the cache
3891 
3892  // load from the cache
3894  m_featToBioseqCache.find(feat_to_bioseq_key);
3895  if( find_iter != m_featToBioseqCache.end() ) {
3896  return find_iter->second;
3897  } else {
3898  const static TFeatToBioseqValue kEmptyFeatToBioseqCache;
3899  return kEmptyFeatToBioseqCache;
3900  }
3901 }
3902 //LCOV_EXCL_STOP
3903 
3907  const CTSE_Handle & tse)
3908 {
     // Looks up the Bioseq handle cached for a Seq-id. When nothing is
     // cached for the id, a reference to a function-local static
     // default-constructed value is returned.
     // NOTE(review): the head of the signature and the line that performs
     // the lookup (`find_iter`) are not visible in this listing.
3909  _ASSERT(tse);
3910 
3911  // load cache if empty
     // NOTE(review): the map is filled only while it is empty, so a later
     // call with a different `tse` would still be answered from the ids of
     // the first one - confirm the cache is cleared between entries.
3912  if( m_IdToBioseqCache.empty() ) {
3913  CBioseq_CI bioseq_ci(tse);
3914  for( ; bioseq_ci; ++bioseq_ci ) {
3915  const CBioseq_Handle::TId & ids = bioseq_ci->GetId();
3916  ITERATE(CBioseq_Handle::TId, id_it, ids) {
3917  m_IdToBioseqCache[id_it->GetSeqId()] = *bioseq_ci;
3918  }
3919  }
3920  }
3921 
3922  // there should be at least one Bioseq otherwise there wouldn't
3923  // be anything to validate.
3925 
3927  if( find_iter != m_IdToBioseqCache.end() ) {
3928  return find_iter->second;
3929  } else {
3930  static const TIdToBioseqValue s_EmptyResult;
3931  return s_EmptyResult;
3932  }
3933 }
3934 
3937  CScope *scope, const CSeq_loc& loc, const CTSE_Handle & tse)
3938 {
3939  _ASSERT(scope || tse);
3940  if( ! tse || (!tse.GetTopLevelEntry().IsSet() && !tse.GetTopLevelEntry().IsSeq())) {
3941  // fall back on old style
3942  return BioseqHandleFromLocation(scope, loc);
3943  }
3944 
3945 
3946  for ( CSeq_loc_CI citer (loc); citer; ++citer) {
3947  CConstRef<CSeq_id> id(&citer.GetSeq_id());
3948  const TIdToBioseqValue & bioseq = GetIdToBioseq(id, tse);
3949  if( bioseq ) {
3950  return bioseq;
3951  }
3952  }
3953 
3954  // nothing found, so fall back on old style if possible
3955  if( scope ) {
3956  return BioseqHandleFromLocation(scope, loc);
3957  } else {
3958  return kEmptyBioseqHandle;
3959  }
3960 }
3961 
3962 
3964 {
3966  m_featCache.clear();
3970 }
3971 
3972 
3973 
3974 
3975 
3976 END_SCOPE(validator)
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
EErrType
@ eErr_SEQ_FEAT_WrongQualOnImpFeat
@ eErr_SEQ_DESCR_ObsoleteSourceQual
@ eErr_SEQ_DESCR_ObsoleteSourceLocation
@ eErr_SEQ_INST_FarFetchFailure
@ eErr_SEQ_FEAT_WholeLocation
@ eErr_SEQ_INST_ShortSeq
@ eErr_GENERIC_MissingPubRequirement
@ eErr_SEQ_FEAT_EcNumberProblem
@ eErr_SEQ_FEAT_DuplicateAnticodonInterval
@ eErr_SEQ_INST_CompleteGenomeHasGaps
@ eErr_SEQ_FEAT_CDShasTooManyXs
@ eErr_SEQ_FEAT_TranslExceptPhase
@ eErr_SEQ_FEAT_MinusStrandProtein
@ eErr_SEQ_INST_CompleteTitleProblem
@ eErr_SEQ_PKG_EmptySet
@ eErr_SEQ_DESCR_UnwantedCompleteFlag
@ eErr_SEQ_FEAT_GeneXrefWithoutLocus
@ eErr_SEQ_FEAT_BadLocation
@ eErr_SEQ_FEAT_GenesInconsistent
@ eErr_SEQ_INST_HighNContentStretch
@ eErr_SEQ_PKG_NoBioseqFound
@ eErr_SEQ_FEAT_PseudoRnaHasProduct
@ eErr_SEQ_DESCR_InconsistentBioSources
@ eErr_GENERIC_PastReleaseDate
@ eErr_SEQ_DESCR_BioSourceDbTagConflict
@ eErr_SEQ_FEAT_UnknownImpFeatQual
@ eErr_SEQ_FEAT_DuplicateExonInterval
@ eErr_GENERIC_UnnecessaryPubEquiv
@ eErr_SEQ_DESCR_BioSourceOnProtein
@ eErr_SEQ_DESCR_LatLonRange
@ eErr_SEQ_FEAT_UnnecessaryTranslExcept
@ eErr_SEQ_GRAPH_GraphBioseqId
@ eErr_SEQ_FEAT_MixedStrand
@ eErr_SEQ_FEAT_BadRRNAcomponentOrder
@ eErr_SEQ_DESCR_DuplicatePCRPrimerSequence
@ eErr_SEQ_FEAT_BadGeneOntologyFormat
@ eErr_SEQ_DESCR_LatLonCountry
@ eErr_SEQ_PKG_NucProtSetHasTitle
@ eErr_SEQ_FEAT_IllegalDbXref
@ eErr_GENERIC_SgmlPresentInText
@ eErr_SEQ_FEAT_BadAnticodonAA
@ eErr_SEQ_FEAT_MissingCDSproduct
@ eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap
@ eErr_SEQ_FEAT_TranslExceptAndRnaEditing
@ eErr_GENERIC_BarcodeTooManyNs
@ eErr_SEQ_PKG_BioseqSetClassNotSet
@ eErr_SEQ_DESCR_NoOrgFound
@ eErr_SEQ_FEAT_MissingProteinName
@ eErr_SEQ_DESCR_BadPCRPrimerSequence
@ eErr_SEQ_FEAT_GeneXrefWithoutGene
@ eErr_SEQ_DESCR_TransgenicProblem
@ eErr_SEQ_PKG_MissingSetTitle
@ eErr_SEQ_FEAT_InvalidQualifierValue
@ eErr_SEQ_FEAT_GeneOntologyTermMissingGOID
@ eErr_SEQ_FEAT_ProtRefHasNoData
@ eErr_SEQ_GRAPH_GraphSeqLocLen
@ eErr_SEQ_DESCR_InvalidForType
@ eErr_SEQ_DESCR_LatLonValue
@ eErr_SEQ_FEAT_TransLen
@ eErr_SEQ_FEAT_FeatureCitationProblem
@ eErr_SEQ_DESCR_IdenticalInstitutionCode
@ eErr_SEQ_PKG_ImproperlyNestedSets
@ eErr_SEQ_INST_UnknownLengthGapNot100
@ eErr_SEQ_FEAT_WrongQualOnFeature
@ eErr_SEQ_FEAT_MultipleProtRefs
@ eErr_SEQ_FEAT_MultipleEquivPublications
@ eErr_SEQ_PKG_SeqSubmitWithWgsSet
@ eErr_SEQ_PKG_InconsistentMoltypeSet
@ eErr_SEQ_INST_ConflictingBiomolTech
@ eErr_SEQ_FEAT_MissingQualOnImpFeat
@ eErr_SEQ_PKG_INSDRefSeqPackaging
@ eErr_SEQ_FEAT_LocusCollidesWithLocusTag
@ eErr_SEQ_PKG_GPSnonGPSPackaging
@ eErr_SEQ_DESCR_BadCollectionDate
@ eErr_SEQ_FEAT_MultipleEquivBioSources
@ eErr_SEQ_FEAT_CDSwithNoMRNAOverlap
@ eErr_SEQ_DESCR_BadInstitutionCode
@ eErr_SEQ_FEAT_PeptideFeatOutOfFrame
@ eErr_SEQ_FEAT_ProteinNameHasPMID
@ eErr_SEQ_FEAT_RepeatRegionNeedsNote
@ eErr_SEQ_DESCR_BadAltitude
@ eErr_SEQ_FEAT_GeneXrefStrandProblem
@ eErr_SEQ_FEAT_MissingTrnaAA
@ eErr_GENERIC_NonAsciiAsn
@ eErr_SEQ_FEAT_CDSwithMultipleMRNAs
@ eErr_SEQ_FEAT_CollidingFeatureIDs
@ eErr_SEQ_DESCR_IncorrectlyFormattedVoucherID
@ eErr_SEQ_FEAT_OrfCdsHasProduct
@ eErr_SEQ_FEAT_ImproperBondLocation
@ eErr_SEQ_PKG_GraphPackagingProblem
@ eErr_SEQ_INST_OverlappingDeltaRange
@ eErr_SEQ_FEAT_BadTranssplicedInterval
@ eErr_SEQ_INST_SeqLocLength
@ eErr_SEQ_DESCR_MultipleTaxonIDs
@ eErr_SEQ_DESCR_BadKeyword
@ eErr_SEQ_FEAT_UnknownImpFeatKey
@ eErr_SEQ_DESCR_Inconsistent
@ eErr_SEQ_PKG_ArchaicFeatureLocation
@ eErr_GENERIC_BadDate
@ eErr_GENERIC_BarcodeTestFails
@ eErr_SEQ_FEAT_NestedSeqLocMix
@ eErr_SEQ_FEAT_ShortIntron
@ eErr_SEQ_FEAT_UnknownFeatureQual
@ eErr_SEQ_DESCR_MultipleChromosomes
@ eErr_SEQ_FEAT_Range
@ eErr_SEQ_FEAT_InconsistentGeneOntologyTermAndId
@ eErr_SEQ_PKG_MisplacedMolInfo
@ eErr_GENERIC_EmbeddedScript
@ eErr_GENERIC_BarcodeTestPasses
@ eErr_SEQ_GRAPH_GraphAbove
@ eErr_SEQ_FEAT_FeatureInsideGap
@ eErr_SEQ_FEAT_DifferntIdTypesInSeqLoc
@ eErr_SEQ_FEAT_BadFullLengthFeature
@ eErr_SEQ_FEAT_RNAtype0
@ eErr_SEQ_FEAT_BadCharInAuthorName
@ eErr_SEQ_FEAT_FarLocation
@ eErr_SEQ_INST_BadHTGSeq
@ eErr_SEQ_FEAT_InvalidFuzz
@ eErr_SEQ_FEAT_InvalidInferenceValue
@ eErr_SEQ_FEAT_GeneXrefNeeded
@ eErr_SEQ_INST_UnexpectedIdentifierChange
@ eErr_SEQ_FEAT_InconsistentRRNAstrands
@ eErr_SEQ_PKG_ArchaicFeatureProduct
@ eErr_SEQ_DESCR_MultipleSourceQualifiers
@ eErr_SEQ_FEAT_BadRRNAcomponentOverlap
@ eErr_SEQ_FEAT_BadTrailingCharacter
@ eErr_SEQ_DESCR_WrongVoucherType
@ eErr_SEQ_INST_ProteinsHaveGeneralID
@ eErr_SEQ_GRAPH_GraphOutOfOrder
@ eErr_SEQ_FEAT_BadInternalCharacter
@ eErr_SEQ_DESCR_NoSourceDescriptor
@ eErr_SEQ_DESCR_BadCollectionCode
@ eErr_SEQ_FEAT_BadProteinName
@ eErr_SEQ_FEAT_FeatureProductInconsistency
@ eErr_GENERIC_PublicationInconsistency
@ eErr_GENERIC_CollidingSerialNumbers
@ eErr_SEQ_PKG_ComponentMissingTitle
@ eErr_SEQ_DESCR_DBLinkMissingUserObject
@ eErr_SEQ_PKG_InternalGenBankSet
@ eErr_SEQ_DESCR_BioSourceMissing
@ eErr_SEQ_FEAT_BadAnticodonCodon
@ eErr_SEQ_FEAT_BadTrailingHyphen
@ eErr_SEQ_FEAT_OldLocusTagMismtach
@ eErr_SEQ_DESCR_MolInfoConflictsWithBioSource
@ eErr_SEQ_FEAT_UTRdoesNotAbutCDS
@ eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct
@ eErr_SEQ_FEAT_ConflictFlagSet
@ eErr_SEQ_FEAT_StrandOther
@ eErr_SEQ_PKG_FeaturePackagingProblem
@ eErr_SEQ_DESCR_MultipleNames
@ eErr_SEQ_INST_BadSeqIdFormat
@ eErr_SEQ_PKG_GenomicProductPackagingProblem
@ eErr_INTERNAL_Exception
@ eErr_SEQ_FEAT_BadEcNumberFormat
@ eErr_SEQ_FEAT_CDSproductPackagingProblem
@ eErr_SEQ_FEAT_RedundantFields
@ eErr_SEQ_INST_InternalNsInSeqRaw
@ eErr_SEQ_DESCR_BadOrgMod
@ eErr_SEQ_INST_TerminalNs
@ eErr_SEQ_DESCR_BadOrganelleLocation
@ eErr_SEQ_FEAT_NoNameForProtein
@ eErr_SEQ_FEAT_RptUnitRangeProblem
@ eErr_SEQ_FEAT_SeqLocOrder
@ eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem
@ eErr_SEQ_FEAT_CDSmRNAXrefLocationProblem
@ eErr_SEQ_PKG_SingleItemSet
@ eErr_SEQ_DESCR_BioSourceNeedsChromosome
@ eErr_SEQ_FEAT_VectorContamination
@ eErr_SEQ_FEAT_AbuttingIntervals
@ eErr_SEQ_FEAT_CDSrange
@ eErr_SEQ_FEAT_LocusTagProblem
@ eErr_SEQ_DESCR_BioSourceInconsistency
@ eErr_SEQ_FEAT_OnlyGeneXrefs
@ eErr_SEQ_FEAT_TranslExcept
@ eErr_SEQ_INST_InternalGapsInSeqRaw
@ eErr_SEQ_FEAT_GeneRefHasNoData
@ eErr_SEQ_INST_DuplicateSegmentReferences
@ eErr_SEQ_FEAT_TooManyInferenceAccessions
@ eErr_SEQ_FEAT_TerminalXDiscrepancy
@ eErr_SEQ_FEAT_MiscFeatureNeedsNote
@ eErr_SEQ_DESCR_CollidingPublications
@ eErr_SEQ_FEAT_GenomeSetMixedStrand
@ eErr_SEQ_FEAT_BadCharInAuthorLastName
@ eErr_SEQ_FEAT_HypotheticalProteinMismatch
@ eErr_SEQ_INST_TpaAssemblyProblem
@ eErr_SEQ_FEAT_MissingGeneXref
AutoPtr –.
Definition: ncbimisc.hpp:401
CAlign_CI –.
Definition: align_ci.hpp:63
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
static void GetPubdescLabels(const CPubdesc &pd, vector< TEntrezId > &pmids, vector< TEntrezId > &muids, vector< int > &serials, vector< string > &published_labels, vector< string > &unpublished_labels)
For Publication Citations Get labels for a pubdesc.
Definition: cleanup.cpp:3167
Definition: Dbtag.hpp:53
bool GetDBFlags(bool &is_refseq, bool &is_src, string &correct_caps) const
Definition: Dbtag.cpp:327
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
CFeat_CI –.
Definition: feat_ci.hpp:64
void Clear()
Definition: gene_cache.hpp:122
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
Definition: gene_cache.cpp:106
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
CGraph_CI –.
Definition: graph_ci.hpp:234
CMappedFeat –.
Definition: mapped_feat.hpp:59
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:244
CObjectManager –.
const string & GetDivision(void) const
Definition: Org_ref.cpp:164
bool IsSetDivision(void) const
Definition: Org_ref.cpp:159
@ eContent
Definition: Pub.hpp:66
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
static bool RequireLocationIntervalsInBiologicalOrder(ESubtype subtype)
static bool AllowAdjacentIntervals(ESubtype subtype)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
@ eContent
Definition: Seq_entry.hpp:93
void GetLabel(string *label, ELabelType type) const
Definition: Seq_entry.cpp:274
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
CSubmit_block –.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
Definition: tse_handle.cpp:205
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:604
CScope & GetScope(void) const
Returns scope.
Definition: tse_handle.hpp:325
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Thrown on an attempt to write unassigned data member.
Definition: exception.hpp:84
static string GetFeatureBioseqLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorContent(const CSeqdesc &ds)
static string GetFeatureLocationLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetFeatureProductLocLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorLabel(const CSeqdesc &ds, const CSeq_entry &ctx, CRef< CScope > scope, bool suppress_context)
static string GetFeatureContentLabel(const CSeq_feat &feat, CRef< CScope > scope)
static string GetFeatureIdLabel(const CSeq_feat &ft)
static string GetBioseqSetLabel(const CBioseq_set &st, CRef< CScope > scope, bool suppress_context)
void ValidateSeqAlign(const CSeq_align &align, int order=-1)
void ValidateSeqAnnot(const CSeq_annot_Handle &annot)
CCacheImpl & GetCache()
virtual ~CValidError_base()
static CSeq_entry_Handle GetAppropriateXrefParent(CSeq_entry_Handle seh)
CValidError_imp & m_Imp
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
CValidError_base(CValidError_imp &imp)
void ValidateBioseq(const CBioseq &seq)
bool GetTSAConflictingBiomolTechErrors(const CBioseq &seq)
bool GetTSANStretchErrors(const CBioseq &seq)
void ValidateBioseqSet(const CBioseq_set &seqset)
void ValidateSeqDesc(const CSeqdesc &desc, const CSeq_entry &ctx)
Validate descriptors as stand alone objects (no context)
void SetScope(CScope &scope)
void SetTSE(CSeq_entry_Handle seh)
bool GetTSACDSOnMinusStrandErrors(const CSeq_feat &feat, const CBioseq &seq)
static bool GetPrefixAndAccessionFromInferenceAccession(string inf_accession, string &prefix, string &accession)
void ValidateSeqFeat(const CSeq_feat &feat)
static vector< string > GetAccessionsFromInferenceString(string inference, string &prefix, string &remainder, bool &same_species)
void ValidateSeqGraph(const CSeq_graph &graph)
void x_ReportInvalidFuzz(const CPacked_seqint &packed_int, const CSerialObject &obj)
CRef< CObjectManager > m_ObjMgr
bool IsGED() const
void SetScope(const CSeq_entry &se)
void FindCollidingSerialNumbers(const CSerialObject &obj)
Definition: valid_pub.cpp:1295
const CSeq_entry_Handle & GetTSEH()
static bool BadMultipleSequenceLocation(const CSeq_loc &loc, CScope &scope)
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:358
static bool IsTSAIntermediate(const CBioseq &seq)
void x_CheckPackedInt(const CPacked_seqint &packed_int, SLocCheck &lc, const CSerialObject &obj)
static bool IsInOrganelleSmallGenomeSet(const CSeq_id &id, CScope &scope)
bool IsNC() const
const CBioSourceKind & BioSourceKind() const
SIZE_TYPE m_NumPseudogene
bool IsNS() const
CRef< CScope > m_Scope
bool HasGiOrAccnVer() const
SIZE_TYPE m_NumTpaWithHistory
void SetTSE(const CSeq_entry_Handle &seh)
const SValidatorContext & GetContext() const
Definition: validatorp.cpp:204
CValidator::TProgressCallback m_PrgCallback
bool IsPDB() const
CValidError * m_ErrRepository
CConstRef< CSeq_feat > GetmRNAGivenProduct(const CBioseq &seq)
bool IsValidateAlignments() const
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id)
Definition: validatorp.cpp:248
void ValidateCitations(const CSeq_entry_Handle &seh)
bool DoesAnyFeatLocHaveGI() const
void FindNonAsciiText(const CSerialObject &obj)
void AddBioseqWithNoBiosource(const CBioseq &seq)
void ValidateSeqLocIds(const CSeq_loc &loc, const CSerialObject &obj)
bool GenerateGoldenFile() const
bool IsStandaloneAnnot() const
void x_DoBarcodeTests(CSeq_entry_Handle seh)
CConstRef< CSeq_annot > m_SeqAnnot
bool IsNM() const
bool DoesAnyProductLocHaveGI() const
bool GetTSAConflictingBiomolTechErrors(const CSeq_entry_Handle &se)
void x_AddValidErrItem(EDiagSev sev, EErrType type, const string &msg, const string &desc, const CSerialObject &obj, const string &accession, const int version)
unique_ptr< CValidatorEntryInfo > m_pEntryInfo
SIZE_TYPE m_NumMisplacedGraphs
bool IsNT() const
bool IsGenbank() const
void PostObjErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
bool IsNZ() const
void Setup(const CSeq_entry_Handle &seh)
bool Validate(const CSeq_entry &se, const CCit_sub *cs=nullptr, CScope *scope=nullptr)
SIZE_TYPE m_NumTpaWithoutHistory
static bool IsWGSIntermediate(const CBioseq &seq)
CValidator::CProgressInfo m_PrgInfo
void ValidateDbxref(const CDbtag &xref, const CSerialObject &obj, bool biosource=false, const CSeq_entry *ctx=nullptr)
bool IsSerialNumberInComment(const string &comment)
bool IsGenomic() const
void ValidateTaxonomy(const CSeq_entry &se)
bool IsFarSequence(const CSeq_id &id)
Definition: validatorp.cpp:234
const CTSE_Handle & GetTSE_Handle()
void FindEmbeddedScript(const CSerialObject &obj)
bool IsHugeFileMode() const
Definition: validatorp.cpp:211
SIZE_TYPE m_NumSmallGenomeSetMisplaced
void ValidateCitSub(const CCit_sub &cs, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
Definition: valid_pub.cpp:1049
void SetOptions(Uint4 options)
Definition: validatorp.cpp:270
bool m_ValidateInferenceAccessions
void ValidateSubmitBlock(const CSubmit_block &block, const CSeq_submit &ss)
bool IsNoCitSubPubs() const
void SetErrorRepository(CValidError *errors)
Definition: validatorp.cpp:304
bool IsNP() const
vector< CConstRef< CBioseq > > m_BioseqWithNoSource
void ValidateAffil(const CAffil::TStd &std, const CSerialObject &obj, const CSeq_entry *ctx)
Definition: valid_pub.cpp:958
CConstRef< CSeq_feat > GetCDSGivenProduct(const CBioseq &seq)
CBioseq_Handle GetLocalBioseqHandle(const CSeq_id &id)
Definition: validatorp.cpp:257
bool IsRefSeq() const
bool IsGPS() const
bool IsINSDInSep() const
bool IsNG() const
const CSeq_entry * GetAncestor(const CBioseq &seq, CBioseq_set::EClass clss)
bool IsGeneious() const
SIZE_TYPE m_NumGeneXrefs
bool x_IsFarFetchFailure(const CSeq_loc &loc)
bool IsNoPubs() const
CValidError_imp(CObjectManager &objmgr, shared_ptr< SValidatorContext > pContext, CValidError *errors, Uint4 options=0)
Definition: validatorp.cpp:169
void PostBadDateError(EDiagSev sv, const string &msg, int flags, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void AddProtWithoutFullRef(const CBioseq_Handle &seq)
void ValidateBioSource(const CBioSource &bsrc, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
const CValidatorEntryInfo & GetEntryInfo() const
CSeq_entry_Handle m_TSEH
bool RaiseGenomeSeverity(EErrType et)
Definition: validatorp.cpp:601
SIZE_TYPE m_NumBioseq_set
bool x_CheckSeqInt(CConstRef< CSeq_id > &id_cur, const CSeq_interval *int_cur, ENa_strand &strand_cur, const CSerialObject &obj)
bool RequireLocalProduct(const CSeq_id *sid) const
bool IsGI() const
bool IsGpipe() const
bool IsFarFetchCDSproducts() const
bool IsPatent() const
bool DoesAnyProteinHaveGeneralID() const
virtual ~CValidError_imp()
Definition: validatorp.cpp:189
void x_Init(Uint4 options)
Definition: validatorp.cpp:180
bool IsWP() const
void ReportMissingPubs(const CSeq_entry &se, const CCit_sub *cs)
Definition: valid_pub.cpp:1249
bool IsNoBioSource() const
bool IsMixedStrands(const CSeq_loc &loc)
bool IsLocalGeneralOnly() const
CBioSourceKind m_biosource_kind
CConstRef< CSeq_entry > m_TSE
CGeneCache m_GeneCache
void x_InitLocCheck(SLocCheck &lc, const string &prefix)
SValidatorContext & SetContext()
Definition: validatorp.cpp:194
bool IsIndexerVersion() const
CGeneCache & GetGeneCache()
bool IsSmallGenomeSet() const
bool IsEmbl() const
void SetProgressCallback(CValidator::TProgressCallback callback, void *user_data)
bool GetTSACDSOnMinusStrandErrors(const CSeq_entry_Handle &se)
void ValidateMultipleTaxIds(const CSeq_entry_Handle &seh)
bool IsHugeSet(const CBioseq_set &bioseqSet) const
Definition: validatorp.cpp:219
bool IsSeqSubmit() const
void ValidateSeqLoc(const CSeq_loc &loc, const CBioseq_Handle &seq, bool report_abutting, const string &prefix, const CSerialObject &obj, bool lowerSev=false)
bool GetTSANStretchErrors(const CSeq_entry_Handle &se)
bool IsXR() const
shared_ptr< SValidatorContext > m_pContext
bool DoesAnyGeneHaveLocusTag() const
bool IsTPE() const
void x_CheckLoc(const CSeq_loc &loc, const CSerialObject &obj, SLocCheck &lc, bool lowerSev=false)
bool IsNR() const
const CSeq_entry & GetTSE() const
bool IsFarFetchMRNAproducts() const
CValidatorEntryInfo & x_SetEntryInfo()
SIZE_TYPE m_NumMisplacedFeatures
CCacheImpl & GetCache()
bool IsDdbj() const
void x_CheckForStrandChange(SLocCheck &lc)
bool IsNW() const
void ReportMissingBiosource(const CSeq_entry &se)
void ValidatePubdesc(const CPubdesc &pub, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
Definition: valid_pub.cpp:78
void AddValidErrItem(EDiagSev sev, unsigned int ec, const string &msg, const string &desc, const CSerialObject &obj, const string &acc, const int ver, const string &location=kEmptyStr, const int seq_offset=0)
Definition: ValidError.cpp:59
void SetNoBioSource(bool val=true)
Definition: entry_info.cpp:42
void SetPatent(bool val=true)
Definition: entry_info.cpp:46
void SetGpipe(bool val=true)
Definition: entry_info.cpp:52
bool IsDdbj() const
Definition: entry_info.cpp:75
void SetGenomic(bool val=true)
Definition: entry_info.cpp:55
bool IsGPS() const
Definition: entry_info.cpp:69
void SetProductLocHasGI(bool val=true)
Definition: entry_info.cpp:60
bool IsNoPubs() const
Definition: entry_info.cpp:66
bool DoesAnyGeneHaveLocusTag() const
Definition: entry_info.cpp:87
void SetPDB(bool val=true)
Definition: entry_info.cpp:45