NCBI C++ ToolKit
validatorp.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validatorp.cpp 102488 2024-05-13 20:57:01Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27  *
28  * File Description:
29  * Implementation of private parts of the validator
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
38 
48 
49 #include <serial/iterator.hpp>
50 #include <serial/enumvalues.hpp>
51 
55 
57 
60 
61 #include <objects/seq/Bioseq.hpp>
63 #include <objects/seq/Seqdesc.hpp>
65 #include <objects/seq/Pubdesc.hpp>
66 #include <objects/seq/MolInfo.hpp>
73 
78 
80 
83 
84 #include <objmgr/bioseq_ci.hpp>
85 #include <objmgr/seqdesc_ci.hpp>
86 #include <objmgr/graph_ci.hpp>
87 #include <objmgr/seq_annot_ci.hpp>
88 #include <objmgr/util/feature.hpp>
89 #include <objmgr/util/sequence.hpp>
90 
91 #include <objmgr/feat_ci.hpp>
92 #include <objmgr/align_ci.hpp>
93 #include <objmgr/seq_vector.hpp>
94 #include <objmgr/scope.hpp>
95 
96 #include <objects/pub/Pub.hpp>
98 
110 #include <objects/biblio/Title.hpp>
112 #include <objects/biblio/Affil.hpp>
115 #include <objects/taxon3/taxon3.hpp>
117 
124 
125 #include <objtools/error_codes.hpp>
131 #include <util/sgml_entity.hpp>
132 #include <util/line_reader.hpp>
133 #include <util/util_misc.hpp>
134 #include <util/static_set.hpp>
135 
136 #include <algorithm>
137 
138 
139 #include <serial/iterator.hpp>
140 
141 #define NCBI_USE_ERRCODE_X Objtools_Validator
142 
145 BEGIN_SCOPE(validator)
146 using namespace sequence;
147 
148 namespace {
149  // File-local comparator instance, shared so that a PQuickStringLess
150  // does not have to be created for every comparison.
151  PQuickStringLess s_QuickStringLess;
152 } // anonymous namespace (no trailing ';' - it would be an empty declaration)
152 
153 
154 // =============================================================================
155 // CValidError_imp Public
156 // =============================================================================
157 
163 
167 
169 (CObjectManager& objmgr,
170  shared_ptr<SValidatorContext> pContext,
171  IValidError* errs,
172  Uint4 options) :
173  m_ObjMgr{&objmgr},
174  m_ErrRepository{errs},
175  m_pContext{pContext}
176 {
177  x_Init(options, pContext->CumulativeInferenceCount);
178 }
179 
// Common initialization step: applies the option flags and then resets the
// validator's state.
//   options               - bit flags tested against CValidator::eVal_* values
//                           by SetOptions().
//   initialInferenceCount - forwarded to Reset(), which stores it as the
//                           starting value of m_CumulativeInferenceCount.
180 void CValidError_imp::x_Init(Uint4 options, size_t initialInferenceCount)
181 {
182  SetOptions(options);
183  Reset(initialInferenceCount);
184 
186 }
187 
188 // Destructor
190 {
191 }
192 
193 
195 {
196  return m_SuppressedErrors;
197 }
198 
200 {
201  // if (!m_pContext) {
202  // m_pContext = make_shared<SValidatorContext>();
203  // }
205  return *m_pContext;
206 }
207 
208 
210 {
212  return *m_pContext;
213 }
214 
215 
217 {
218  const auto& context = GetContext();
219  return context.PreprocessHugeFile ||
220  context.PostprocessHugeFile;
221 }
222 
223 
224 // Tells whether the given Bioseq-set counts as a "huge set". A set without
225 // a class never does; otherwise the set-class overload makes the decision.
226 bool CValidError_imp::IsHugeSet(const CBioseq_set& bioseqSet) const
227 {
228  return bioseqSet.IsSetClass() && IsHugeSet(bioseqSet.GetClass());
229 }
231 
232 
234 {
235  return edit::CHugeAsnReader::IsHugeSet(setClass);
236 }
237 
238 
239 // Decides whether `id` names a sequence that lies outside the data being validated.
240 bool CValidError_imp::IsFarSequence(const CSeq_id& id) // const
241 {
242  // In huge-file mode, defer to the context's IsIdInBlob callable when one is set.
243  const bool useBlobLookup = IsHugeFileMode() && GetContext().IsIdInBlob;
244  if (useBlobLookup) {
245  return !GetContext().IsIdInBlob(id);
246  }
247  // Otherwise the id is "far" exactly when GetBioseqHandleFromTSE() yields no handle.
248  _ASSERT(m_Scope);
249  return GetBioseqHandleFromTSE(id) ? false : true;
250 }
251 
252 
254 {
255  if (m_Scope) {
257  }
258  return CBioseq_Handle();
259 }
260 
261 
263 {
264  if (!IsHugeFileMode()) {
265  return GetBioseqHandleFromTSE(id);
266  }
267  // Huge-file mode
268  if (!IsFarSequence(id)) {
269  return m_Scope->GetBioseqHandle(id);
270  }
271  return CBioseq_Handle();
272 }
273 
274 
276 {
277  m_NonASCII = (options & CValidator::eVal_non_ascii) != 0;
280  m_ValidateExons = (options & CValidator::eVal_val_exons) != 0;
281  m_OvlPepErr = (options & CValidator::eVal_ovl_pep_err) != 0;
284  m_RemoteFetch = (options & CValidator::eVal_remote_fetch) != 0;
290  m_UseEntrez = (options & CValidator::eVal_use_entrez) != 0;
305 }
306 
307 
308 //LCOV_EXCL_START
309 //not used by asnvalidate
311 {
312  m_ErrRepository = errors;
313 }
314 //LCOV_EXCL_STOP
315 
316 
// Puts the validator back into a clean state before a validation run.
// Clears the scope / TSE / stand-alone annotation references, replaces the
// entry-info object with a freshly constructed one, clears the boolean flags,
// the progress callback and the counters, and empties the set of suppressed
// error types.
//   prevCumulativeInferenceCount - value the cumulative inference counter
//                                  starts from (it is not reset to zero here).
317 void CValidError_imp::Reset(size_t prevCumulativeInferenceCount)
318 {
     // Drop references to the data being validated.
319  m_Scope = nullptr;
320  m_TSE = nullptr;
321  m_IsStandaloneAnnot = false;
322  m_SeqAnnot.Reset();
323 
     // Start with a brand-new entry-info object.
324  m_pEntryInfo.reset(new CValidatorEntryInfo());
325 
     // Carry the inference count over from the caller instead of zeroing it.
326  m_CumulativeInferenceCount = prevCumulativeInferenceCount;
327 
     // Per-record flags; the names presumably correspond to accession
     // prefixes (NC_, NG_, NM_, ...) - TODO confirm against the header.
328  m_IsNC = false;
329  m_IsNG = false;
330  m_IsNM = false;
331  m_IsNP = false;
332  m_IsNR = false;
333  m_IsNZ = false;
334  m_IsNS = false;
335  m_IsNT = false;
336  m_IsNW = false;
337  m_IsWP = false;
338  m_IsXR = false;
339 
     // No progress callback, and all object counters back to zero.
340  m_PrgCallback = nullptr;
341  m_NumAlign = 0;
342  m_NumAnnot = 0;
343  m_NumBioseq = 0;
344  m_NumBioseq_set = 0;
346  m_NumDesc = 0;
347  m_NumDescr = 0;
348  m_NumFeat = 0;
349  m_NumGraph = 0;
353  m_NumGenes = 0;
354  m_NumGeneXrefs = 0;
357  m_NumPseudo = 0;
358  m_NumPseudogene = 0;
359  m_FarFetchFailure = false;
360  m_IsTbl2Asn = false;
361 
     // Forget any error types that were marked as suppressed.
362  SetSuppressed().clear();
363 }
364 
366 {
367  return (m_SuppressedErrors.find(errType) != m_SuppressedErrors.end());
368 }
369 
370 // Error post methods
372 (EDiagSev sv,
373  EErrType et,
374  const string& msg,
375  const CSerialObject& obj)
376 {
377  if (x_IsSuppressed(et)) {
378  return;
379  }
380 
381  const CTypeInfo* type_info = obj.GetThisTypeInfo();
382  if (type_info == CSeqdesc::GetTypeInfo()) {
383  const CSeqdesc* desc = dynamic_cast < const CSeqdesc* > (&obj);
384  ERR_POST_X(1, Warning << "Seqdesc validation error using default context.");
385  PostErr (sv, et, msg, GetTSE(), *desc);
386  } else if (type_info == CSeq_feat::GetTypeInfo()) {
387  const CSeq_feat* feat = dynamic_cast < const CSeq_feat* > (&obj);
388  PostErr (sv, et, msg, *feat);
389  } else if (type_info == CBioseq::GetTypeInfo()) {
390  const CBioseq* seq = dynamic_cast < const CBioseq* > (&obj);
391  PostErr (sv, et, msg, *seq);
392  } else if (type_info == CBioseq_set::GetTypeInfo()) {
393  const CBioseq_set* set = dynamic_cast < const CBioseq_set* > (&obj);
394  PostErr (sv, et, msg, *set);
395  } else if (type_info == CSeq_annot::GetTypeInfo()) {
396  const CSeq_annot* annot = dynamic_cast < const CSeq_annot* > (&obj);
397  PostErr (sv, et, msg, *annot);
398  } else if (type_info == CSeq_graph::GetTypeInfo()) {
399  const CSeq_graph* graph = dynamic_cast < const CSeq_graph* > (&obj);
400  PostErr (sv, et, msg, *graph);
401  } else if (type_info == CSeq_align::GetTypeInfo()) {
402  const CSeq_align* align = dynamic_cast < const CSeq_align* > (&obj);
403  PostErr (sv, et, msg, *align);
404  } else if (type_info == CSeq_entry::GetTypeInfo()) {
405  const CSeq_entry* entry = dynamic_cast < const CSeq_entry* > (&obj);
406  PostErr (sv, et, msg, *entry);
407  } else if (type_info == CBioSource::GetTypeInfo()) {
408  const CBioSource* src = dynamic_cast < const CBioSource* > (&obj);
409  PostErr (sv, et, msg, *src);
410  } else if (type_info == COrg_ref::GetTypeInfo()) {
411  const COrg_ref* org = dynamic_cast < const COrg_ref* > (&obj);
412  PostErr (sv, et, msg, *org);
413  } else if (type_info == CPubdesc::GetTypeInfo()) {
414  const CPubdesc* pd = dynamic_cast < const CPubdesc* > (&obj);
415  PostErr (sv, et, msg, *pd);
416  } else if (type_info == CSeq_submit::GetTypeInfo()) {
417  const CSeq_submit* ss = dynamic_cast < const CSeq_submit* > (&obj);
418  PostErr (sv, et, msg, *ss);
419  } else {
420  ERR_POST_X(1, Warning << "Unknown data type in PostErr.");
421  }
422 }
423 
424 
425 /*
426 void CValidError_imp::PostErr
427 (EDiagSev sv,
428  EErrType et,
429  const string& msg,
430  TDesc ds)
431 {
432  // Append Descriptor label
433  string desc = "DESCRIPTOR: ";
434  ds.GetLabel (&desc, CSeqdesc::eBoth);
435  desc += ", NO Descriptor Context";
436  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
437 }
438 */
439 
440 static const EErrType sc_ValidGenomeRaise[] = {
598 };
599 
601 
607 };
608 
610 
611 
614 };
615 
617 
618 
620  EErrType et
621 )
622 
623 {
624  if (sc_GenomeRaiseExceptEmblDdbjRefSeqArray.find(et) != sc_GenomeRaiseExceptEmblDdbjRefSeqArray.end()) {
625  if (IsEmbl() || IsDdbj() || IsRefSeq()) {
626  return false;
627  } else {
628  return true;
629  }
630  }
631  if (sc_GenomeRaiseExceptEmblDdbjArray.find(et) != sc_GenomeRaiseExceptEmblDdbjArray.end()) {
632  if (IsEmbl() || IsDdbj()) {
633  return false;
634  } else {
635  return true;
636  }
637  }
638  if (sc_GenomeRaiseArray.find (et) != sc_GenomeRaiseArray.end()) {
639  return true;
640  }
641  return false;
642 }
643 
645 (EDiagSev sv,
646  EErrType et,
647  const string& msg,
648  TFeat ft)
649 {
650 
651  if (x_IsSuppressed(et)) {
652  return;
653  }
654 
656 
657  // Adjust severity
659  sv = eDiag_Error;
660  }
661 
662  item->SetSev(sv);
663  item->SetErrIndex(et);
664  item->SetMsg(msg);
665  item->SetObject(ft);
666 
667  if (GenerateGoldenFile()) {
669  return;
670  }
671 
672  string content_label = CValidErrorFormat::GetFeatureContentLabel(ft, m_Scope);
673  item->SetObj_content(content_label);
674 
675  string feature_id = CValidErrorFormat::GetFeatureIdLabel(ft);
676  if (!NStr::IsBlank(feature_id)) {
677  item->SetFeatureId(feature_id);
678  }
679 
681  if (!NStr::IsBlank(bioseq_label)) {
682  item->SetBioseq(bioseq_label);
683  }
684 
685  // Calculate sequence offset
686  TSeqPos offset = 0;
687  string location;
688  if (ft.IsSetLocation()) {
691  if (!NStr::IsBlank(loc_label)) {
692  item->SetLocation(loc_label);
693  }
694  item->SetSeqOffset(offset);
695  }
696 
697 
699  if (!NStr::IsBlank(product_label)) {
700  item->SetProduct_loc(product_label);
701  }
702 
703  int version = 0;
704  string accession;
705  if (m_Scope) {
706  accession = GetAccessionFromObjects(&ft, nullptr, *m_Scope, &version);
707  }
708  item->SetAccession(accession);
709  if (version > 0) {
710  item->SetAccnver(accession + "." + NStr::IntToString(version));
711  item->SetVersion(version);
712  } else {
713  item->SetAccnver(accession);
714  }
715 
716  if (ft.IsSetData()) {
717  if (ft.GetData().IsGene()) {
718  if (ft.GetData().GetGene().IsSetLocus_tag() &&
720  item->SetLocus_tag(ft.GetData().GetGene().GetLocus_tag());
721  }
722  } else {
723  if (m_CollectLocusTags) {
724  // TODO: this should be part of post-processing
726  if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
727  !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
728  item->SetLocus_tag(gene->GetData().GetGene().GetLocus_tag());
729  }
730  }
731  }
732  }
733 
734  item->SetFeatureObjDescFromFields();
736 }
737 
738 
740 (EDiagSev sv,
741  EErrType et,
742  const string& msg,
743  TBioseq sq)
744 {
745  if (x_IsSuppressed(et)) {
746  return;
747  }
748 
749  // Adjust severity
751  sv = eDiag_Error;
752  }
753 
754  if (GenerateGoldenFile()) {
755  m_ErrRepository->AddValidErrItem(sv, et, msg);
756  return;
757  }
758 
759  // Append bioseq label
760  string desc;
762  int version = 0;
763  const string& accession = GetAccessionFromBioseq(sq, &version);
764  // GetAccessionFromObjects(&sq, nullptr, *m_Scope, &version);
765  x_AddValidErrItem(sv, et, msg, desc, sq, accession, version);
766 }
767 
768 
770 (EDiagSev sv,
771  EErrType et,
772  const string& msg,
773  TSet st)
774 {
775  if (x_IsSuppressed(et)) {
776  return;
777  }
778 
779  // Adjust severity
781  sv = eDiag_Error;
782  }
783 
784  if (GenerateGoldenFile()) {
785  m_ErrRepository->AddValidErrItem(sv, et, msg);
786  return;
787  }
788 
789  // Append Bioseq_set label
790 
791  const auto isSetClass = st.IsSetClass();
792 
793  if (isSetClass && GetContext().PreprocessHugeFile) {
794  if (auto setClass = st.GetClass(); IsHugeSet(setClass)) {
795  string desc =
797  x_AddValidErrItem(sv, et, msg, desc, st, GetContext().HugeSetId, 0);
798  return;
799  }
800  }
801 
802  int version = 0;
803  const string& accession = GetAccessionFromBioseqSet(st, &version);
804  //string desc = CValidErrorFormat::GetBioseqSetLabel(st, m_SuppressContext);
805  string desc = CValidErrorFormat::GetBioseqSetLabel(accession,
806  isSetClass ? st.GetClass() : CBioseq_set::eClass_not_set,
807  isSetClass ? m_SuppressContext : true);
808  x_AddValidErrItem(sv, et, msg, desc, st, accession, version);
809 }
810 
811 
813 (EDiagSev sv,
814  EErrType et,
815  const string& msg,
816  TEntry ctx,
817  TDesc ds)
818 {
819  if (x_IsSuppressed(et)) {
820  return;
821  }
822 
823  // Adjust severity
825  sv = eDiag_Error;
826  }
827 
828  if (GenerateGoldenFile()) {
829  m_ErrRepository->AddValidErrItem(sv, et, msg);
830  return;
831  }
832 
833 
834  if (GetContext().PreprocessHugeFile &&
835  ctx.IsSet() && ctx.GetSet().IsSetClass()) {
836  if (auto setClass = ctx.GetSet().GetClass(); IsHugeSet(setClass)) {
837  string desc{"DESCRIPTOR: "};
838  desc += CValidErrorFormat::GetDescriptorContent(ds) + " ";
839  desc += "BIOSEQ-SET: ";
840  if (!m_SuppressContext) {
841  if (setClass == CBioseq_set::eClass_genbank) {
842  desc += "genbank: ";
843  }
844  else {
845  desc += "wgs-set: ";
846  }
847  }
848  desc += GetContext().HugeSetId;
849  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, GetContext().HugeSetId, 0);
850  return;
851  }
852  }
853 
854  // Append Descriptor label
856  int version = 0;
857  const string& accession = GetAccessionFromObjects(&ds, &ctx, *m_Scope, &version);
858  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, ctx, accession, version);
859 }
860 
861 
862 //void CValidError_imp::PostErr
863 //(EDiagSev sv,
864 // EErrType et,
865 // const string& msg,
866 // TBioseq sq,
867 // TDesc ds)
868 //{
869 // // Append Descriptor label
870 // string desc("DESCRIPTOR: ");
871 // ds.GetLabel(&desc, CSeqdesc::eBoth);
872 //
873 // s_AppendBioseqLabel(desc, sq, m_SuppressContext);
874 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
875 // //PostErr(sv, et, msg, sq);
876 //}
877 
878 
879 //void CValidError_imp::PostErr
880 //(EDiagSev sv,
881 // EErrType et,
882 // const string& msg,
883 // TSet st,
884 // TDesc ds)
885 //{
886 // // Append Descriptor label
887 // string desc = " DESCRIPTOR: ";
888 // ds.GetLabel(&desc, CSeqdesc::eBoth);
889 // s_AppendSetLabel(desc, st, m_SuppressContext);
890 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, *m_Scope);
891 //
892 //}
893 
894 
896 (EDiagSev sv,
897  EErrType et,
898  const string& msg,
899  TAnnot an)
900 {
901  if (x_IsSuppressed(et)) {
902  return;
903  }
904 
905  // Adjust severity
907  sv = eDiag_Error;
908  }
909 
910  if (GenerateGoldenFile()) {
911  m_ErrRepository->AddValidErrItem(sv, et, msg);
912  return;
913  }
914 
915  // Append Annotation label
916  string desc = "ANNOTATION: ";
917 
918  // !!! need to decide on the message
919 
920  int version = 0;
921  const string& accession = GetAccessionFromObjects(&an, nullptr, *m_Scope, &version);
922  x_AddValidErrItem(sv, et, msg, desc, an, accession, version);
923 }
924 
925 
927 (EDiagSev sv,
928  EErrType et,
929  const string& msg,
930  TGraph graph)
931 {
932 
933  if (x_IsSuppressed(et)) {
934  return;
935  }
936 
937  // Adjust severity
939  sv = eDiag_Error;
940  }
941 
942  if (GenerateGoldenFile()) {
943  m_ErrRepository->AddValidErrItem(sv, et, msg);
944  return;
945  }
946 
947  // Append Graph label
948  string desc = "GRAPH: ";
949  if (graph.IsSetTitle()) {
950  desc += graph.GetTitle();
951  } else {
952  desc += "<Unnamed>";
953  }
954  desc += " ";
955  graph.GetLoc().GetLabel(&desc);
956 
957  int version = 0;
958  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
959  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
960 }
961 
962 
964 (EDiagSev sv,
965  EErrType et,
966  const string& msg,
967  TBioseq sq,
968  TGraph graph)
969 {
970 
971  if (x_IsSuppressed(et)) {
972  return;
973  }
974 
975  // Adjust severity
977  sv = eDiag_Error;
978  }
979 
980  if (GenerateGoldenFile()) {
981  m_ErrRepository->AddValidErrItem(sv, et, msg);
982  return;
983  }
984 
985  // Append Graph label
986  string desc("GRAPH: ");
987  if ( graph.IsSetTitle() ) {
988  desc += graph.GetTitle();
989  } else {
990  desc += "<Unnamed>";
991  }
992  desc += " ";
993  graph.GetLoc().GetLabel(&desc);
995  int version = 0;
996  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
997  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
998 }
999 
1000 
1002 (EDiagSev sv,
1003  EErrType et,
1004  const string& msg,
1005  TAlign align)
1006 {
1007 
1008  if (x_IsSuppressed(et)) {
1009  return;
1010  }
1011 
1012  // Adjust severity
1013  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1014  sv = eDiag_Error;
1015  }
1016 
1017  if (GenerateGoldenFile()) {
1018  m_ErrRepository->AddValidErrItem(sv, et, msg);
1019  return;
1020  }
1021 
1023  if (id) {
1025  if (bsh) {
1026  PostErr(sv, et, msg, *(bsh.GetCompleteBioseq()));
1027  return;
1028  }
1029  }
1030 
1031  // Can't get bioseq for reporting, use other Alignment label
1032  string desc = "ALIGNMENT: ";
1033  if (align.IsSetType()) {
1034  desc += align.ENUM_METHOD_NAME(EType)()->FindName(align.GetType(), true);
1035  }
1036  try {
1037  CSeq_align::TDim dim = align.GetDim();
1038  desc += ", dim=" + NStr::NumericToString(dim);
1039  } catch ( const CUnassignedMember& ) {
1040  desc += ", dim=UNASSIGNED";
1041  }
1042 
1043  if (align.IsSetSegs()) {
1044  desc += " SEGS: ";
1045  desc += align.GetSegs().SelectionName(align.GetSegs().Which());
1046  }
1047 
1048  int version = 0;
1049  const string& accession = GetAccessionFromObjects(&align, nullptr, *m_Scope, &version);
1050  x_AddValidErrItem(sv, et, msg, desc, align, accession, version);
1051 }
1052 
1053 
1055 (EDiagSev sv,
1056  EErrType et,
1057  const string& msg,
1058  TEntry entry)
1059 {
1060  if (x_IsSuppressed(et)) {
1061  return;
1062  }
1063 
1064  // Adjust severity
1065  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1066  sv = eDiag_Error;
1067  }
1068 
1069  if (GenerateGoldenFile()) {
1070  m_ErrRepository->AddValidErrItem(sv, et, msg);
1071  return;
1072  }
1073 
1074  if (entry.IsSeq()) {
1075  PostErr(sv, et, msg, entry.GetSeq());
1076  } else if (entry.IsSet()) {
1077  PostErr(sv, et, msg, entry.GetSet());
1078  } else {
1079  string desc = "SEQ-ENTRY: ";
1080  entry.GetLabel(&desc, CSeq_entry::eContent);
1081 
1082  int version = 0;
1083  const string& accession = GetAccessionFromObjects(&entry, nullptr, *m_Scope, &version);
1084  x_AddValidErrItem(sv, et, msg, desc, entry, accession, version);
1085  }
1086 }
1087 
1088 
1090 (EDiagSev sv,
1091  EErrType et,
1092  const string& msg,
1093  const CBioSource& src)
1094 {
1095 
1096  if (x_IsSuppressed(et)) {
1097  return;
1098  }
1099 
1100  // Adjust severity
1101  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1102  sv = eDiag_Error;
1103  }
1104 
1105  if (GenerateGoldenFile()) {
1106  m_ErrRepository->AddValidErrItem(sv, et, msg);
1107  return;
1108  }
1109 
1110  string desc = "BioSource: ";
1111  x_AddValidErrItem(sv, et, msg, desc, src, "", 0);
1112 }
1113 
1114 
1116 (EDiagSev sv,
1117  EErrType et,
1118  const string& msg,
1119  const COrg_ref& org)
1120 {
1121 
1122  if (x_IsSuppressed(et)) {
1123  return;
1124  }
1125 
1126  // Adjust severity
1127  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1128  sv = eDiag_Error;
1129  }
1130 
1131  if (GenerateGoldenFile()) {
1132  m_ErrRepository->AddValidErrItem(sv, et, msg);
1133  return;
1134  }
1135 
1136  string desc = "Org-ref: ";
1137  x_AddValidErrItem(sv, et, msg, desc, org, "", 0);
1138 }
1139 
1140 
1142 (EDiagSev sv,
1143  EErrType et,
1144  const string& msg,
1145  const CPubdesc& pd)
1146 {
1147  if (x_IsSuppressed(et)) {
1148  return;
1149  }
1150 
1151  // Adjust severity
1152  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1153  sv = eDiag_Error;
1154  }
1155 
1156  if (GenerateGoldenFile()) {
1157  m_ErrRepository->AddValidErrItem(sv, et, msg);
1158  return;
1159  }
1160 
1161  string desc = "Pubdesc: ";
1162  x_AddValidErrItem(sv, et, msg, desc, pd, "", 0);
1163 }
1164 
1165 
1167 (EDiagSev sv,
1168  EErrType et,
1169  const string& msg,
1170  const CSeq_submit& ss)
1171 {
1172  if (x_IsSuppressed(et)) {
1173  return;
1174  }
1175 
1176  // Adjust severity
1177  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1178  sv = eDiag_Error;
1179  }
1180 
1181  if (GenerateGoldenFile()) {
1182  m_ErrRepository->AddValidErrItem(sv, et, msg);
1183  return;
1184  }
1185 
1186  string desc = "Seq-submit: ";
1187  x_AddValidErrItem(sv, et, msg, desc, ss, "", 0);
1188 }
1189 
1190 
1192  EDiagSev sev,
1193  EErrType type,
1194  const string& msg,
1195  const string& desc,
1196  const CSerialObject& obj,
1197  const string& accession,
1198  const int version)
1199 {
1200  if (IsHugeFileMode()) {
1201  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, accession, version);
1202  return;
1203  }
1204  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, obj, accession, version);
1205 }
1206 
1207 
1209 (EDiagSev sv,
1210  EErrType et,
1211  const string& msg,
1212  const CSerialObject& obj,
1213  const CSeq_entry *ctx)
1214 {
1215  if (!ctx) {
1216  PostErr (sv, et, msg, obj);
1217  } else if (obj.GetThisTypeInfo() == CSeqdesc::GetTypeInfo()) {
1218  PostErr(sv, et, msg, *ctx, *(dynamic_cast <const CSeqdesc*> (&obj)));
1219  } else {
1220  PostErr(sv, et, msg, obj);
1221  }
1222 
1223 }
1224 
1225 
1227 (EDiagSev sv,
1228  const string& msg,
1229  int flags,
1230  const CSerialObject& obj,
1231  const CSeq_entry *ctx)
1232 {
1233  string reasons = GetDateErrorDescription(flags);
1234 
1235  NStr::TruncateSpacesInPlace (reasons);
1236  reasons = msg + " - " + reasons;
1237 
1238  PostObjErr (sv, eErr_GENERIC_BadDate, reasons, obj, ctx);
1239 }
1240 
1241 
1243 (const CSeq_entry& se,
1244  const CCit_sub* cs,
1245  CScope* scope)
1246 {
1247  CSeq_entry_Handle seh;
1248  try {
1249  seh = scope->GetSeq_entryHandle(se);
1250  } catch (const CException& ) { ; }
1251  if (! seh) {
1252  seh = scope->AddTopLevelSeqEntry(se);
1253  if (!seh) {
1254  return false;
1255  }
1256  }
1257 
1258  return Validate(seh, cs);
1259 }
1260 
1261 // True only when the Org-ref has a division set and that division equals "PHG".
1262 static bool s_IsPhage(const COrg_ref& org)
1263 {
1264  return org.IsSetDivision() && NStr::Equal(org.GetDivision(), "PHG");
1265 }
1269 
1270 
1272 {
1273  bool has_mult = false;
1274  int first_id = 0;
1275  int phage_id = 0;
1276 
1277  for (CBioseq_CI bi(seh); bi; ++bi) {
1278  for (CSeqdesc_CI desc_ci(*bi, CSeqdesc::e_Source);
1279  desc_ci && !has_mult;
1280  ++desc_ci) {
1281  if (desc_ci->GetSource().IsSetOrg()) {
1282  const COrg_ref& org = desc_ci->GetSource().GetOrg();
1283  if (org.IsSetDb()) {
1284  ITERATE(COrg_ref::TDb, it, org.GetDb()) {
1285  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "taxon") &&
1286  (*it)->IsSetTag() && (*it)->GetTag().IsId()) {
1287  int this_id = (*it)->GetTag().GetId();
1288  if (this_id > 0) {
1289  if (s_IsPhage(org)) {
1290  phage_id = this_id;
1291  } else if (first_id == 0) {
1292  first_id = this_id;
1293  } else if (first_id != this_id) {
1294  has_mult = true;
1295  }
1296  }
1297  }
1298  }
1299  }
1300  }
1301  }
1302  }
1303  if (has_mult || (phage_id > 0 && first_id > 0)) {
1305  "There are multiple taxonIDs in this RefSeq record.",
1306  *m_TSE);
1307  }
1308 }
1309 
1310 
1312 {
1313  return *m_pEntryInfo;
1314 }
1315 
1316 
1318 {
1319  if (!m_pEntryInfo) {
1320  m_pEntryInfo.reset(new CValidatorEntryInfo());
1321  }
1322 
1323  return *m_pEntryInfo;
1324 }
1325 
1326 
1328 (const CSeq_entry_Handle& seh,
1329  const CCit_sub* cs)
1330 {
1331  _ASSERT(seh);
1332 
1333  if ( m_PrgCallback ) {
1335  if ( m_PrgCallback(&m_PrgInfo) ) {
1336  return false;
1337  }
1338  }
1339 
1340  // Check that CSeq_entry has data
1341  if (seh.Which() == CSeq_entry::e_not_set) {
1342  ERR_POST_X(2, Warning << "Seq_entry not set");
1343  return false;
1344  }
1345 
1346  Setup(seh);
1347 
1348  // Seq-submit has a submission citation
1349  if (cs) {
1350  x_SetEntryInfo().SetNoPubs(false);
1352  }
1353 
1354  // Get first CBioseq object pointer for PostErr below.
1356  if (!seq) {
1358  "No Bioseqs in this entire record.", seh.GetCompleteSeq_entry()->GetSet());
1359  return true;
1360  }
1361 
1362  // If m_NonASCII is true, then this flag was set by the caller
1363  // of validate to indicate that a non ascii character had been
1364  // read from a file being used to create a CSeq_entry, that the
1365  // error had been corrected, but that the error needs to be reported
1366  // by Validate. Note, Validate is not doing anything other than
1367  // reporting an error if m_NonASCII is true;
1368  if (m_NonASCII) {
1370  "Non-ascii chars in input ASN.1 strings", *seq);
1371  // Only report the error once
1372  m_NonASCII = false;
1373  }
1374 
1375  // Iterate thru components of record and validate each
1376 
1377  // also want to know if we have gi
1378  bool has_gi = false;
1379  // also want to know if there are any nucleotide sequences
1380  bool has_nucleotide_sequence = false;
1381 
1383  bi && (!IsINSDInSep() || !has_gi || !has_nucleotide_sequence);
1384  ++bi) {
1385  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1386  if ((*it)->IsGi()) {
1387  has_gi = true;
1388  }
1389  }
1390  if (bi->IsSetInst_Mol() && bi->IsNa()) {
1391  has_nucleotide_sequence = true;
1392  }
1393  }
1394 
1395  if (IsINSDInSep() && m_pEntryInfo->IsRefSeq()) {
1396  // NOTE: We use m_IsRefSeq to indicate the actual presence of RefSeq IDs in
1397  // the record, rather than IsRefSeq(), which indicates *either* RefSeq IDs are
1398  // present *OR* the refseq flag has been used
1400  "INSD and RefSeq records should not be present in the same set", *m_TSE);
1401  }
1402 
1403 #if 0
1404  // disabled for now
1405  // look for long IDs that would collide if truncated at 30 characters
1406  vector<string> id_strings;
1408  bi;
1409  ++bi) {
1410  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1411  if (!IsNCBIFILESeqId(**it)) {
1412  string label;
1413  (*it)->GetLabel(&label);
1414  id_strings.push_back(label);
1415  }
1416  }
1417  }
1418  stable_sort (id_strings.begin(), id_strings.end());
1419  for (vector<string>::iterator id_str_it = id_strings.begin();
1420  id_str_it != id_strings.end();
1421  ++id_str_it) {
1422  string pattern = (*id_str_it).substr(0, 30);
1423  string first_id = *id_str_it;
1424  vector<string>::iterator cmp_it = id_str_it;
1425  ++cmp_it;
1426  while (cmp_it != id_strings.end() && NStr::StartsWith(*cmp_it, pattern)) {
1427  CRef<CSeq_id> id(new CSeq_id(*cmp_it));
1430  "First 30 characters of " + first_id + " and " +
1431  *cmp_it + " are identical", *(bsh.GetCompleteBioseq()));
1432  ++id_str_it;
1433  ++cmp_it;
1434  }
1435  }
1436 #endif
1437 
1438  // look for colliding feature IDs
1439  vector < int > feature_ids;
1440  for (CFeat_CI fi(GetTSEH()); fi; ++fi) {
1441  const CSeq_feat& sf = fi->GetOriginalFeature();
1442  if (sf.IsSetId() && sf.GetId().IsLocal() && sf.GetId().GetLocal().IsId()) {
1443  feature_ids.push_back(sf.GetId().GetLocal().GetId());
1444  }
1445  }
1446 
1447  if (feature_ids.size() > 0) {
1448  const CTSE_Handle& tse = seh.GetTSE_Handle ();
1449  stable_sort (feature_ids.begin(), feature_ids.end());
1450  vector <int>::iterator it = feature_ids.begin();
1451  int id = *it;
1452  ++it;
1453  while (it != feature_ids.end()) {
1454  if (*it == id) {
1455  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, id);
1456  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1458  "Colliding feature ID " + NStr::NumericToString (id), *(feat_it->GetSeq_feat()));
1459  }
1460  while (it != feature_ids.end() && *it == id) {
1461  ++it;
1462  }
1463  if (it != feature_ids.end()) {
1464  id = *it;
1465  ++it;
1466  }
1467  } else {
1468  id = *it;
1469  ++it;
1470  }
1471  }
1472  }
1473 
1474  // look for mixed gps and non-gps sets
1475  bool has_nongps = false;
1476  bool has_gps = false;
1477 
1478  for (CTypeConstIterator<CBioseq_set> si(*m_TSE); si && (!has_nongps || !has_gps); ++si) {
1479  if (si->IsSetClass()) {
1480  if (si->GetClass() == CBioseq_set::eClass_mut_set
1481  || si->GetClass() == CBioseq_set::eClass_pop_set
1482  || si->GetClass() == CBioseq_set::eClass_phy_set
1483  || si->GetClass() == CBioseq_set::eClass_eco_set
1484  || si->GetClass() == CBioseq_set::eClass_wgs_set
1485  || si->GetClass() == CBioseq_set::eClass_small_genome_set) {
1486  has_nongps = true;
1487  } else if (si->GetClass() == CBioseq_set::eClass_gen_prod_set) {
1488  has_gps = true;
1489  }
1490  }
1491  }
1492 
1493  if (has_nongps && has_gps) {
1495  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set",
1496  *m_TSE);
1497  }
1498 
1499  // Count inference accessions. If there are too many, inference checking used to be
1500  // disabled temporarily; now it is disabled for the rest of this validator run.
1501  bool old_inference_acc_check = m_ValidateInferenceAccessions;
1502  if (! m_IgnoreInferences) {
1503  CFeat_CI feat_inf(seh);
1504  while (feat_inf && ! m_IgnoreInferences) {
1505  FOR_EACH_GBQUAL_ON_FEATURE (qual, *feat_inf) {
1506  if (! m_IgnoreInferences && (*qual)->IsSetQual() && (*qual)->IsSetVal() && NStr::Equal((*qual)->GetQual(), "inference")) {
1508  if (m_CumulativeInferenceCount >= 1000) {
1509  // disable inference checking for remainder of run
1510  m_IgnoreInferences = true;
1511 
1512  // warn about too many inferences
1514  "Skipping validation of remaining /inference qualifiers",
1515  *m_TSE);
1516  }
1517 
1519  string prefix, remainder;
1520  bool same_species;
1521  size_t num_accessions = 0;
1522  vector<string> accessions = CValidError_feat::GetAccessionsFromInferenceString ((*qual)->GetVal(), prefix, remainder, same_species);
1523  for (size_t i = 0; i < accessions.size(); i++) {
1524  NStr::TruncateSpacesInPlace (accessions[i]);
1525  string acc_prefix, accession;
1526  if (CValidError_feat::GetPrefixAndAccessionFromInferenceAccession (accessions[i], acc_prefix, accession)) {
1527  num_accessions++;
1529  }
1530  }
1531  if (num_accessions > 0) {
1532  m_CumulativeInferenceCount += num_accessions;
1533  if (m_CumulativeInferenceCount >= 1000) {
1534  // disable inference checking for remainder of run
1535  m_IgnoreInferences = true;
1536 
1537  // warn about too many inferences
1539  "Skipping validation of remaining /inference qualifiers",
1540  *m_TSE);
1541  }
1542  }
1543  }
1544  }
1545  }
1546  ++feat_inf;
1547  }
1548  }
1549 
1550  // validate the main data
1551  if (seh.IsSeq()) {
1552  const CBioseq& seq2 = seh.GetCompleteSeq_entry()->GetSeq();
1553  CValidError_bioseq bioseq_validator(*this);
1554  try {
1555  bioseq_validator.ValidateBioseq(seq2);
1556  } catch ( const exception& e ) {
1558  string("Exception while validating bioseq. EXCEPTION: ") +
1559  e.what(), seq2);
1560  return true;
1561  }
1562  } else if (seh.IsSet()) {
1563  const CBioseq_set& set = seh.GetCompleteSeq_entry()->GetSet();
1564  CValidError_bioseqset bioseqset_validator(*this);
1565 
1566  try {
1567  bioseqset_validator.ValidateBioseqSet(set);
1568 
1569  } catch ( const exception& e ) {
1571  string("Exception while validating bioseq set. EXCEPTION: ") +
1572  e.what(), set);
1573  return true;
1574  }
1575  }
1576 
1577  // put flag for validating inference accessions back to original value
1578  m_ValidateInferenceAccessions = old_inference_acc_check;
1579 
1580  // validation from data collected during previous step
1581 
1582  if (!GetContext().PreprocessHugeFile) {
1583  if ( m_NumTpaWithHistory > 0 &&
1584  m_NumTpaWithoutHistory > 0 ) {
1586  "There are " +
1588  " TPAs with history and " +
1590  " without history in this record.", *seq);
1591  }
1592  if ( m_NumTpaWithoutHistory > 0 && has_gi) {
1594  "There are " +
1596  " TPAs without history in this record, but the record has a gi number assignment.", *m_TSE);
1597  }
1598  }
1599 
1600  if (IsIndexerVersion() && DoesAnyProteinHaveGeneralID() && !IsRefSeq() && has_nucleotide_sequence) {
1601  call_once(SetContext().ProteinHaveGeneralIDOnceFlag,
1602  [](CValidError_imp* imp, CSeq_entry_Handle seh2) {
1604  "INDEXER_ONLY - Protein bioseqs have general seq-id.",
1605  *(seh2.GetCompleteSeq_entry()));
1606  }, this, seh);
1607  }
1608 
1609  ReportMissingPubs(*m_TSE, cs);
1611 
1612  if (m_NumMisplacedFeatures > 1) {
1614  "There are " + NStr::SizetToString (m_NumMisplacedFeatures) + " mispackaged features in this record.",
1615  *(seh.GetCompleteSeq_entry()));
1616  } else if (m_NumMisplacedFeatures == 1) {
1618  "There is 1 mispackaged feature in this record.",
1619  *(seh.GetCompleteSeq_entry()));
1620  }
1621  if (m_NumSmallGenomeSetMisplaced > 1) {
1623  "There are " + NStr::SizetToString (m_NumSmallGenomeSetMisplaced) + " mispackaged features in this small genome set record.",
1624  *(seh.GetCompleteSeq_entry()));
1625  } else if (m_NumSmallGenomeSetMisplaced == 1) {
1627  "There is 1 mispackaged feature in this small genome set record.",
1628  *(seh.GetCompleteSeq_entry()));
1629  }
1630  if ( !GetContext().PreprocessHugeFile ) {
1631  if ( m_NumGenes == 0 && m_NumGeneXrefs > 0 ) {
1633  "There are " + NStr::SizetToString(m_NumGeneXrefs) +
1634  " gene xrefs and no gene features in this record.", *m_TSE);
1635  }
1636  }
1637  ValidateCitations (seh);
1638 
1639 
1640  if ( m_NumMisplacedGraphs > 0 ) {
1643  string("There ") + ((m_NumMisplacedGraphs > 1) ? "are " : "is ") + num +
1644  " mispackaged graph" + ((m_NumMisplacedGraphs > 1) ? "s" : "") + " in this record.",
1645  *m_TSE);
1646  }
1647 
1648  if ( IsRefSeq() && ! IsWP() ) {
1650  }
1651 
1652 
1655  if (!GetContext().PreprocessHugeFile) {
1657  }
1658 
1659  if (m_FarFetchFailure) {
1661  "Far fetch failures caused some validator tests to be bypassed",
1662  *m_TSE);
1663  }
1664 
1665  if (m_DoTaxLookup) {
1667  }
1668 
1669  // validate cit-sub
1670  if (cs) {
1672  }
1673 
1674  // optional barcode tests
1675  if (m_DoBarcodeTests) {
1676  x_DoBarcodeTests(seh);
1677  }
1678  return true;
1679 }
1680 
1681 
// Checks a submit block (`block`) belonging to the Seq-submit `ss`: the release date,
// the contact person's affiliation and name, and the author names of the cit-sub.
// NOTE(review): the signature line (listing line 1682) is missing from this extract; the call
// ValidateSubmitBlock(ss.GetSub(), ss) elsewhere in this file suggests this is that function -
// confirm against the repository copy.
1683 {
// Hold-until-published flag is set and the release date already lies in the past.
1684  if (block.IsSetHup() && block.GetHup() && block.IsSetReldate() &&
1685  IsDateInPast(block.GetReldate())) {
// NOTE(review): listing line 1686 (the call that receives this message) is missing here.
1687  "Record release date has already passed", ss);
1688  }
1689
// Contact person: validate a structured affiliation, then the first/last name.
1690  if (block.IsSetContact() && block.GetContact().IsSetContact()) {
1691  const CAuthor& author = block.GetContact().GetContact();
1692  if (author.IsSetAffil() && author.GetAffil().IsStd()) {
1693  ValidateAffil(author.GetAffil().GetStd(), ss, nullptr);
1694  }
1695  const CPerson_id& pid = author.GetName();
1696  if (pid.IsName()) {
1697  const CName_std& nstd = pid.GetName();
1698  string first = "";
1699  string last = "";
1700  if (nstd.IsSetLast()) {
1701  last = nstd.GetLast();
// NOTE(review): listing lines 1702-1703 (the test that opens this brace and the error call) are missing.
1704  "Bad last name '" + last + "'", ss);
1705  }
1706  }
1707  if (nstd.IsSetFirst()) {
1708  first = nstd.GetFirst();
// NOTE(review): listing lines 1709-1710 are missing (same shape as the last-name check above).
1711  "Bad first name '" + first + "'", ss);
1712  }
1713  }
// Catches the literal placeholder pair "first"/"last" (case-insensitive).
1714  if (first != "" && last != "" && NStr::EqualNocase(last, "last") && NStr::EqualNocase(first, "first")) {
1716  "Bad first and last name", ss);
1717  }
1718  }
1719  }
// Cit-sub authors: the same three name checks, applied to every std-form author.
1720  if (block.IsSetCit()) {
1721  const CCit_sub& sub = block.GetCit();
1722  if (sub.IsSetAuthors()) {
1723  const CAuth_list& auth_list = sub.GetAuthors();
1724  const CAuth_list::TNames& names = auth_list.GetNames();
1725  if (names.IsStd()) {
1726  ITERATE ( CAuth_list::C_Names::TStd, name, names.GetStd() ) {
1727  if ( (*name)->GetName().IsName() ) {
1728  const CName_std& nstd = (*name)->GetName().GetName();
1729  string first = "";
1730  string last = "";
1731  if (nstd.IsSetLast()) {
1732  last = nstd.GetLast();
// NOTE(review): listing lines 1733-1734 are missing here.
1735  "Bad last name '" + last + "'", ss);
1736  }
1737  }
1738  if (nstd.IsSetFirst()) {
1739  first = nstd.GetFirst();
// NOTE(review): listing lines 1740-1741 are missing here.
1742  "Bad first name '" + first + "'", ss);
1743  }
1744  }
1745  if (first != "" && last != "" && NStr::EqualNocase(last, "last") && NStr::EqualNocase(first, "first")) {
1747  "Bad first and last name", ss);
1748  }
1749  }
1750  }
1751  }
1752  }
1753  }
1754 }
1755 
1756 
// Validates a Seq-submit `ss` using `scope`: checks the submit block, flags wgs-set entries,
// and validates every contained Seq-entry.
// NOTE(review): the first signature line (listing line 1757) is missing from this extract.
1758  const CSeq_submit& ss, CScope* scope)
1759 {
1760  // Check that ss is type e_Entrys
1761  if ( ss.GetData().Which() != CSeq_submit::C_Data::e_Entrys ) {
1762  return;
1763  }
1764
// In huge-file mode the submit block is validated only once, guarded by a once-flag on the context.
1766  if (ss.IsSetSub()) {
1767  if (IsHugeFileMode()) {
1768  call_once(SetContext().SubmitBlockOnceFlag,
1769  [this, &ss](){ ValidateSubmitBlock(ss.GetSub(), ss); });
1770  }
1771  else {
1772  ValidateSubmitBlock(ss.GetSub(), ss);
1773  }
1774  }
1775
1776  // Get CCit_sub pointer
// NOTE(review): this dereferences ss.GetSub().GetCit() without the IsSetSub() guard used
// directly above and below - confirm the accessors are safe when the fields are unset.
1777  const CCit_sub* cs = &ss.GetSub().GetCit();
1778
1779  if (ss.IsSetSub() && ss.GetSub().IsSetTool() && NStr::StartsWith(ss.GetSub().GetTool(), "Geneious")) {
// NOTE(review): listing line 1780 (the body of this branch) is missing.
1781  }
1782
1783  // Just loop thru CSeq_entrys
1784  FOR_EACH_SEQENTRY_ON_SEQSUBMIT (se_itr, ss) {
1785  const CSeq_entry& se = **se_itr;
1786  if(se.IsSet())
1787  {
1788  const CBioseq_set &set = se.GetSet();
1789  if(set.IsSetClass() &&
1790  set.GetClass() == CBioseq_set::eClass_wgs_set)
1791  {
// NOTE(review): listing line 1792 is missing; the `} else {` below implies an `if` opened here.
// The first branch reports the wgs-set message once per context, the else branch on every entry.
1793  CSeq_entry_Handle seh;
1794  seh = scope->GetSeq_entryHandle(se);
1795  Setup(seh);
1796  call_once(SetContext().WgsSetInSeqSubmitOnceFlag,
1797  [this, seh]() {
1799  "File was created as a wgs-set, but should be a batch submission instead.",
1800  seh.GetCompleteSeq_entry()->GetSet());
1801  });
1802  } else {
1803  CSeq_entry_Handle seh;
1804  seh = scope->GetSeq_entryHandle(se);
1805  Setup(seh);
1807  "File was created as a wgs-set, but should be a batch submission instead.",
1808  seh.GetCompleteSeq_entry()->GetSet());
1809  }
1810  }
1811  }
// Every entry is validated with the cit-sub pointer obtained above.
1812  Validate (se, cs, scope);
1813  }
1814 }
1815 
1816 
// Validates a standalone Seq-annot handle `sah`: runs the annot-level validator, then the
// per-item validator matching the annot's content type.
// NOTE(review): the first signature line (listing line 1817) and the three `case` labels
// (listing lines 1828, 1838, 1851) are missing from this extract; the bodies below indicate
// feature-table, alignment and graph branches respectively.
1818  const CSeq_annot_Handle& sah)
1819 {
1820  Setup(sah);
1821
1822  // Iterate thru components of record and validate each
1823
1824  CValidError_annot annot_validator(*this);
1825  annot_validator.ValidateSeqAnnot(sah);
1826
1827  switch (sah.Which()) {
1829  {
// Features: validate the original (unmapped) feature object of each item.
1830  CValidError_feat feat_validator(*this);
1831  for (CFeat_CI fi (sah); fi; ++fi) {
1832  const CSeq_feat& sf = fi->GetOriginalFeature();
1833  feat_validator.ValidateSeqFeat(sf);
1834  }
1835  }
1836  break;
1837
1839  {
// Alignments are validated only when enabled; `order` is a 1-based running index
// passed to the alignment validator.
1840  if (IsValidateAlignments()) {
1841  CValidError_align align_validator(*this);
1842  int order = 1;
1843  for (CAlign_CI ai(sah); ai; ++ai) {
1844  const CSeq_align& sa = ai.GetOriginalSeq_align();
1845  align_validator.ValidateSeqAlign(sa, order++);
1846  }
1847  }
1848  }
1849  break;
1850
1852  {
1853  CValidError_graph graph_validator(*this);
1854  // for (CTypeConstIterator <CSeq_graph> gi (sa); gi; ++gi) {
1855  for (CGraph_CI gi(sah); gi; ++gi) {
1856  const CSeq_graph& sg = gi->GetOriginalGraph();
1857  graph_validator.ValidateSeqGraph(sg);
1858  }
1859  }
1860  break;
1861  default:
1862  break;
1863  }
// NOTE(review): listing lines 1864-1866 are missing before the closing brace.
1867 }
1868 
1869 
// Validates a single Seq-feat outside of any Seq-entry context.
// `scope` may be null: the current m_Scope is then used, or a fresh scope is created.
1870 void CValidError_imp::Validate(const CSeq_feat& feat, CScope* scope)
1871 {
1872  // automatically restores m_Scope to its old value when we leave
1873  // the function
1874  CScopeRestorer scopeRestorer( m_Scope );
1875
1876  if( scope ) {
1877  m_Scope.Reset(scope);
1878  }
1879  if (!m_Scope) {
1880  // set up a temporary local scope if there is no scope set already
1881  m_Scope.Reset(new CScope(*m_ObjMgr));
1882  }
1883
1884  CValidError_feat feat_validator(*this);
1885  feat_validator.SetScope(*m_Scope);
// NOTE(review): listing line 1886 (the declaration of `empty`) is missing from this extract.
1887  feat_validator.SetTSE(empty);
1888  feat_validator.ValidateSeqFeat(feat);
1889  if (feat.IsSetData() && feat.GetData().IsBiosrc()) {
1890  const CBioSource& src = feat.GetData().GetBiosrc();
1891  if (src.IsSetOrg()) {
// NOTE(review): listing line 1892 (the action taken for a source with an org-ref) is missing.
1893  }
1894  }
1895  FindEmbeddedScript(feat);
1896  FindNonAsciiText(feat);
// NOTE(review): listing line 1897 is missing before the closing brace.
1898 }
1899 
1900 
// Validates a standalone BioSource `src`, using `scope` when it is non-null.
// NOTE(review): the signature line (listing line 1901) is missing from this extract; the
// parameter names `src` and `scope` are taken from the body below.
1902 {
1903  // automatically restores m_Scope to its old value when we leave
1904  // the function
1905  CScopeRestorer scopeRestorer( m_Scope );
1906
1907  if( scope ) {
1908  m_Scope.Reset(scope);
1909  }
1910  if (!m_Scope) {
1911  // set up a temporary local scope if there is no scope set already
1912  m_Scope.Reset(new CScope(*m_ObjMgr));
1913  }
1914
// The source object serves both as the item being validated and as the reporting object.
1915  ValidateBioSource(src, src);
1916  if (src.IsSetOrg()) {
// NOTE(review): listing line 1917 (the action taken for a source with an org-ref) is missing.
1918  }
1919  FindEmbeddedScript(src);
1920  FindNonAsciiText(src);
// NOTE(review): listing line 1921 is missing before the closing brace.
1922 }
1923 
1924 
1925 void CValidError_imp::Validate(const CPubdesc& pubdesc, CScope* scope)
1926 {
1927  // automatically restores m_Scope to its old value when we leave
1928  // the function
1929  CScopeRestorer scopeRestorer( m_Scope );
1930 
1931  if( scope ) {
1932  m_Scope.Reset(scope);
1933  }
1934  if (!m_Scope) {
1935  // set up a temporary local scope if there is no scope set already
1936  m_Scope.Reset(new CScope(*m_ObjMgr));
1937  }
1938 
1939  ValidatePubdesc(pubdesc, pubdesc);
1940  FindEmbeddedScript(pubdesc);
1941  FindNonAsciiText(pubdesc);
1942  FindCollidingSerialNumbers(pubdesc);
1943 }
1944 
// Validates a single Seqdesc `desc` in the context `ctx` via CValidError_desc.
// NOTE(review): the signature line (listing line 1945) and listing line 1949 are missing from
// this extract.
// NOTE(review): the visible lines replace m_Scope unconditionally and show no CScopeRestorer,
// unlike the other standalone Validate overloads in this file - confirm this is intended.
1946 {
1947  CValidError_desc seqdesc_validator(*this);
1948  m_Scope.Reset(new CScope(*m_ObjMgr));
1950  seqdesc_validator.ValidateSeqDesc(desc,ctx);
1951 }
1952 
1953 
// Stores a progress callback and the opaque user pointer that accompanies it.
// NOTE(review): the first signature lines (listing lines 1954-1955) are missing from this
// extract; the function name is presumably SetProgressCallback - confirm.
1956  void* user_data)
1957 {
1958  m_PrgCallback = callback;
1959  m_PrgInfo.m_UserData = user_data;
1960 }
1961 
1962 
// Validates one db_xref (`xref`) attached to `obj`; `biosource` tells whether the xref sits on
// a source/org-ref, and `ctx` is passed along with every report.
// NOTE(review): this extract is missing many lines of the function (the return type and name
// on listing line 1963, the call that computes `flags` on line 1970, and most of the `if`
// conditions and error calls). The name ValidateDbxref is taken from the call
// ValidateDbxref(**xref, obj, biosource, ctx) elsewhere in this file. The comments below
// describe only what the visible lines show.
1964 (const CDbtag& xref,
1965  const CSerialObject& obj,
1966  bool biosource,
1967  const CSeq_entry *ctx)
1968 {
1969  bool refseq_or_gps = IsRefSeq() || IsGPS();
1971  refseq_or_gps);
1972
1973  const string& db = xref.IsSetDb() ? xref.GetDb() : kEmptyStr;
1974
// Reports on the tag value (SGML, embedded space) and on the database name (SGML).
1977  "dbxref value " + xref.GetTag().GetStr() + " has SGML",
1978  obj, ctx);
1979  }
1982  "dbxref value " + xref.GetTag().GetStr() + " contains space character",
1983  obj, ctx);
1984  }
1985  if (flags & CValidator::eDbHasSgml) {
1987  "dbxref database " + db + " has SGML",
1988  obj, ctx);
1989  }
1990
// Render the tag as text for the messages below; `isStr` records whether it was a string tag.
1991  bool isStr = false;
1992  string dbv;
1993  if (xref.IsSetTag() && xref.GetTag().IsStr()) {
1994  dbv = xref.GetTag().GetStr();
1995  isStr = true;
1996  } else if (xref.IsSetTag() && xref.GetTag().IsId()) {
1997  dbv = NStr::NumericToString(xref.GetTag().GetId());
1998  }
1999
2002  "Illegal db_xref type " + db + " (" + dbv + ")", obj, ctx);
2003  }
2005  // capitalization is bad
2006  bool refseq_db = false, src_db = false;
2007  string correct_caps;
2008  xref.GetDBFlags(refseq_db, src_db, correct_caps);
2009  string message = "Illegal db_xref type " + db + " (" + dbv + "), legal capitalization is " + correct_caps;
2011  message += ", but should not be used on an OrgRef";
2012  } else if (flags & CValidator::eOnlyForSource) {
2013  message += ", but should only be used on an OrgRef";
2014  }
2015
2017  } else {
// Database name is recognised with correct capitalization: report context mismatches
// (RefSeq-only, not-for-source, only-for-source).
2021  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on a non-RefSeq OrgRef",
2022  obj, ctx);
2023  } else {
2025  "db_xref type " + db + " (" + dbv + ") is only legal for RefSeq",
2026  obj, ctx);
2027  }
2028  } else if (flags & CValidator::eNotForSource) {
2031  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
2032  obj, ctx);
2033  } else {
2035  "db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
2036  obj, ctx);
2037  }
2038  } else if (flags & CValidator::eOnlyForSource) {
2040  "db_xref type " + db + " (" + dbv + ") should only be used on an OrgRef",
2041  obj, ctx);
2042  }
2043  }
2044
// A GeneID xref whose tag is stored as a string rather than an integer is reported.
2045  if (isStr && db == "GeneID") {
2047  "db_xref type " + db + " (" + dbv + ") is required to be an integer",
2048  obj, ctx);
2049  }
2050 }
2051 
2052 
// Validates every db_xref in `xref_list`. When `biosource` is true it also reports a database
// name that repeats (case-insensitively) the name of the immediately preceding xref.
// NOTE(review): the return type and name (listing line 2053) and the error call on listing
// line 2066 are missing from this extract.
2054 (TDbtags& xref_list,
2055  const CSerialObject& obj,
2056  bool biosource,
2057  const CSeq_entry *ctx)
2058 {
// Database name of the previous xref that had one set; only adjacent repeats are detected.
2059  string last_db;
2060
2061  ITERATE( TDbtags, xref, xref_list) {
2062  if (biosource
2063  && (*xref)->IsSetDb()) {
2064  if (!NStr::IsBlank(last_db)
2065  && NStr::EqualNocase((*xref)->GetDb(), last_db)) {
2067  "BioSource uses db " + last_db + " multiple times",
2068  obj, ctx);
2069  }
2070  last_db = (*xref)->GetDb();
2071  }
2072  ValidateDbxref(**xref, obj, biosource, ctx);
2073  }
2074 }
2075 
2076 
// Checks each interval of a packed-int location, accumulating the result in `lc`.
// NOTE(review): the return type and name (listing line 2077) and listing line 2086 are missing
// from this extract; the name x_CheckPackedInt is taken from the call
// x_CheckPackedInt(loc.GetPacked_int(), lc, obj) elsewhere in this file.
2078 (const CPacked_seqint& packed_int,
2079  SLocCheck& lc,
2080  const CSerialObject& obj)
2081 {
2082  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
2083  lc.int_cur = (*it);
// `&=` keeps lc.chk false once any interval has failed the check.
2084  lc.chk &= x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur);
2085
2087
// The current interval becomes the "previous" one for the next iteration.
2088  lc.id_prv = lc.id_cur;
2089  lc.strand_prv = lc.strand_cur;
2090  lc.int_prv = lc.int_cur;
2091  }
2092 }
2093 
2094 
// Extracts strand and id from `int_cur` into the output parameters `strand_cur` and `id_cur`
// and returns the result of IsValid() for the interval in m_Scope.
// A missing strand is reported as eNa_strand_unknown.
// NOTE(review): the first signature line (listing line 2095) is missing from this extract; the
// name x_CheckSeqInt is taken from the calls x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur)
// elsewhere in this file.
2096  CConstRef<CSeq_id>& id_cur,
2097  const CSeq_interval* int_cur,
2098  ENa_strand& strand_cur)
2099 {
2100  strand_cur = int_cur->IsSetStrand() ?
2101  int_cur->GetStrand() : eNa_strand_unknown;
2102  id_cur = &int_cur->GetId();
2103  bool chk = IsValid(*int_cur, m_Scope);
2104  return chk;
2105 }
2106 
2107 
// Applies the per-interval fuzz check to every interval of `packed_int`, reporting on `obj`.
// NOTE(review): the signature line (listing line 2108) is missing from this extract; the name
// x_ReportInvalidFuzz is taken from the call x_ReportInvalidFuzz(lit->GetPacked_int(), obj)
// elsewhere in this file.
2109 {
2110  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
2111  x_ReportInvalidFuzz(**it, obj);
2112  }
2113 }
2114 
2115 
// Message texts for fuzz ("space to left/right", "circle to left/right") problems.
// NOTE(review): the lines that use these constants are missing from this extract.
2116 static const string kSpaceLeftFirst = "Should not specify 'space to left' at first position of non-circular sequence";
2117 static const string kSpaceRightLast = "Should not specify 'space to right' at last position of non-circular sequence";
2118
// NOTE(review): these two are named kSpace*Circle although their text is about 'circle to
// left/right'; the names are kept because they may be referenced elsewhere.
2119 static const string kSpaceLeftCircle = "Should not specify 'circle to left' except at first position of circular sequence";
2120 static const string kSpaceRightCircle = "Should not specify 'circle to right' except at last position of circular sequence";
2121 
// Checks the limit-type fuzz on both ends of a Seq-interval (`interval`) and reports problems
// on `obj`.
// NOTE(review): the signature line (listing line 2122), the declarations of `fuzz_from`,
// `fuzz_to` (lines 2124-2125) and `top` (line 2165), and every error call are missing from
// this extract.
2123 {
2126  bool has_fuzz_from = false;
2127  bool has_fuzz_to = false;
2128
// Only fuzz of the "lim" kind is considered.
2129  if (interval.IsSetFuzz_from() && interval.GetFuzz_from().IsLim()) {
2130  fuzz_from = interval.GetFuzz_from().GetLim();
2131  has_fuzz_from = true;
2132  }
2133  if (interval.IsSetFuzz_to() && interval.GetFuzz_to().IsLim()) {
2134  fuzz_to = interval.GetFuzz_to().GetLim();
2135  has_fuzz_to = true;
2136  }
2137  if (! has_fuzz_from && ! has_fuzz_to) {
2138  return;
2139  }
2140
2141  // check for invalid fuzz on both ends of Interval
2142  if (has_fuzz_from && has_fuzz_to && fuzz_from == fuzz_to) {
2143  if (fuzz_from == CInt_fuzz::eLim_tl) {
2146  "Should not specify 'space to left' for both ends of interval", obj);
2147  }
2148  else if (fuzz_from == CInt_fuzz::eLim_tr) {
2151  "Should not specify 'space to right' for both ends of interval", obj);
2152  }
2153  else if (fuzz_from == CInt_fuzz::eLim_circle) {
2156  "Should not specify 'origin of circle' for both ends of interval", obj);
2157  }
2158  }
2159
// The position-dependent checks need the sequence; they are skipped when it cannot be resolved.
2160  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(interval.GetId());
2161  if (! bsh) {
2162  return;
2163  }
2164
2166  if (bsh.IsSetInst_Topology()) {
2167  top = bsh.GetInst_Topology();
2168  }
2169
2170  if (top != CSeq_inst::eTopology_circular) {
2171
2172  // VR-15
2173  // look for space to left at beginning of sequence or space to right at end
// NOTE(review): fuzz_from / fuzz_to are compared here without consulting has_fuzz_from /
// has_fuzz_to; whether that is safe depends on their initial values on the missing
// declaration lines - confirm.
2174  if (fuzz_from == CInt_fuzz::eLim_tl && interval.IsSetFrom() && interval.GetFrom() == 0) {
2176  }
2177  if (fuzz_to == CInt_fuzz::eLim_tr && interval.IsSetTo() && interval.GetTo() == bsh.GetBioseqLength() - 1) {
2179  }
2180
2181  } else if (fuzz_from == CInt_fuzz::eLim_circle || fuzz_to == CInt_fuzz::eLim_circle) {
2182
// A feature carrying a "ribosomal slippage" exception text is exempt from the circle checks.
2183  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2184  const CSeq_feat* sfp = dynamic_cast<const CSeq_feat*>(&obj);
2185  if (sfp && sfp->IsSetExcept() && sfp->CanGetExcept_text() && NStr::FindNoCase(sfp->GetExcept_text(), "ribosomal slippage") != NPOS) {
2186  return;
2187  }
2188  }
2189
2190  // VR-832
2191  if (fuzz_from == CInt_fuzz::eLim_circle && interval.IsSetFrom() && interval.GetFrom() != 0) {
2193  }
2194  if (fuzz_to == CInt_fuzz::eLim_circle && interval.IsSetTo() && interval.GetTo() != bsh.GetBioseqLength() - 1) {
2196  }
2197  }
2198 }
2199 
2200 
// Checks 'space to left' / 'space to right' fuzz on a Seq-point (`point`), reporting on `obj`.
// NOTE(review): the signature line (listing line 2201), the condition on line 2213 and both
// error calls (lines 2217, 2220) are missing from this extract.
2202 {
2203  // VR-15
// Nothing to check unless the point has tl/tr limit fuzz together with an id and a position.
2204  if (!point.IsSetFuzz() || !point.GetFuzz().IsLim() ||
2205  (point.GetFuzz().GetLim() != CInt_fuzz::eLim_tl && point.GetFuzz().GetLim() != CInt_fuzz::eLim_tr) ||
2206  !point.IsSetId() || !point.IsSetPoint()) {
2207  return;
2208  }
2209  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(point.GetId());
2210  if (!bsh) {
2211  return;
2212  }
2214  return;
2215  }
2216  if (point.GetPoint() == 0 && point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl) {
2218  }
// NOTE(review): unlike the first-position test above, this test does not check the fuzz
// direction, so a 'space to left' point at the last position also enters this branch -
// confirm whether `GetLim() == CInt_fuzz::eLim_tr` was intended here.
2219  if (point.GetPoint() == bsh.GetBioseqLength() - 1) {
2221  }
2222 }
2223 
2224 
// Walks the parts of `loc` and dispatches each interval, packed-interval or point part to the
// matching x_ReportInvalidFuzz overload; other part types are ignored.
// NOTE(review): the declaration of the iterator `lit` (listing line 2227) and one `case` label
// (listing line 2234, evidently the packed-int case) are missing from this extract.
2225 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj)
2226 {
2228  for (; lit; ++lit) {
2229  CSeq_loc::E_Choice loc_choice = lit->Which();
2230  switch (loc_choice) {
2231  case CSeq_loc::e_Int:
2232  x_ReportInvalidFuzz(lit->GetInt(), obj);
2233  break;
2235  x_ReportInvalidFuzz(lit->GetPacked_int(), obj);
2236  break;
2237  case CSeq_loc::e_Pnt:
2238  x_ReportInvalidFuzz(lit->GetPnt(), obj);
2239  break;
2240  default:
2241  break;
2242  }
2243  }
2244 }
2245 
2246 
// Returns how many parts visited by the iterator `lit` over `loc` are Seq-loc-mix elements.
// NOTE(review): the declaration of `lit` (listing line 2250) is missing from this extract, so
// the iteration mode (which nesting levels are visited) cannot be seen here.
2247 unsigned int s_CountMix(const CSeq_loc& loc)
2248 {
2249  unsigned int num_mix = 0;
2251  for (; lit; ++lit) {
2252  if (lit->IsMix()) {
2253  num_mix++;
2254  }
2255  }
2256  return num_mix;
2257 }
2258 
2259 
// Resets a location-check state object `lc` to its starting values and stores `prefix` in it.
// NOTE(review): the signature line (listing line 2260) is missing from this extract; the name
// is presumably x_InitLocCheck - confirm.
2261 {
// Start optimistic: the location counts as valid until a part fails.
2262  lc.chk = true;
2263  lc.unmarked_strand = false;
2264  lc.mixed_strand = false;
2265  lc.has_other = false;
2266  lc.has_not_other = false;
// No current/previous id or interval yet.
2267  lc.id_cur = nullptr;
2268  lc.id_prv = nullptr;
2269  lc.int_cur = nullptr;
2270  lc.int_prv = nullptr;
2271  lc.strand_cur = eNa_strand_unknown;
2272  lc.strand_prv = eNa_strand_unknown;
2273  lc.prefix = prefix;
2274 }
2275 
// Compares the current and previous strand stored in `lc` and updates its strand flags.
// NOTE(review): the signature line (listing line 2276) is missing from this extract; the name
// is presumably x_CheckForStrandChange - confirm.
2277 {
// Strand comparison applies only when neither strand is "other" and both parts lie on the
// same bioseq.
2278  if (lc.strand_prv != eNa_strand_other &&
2279  lc.strand_cur != eNa_strand_other) {
2280  if (lc.id_cur && lc.id_prv &&
2281  IsSameBioseq(*lc.id_cur, *lc.id_prv, m_Scope)) {
2282  if (lc.strand_prv != lc.strand_cur) {
// plus next to unknown is the milder "unmarked" case; any other difference is "mixed".
2283  if ((lc.strand_prv == eNa_strand_plus &&
2284  lc.strand_cur == eNa_strand_unknown) ||
2285  (lc.strand_prv == eNa_strand_unknown &&
2286  lc.strand_cur == eNa_strand_plus)) {
2287  lc.unmarked_strand = true;
2288  } else {
2289  lc.mixed_strand = true;
2290  }
2291  }
2292  }
2293  }
// Record whether "other" and explicit plus/minus strands have been seen.
2294  if (lc.strand_cur == eNa_strand_other) {
2295  lc.has_other = true;
2296  } else if (lc.strand_cur == eNa_strand_minus || lc.strand_cur == eNa_strand_plus) {
2297  lc.has_not_other = true;
2298  }
2299
2300 }
2301 
// Checks one Seq-loc (recursing into mixes), updating the running state in `lc` and reporting
// an out-of-range location on `obj`. `lowerSev` requests Error instead of Critical severity for
// the out-of-range report.
// NOTE(review): two `case` labels (listing lines 2334, 2344), the statements on lines 2352 and
// 2372, and the error calls on lines 2367 and 2379 are missing from this extract.
2302 void CValidError_imp::x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev)
2303 {
2304  try {
2305  switch (loc.Which()) {
2306  case CSeq_loc::e_Int:
2307  lc.int_cur = &loc.GetInt();
2308  lc.chk = x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur);
2309  if (lc.strand_cur == eNa_strand_other) {
2310  lc.has_other = true;
2311  }
// The lowered severity is kept only when the interval starts inside the sequence and ends
// at or beyond its length; any other failure goes back to the default severity.
2312  if ((!lc.chk) && lowerSev) {
2313  TSeqPos length = GetLength(loc.GetInt().GetId(), m_Scope);
2314  TSeqPos fr = loc.GetInt().GetFrom();
2315  TSeqPos to = loc.GetInt().GetTo();
2316  if (fr < length && to >= length) {
2317  // RefSeq variation feature with dbSNP xref and interval flanking the length is ERROR
2318  } else {
2319  // otherwise keep severity at REJECT
2320  lowerSev = false;
2321  }
2322  }
2323  break;
2324  case CSeq_loc::e_Pnt:
2325  lc.strand_cur = loc.GetPnt().IsSetStrand() ?
2326  loc.GetPnt().GetStrand() : eNa_strand_unknown;
2327  if (lc.strand_cur == eNa_strand_other) {
2328  lc.has_other = true;
2329  }
2330  lc.id_cur = &loc.GetPnt().GetId();
2331  lc.chk = IsValid(loc.GetPnt(), m_Scope);
2332  lc.int_prv = nullptr;
2333  break;
// (case label missing in this extract; the body handles a packed-point location)
2335  lc.strand_cur = loc.GetPacked_pnt().IsSetStrand() ?
2336  loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
2337  if (lc.strand_cur == eNa_strand_other) {
2338  lc.has_other = true;
2339  }
2340  lc.id_cur = &loc.GetPacked_pnt().GetId();
2341  lc.chk = IsValid(loc.GetPacked_pnt(), m_Scope);
2342  lc.int_prv = nullptr;
2343  break;
// (case label missing in this extract; the body handles a packed-interval location)
2345  x_CheckPackedInt(loc.GetPacked_int(), lc, obj);
2346  break;
2347  case CSeq_loc::e_Null:
2348  break;
2349  case CSeq_loc::e_Mix:
// Recurse into every part of the mix with the same state object.
2350  for (auto l : loc.GetMix().Get()) {
2351  x_CheckLoc(*l, obj, lc, lowerSev);
2353  }
2354  break;
2355  default:
// Unhandled location types are treated as strand "other" with no id.
2356  lc.strand_cur = eNa_strand_other;
2357  lc.id_cur = nullptr;
2358  lc.int_prv = nullptr;
2359  break;
2360  }
2361  if (!lc.chk) {
2362  string lbl = GetValidatorLocationLabel (loc, *m_Scope);
2363  EDiagSev sev = eDiag_Critical;
2364  if (lowerSev) {
2365  sev = eDiag_Error;
2366  }
2368  lc.prefix + ": SeqLoc [" + lbl + "] out of range", obj);
2369  }
2370
// Null locations do not become the "previous" part.
2371  if (loc.Which() != CSeq_loc::e_Null) {
2373
2374  lc.strand_prv = lc.strand_cur;
2375  lc.id_prv = lc.id_cur;
2376  }
2377  } catch( const exception& e ) {
2378  string label = GetValidatorLocationLabel(loc, *m_Scope);
2380  "Exception caught while validating location " +
2381  label + ". Exception: " + e.what(), obj);
2382
// After an exception the state is set the same way as for an unhandled location type.
2383  lc.strand_cur = eNa_strand_other;
2384  lc.id_cur = nullptr;
2385  lc.int_prv = nullptr;
2386  }
2387 }
2388 
// Validates a Seq-loc `loc` that belongs to `obj`: range, strand consistency, fuzz, nested
// mixes, id consistency, adjacent intervals and interval order. `prefix` starts most messages
// (the code compares it with "Location" and "Product"), `seq` is the bioseq the location is
// checked against, `report_abutting` enables the adjacent-interval report, and `lowerSev` is
// forwarded to x_CheckLoc.
// NOTE(review): the return type and name (listing line 2389) and many statements (most error
// calls, and lines 2399, 2415-2416, 2442, 2462, 2484, 2489) are missing from this extract; the
// name is presumably ValidateSeqLoc - confirm. Comments describe only the visible lines.
2390 (const CSeq_loc& loc,
2391  const CBioseq_Handle& seq,
2392  bool report_abutting,
2393  const string& prefix,
2394  const CSerialObject& obj,
2395  bool lowerSev)
2396 {
2397  SLocCheck lc;
2398
2400
2401  x_CheckLoc(loc, obj, lc, lowerSev);
2402
2403  if (lc.has_other && lc.has_not_other) {
2404  string label = GetValidatorLocationLabel(loc, *m_Scope);
2406  prefix + ": Inconsistent use of other strand SeqLoc [" + label + "]", obj);
2407  } else if (lc.has_other && NStr::Equal(prefix, "Location")) {
2410  "Strand 'other' in location", obj);
2411  }
2412
2413  x_ReportInvalidFuzz(loc, obj);
2414
2418  "Duplicate exons in location", obj);
2419  }
2420
2421  if (s_CountMix(loc) > 1) {
2422  string label;
2423  loc.GetLabel(&label);
2425  prefix + ": SeqLoc [" + label + "] has nested SEQLOC_MIX elements",
2426  obj);
2427  }
2428
2429  // Warn if different parts of a seq-loc refer to the same bioseq using
2430  // differnt id types (i.e. gi and accession)
2431  ValidateSeqLocIds(loc, obj);
2432
2433  bool trans_splice = false;
2434  bool circular_rna = false;
2435  bool exception = false;
// `sfp` is non-null only when `obj` is a Seq-feat; feature-specific exemptions depend on it.
2436  const CSeq_feat* sfp = nullptr;
2437  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2438  sfp = dynamic_cast<const CSeq_feat*>(&obj);
2439  }
2440  if (sfp) {
2441  // primer_bind intervals MAY be in on opposite strands
2443  lc.mixed_strand = false;
2444  lc.unmarked_strand = false;
2445  }
2446
2447  exception = sfp->IsSetExcept() ? sfp->GetExcept() : false;
2448  if (exception && sfp->CanGetExcept_text()) {
2449  if (NStr::FindNoCase(sfp->GetExcept_text(), "trans-splicing") != NPOS) {
2450  // trans splicing exception turns off both mixed_strand and
2451  // out_of_order messages
2452  trans_splice = true;
2453  } else if (NStr::FindNoCase(sfp->GetExcept_text(), "circular RNA") != NPOS) {
2454  // circular RNA exception turns off out_of_order message
2455  circular_rna = true;
2456  }
2457  }
2458  }
2459
2460  string loc_lbl;
2461  if (report_abutting && (!sfp || !CSeqFeatData::AllowAdjacentIntervals(sfp->GetData().GetSubtype())) &&
2463  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2464
// Adjacent intervals are a warning when the feature has an exception, an error otherwise.
2465  EDiagSev sev = exception ? eDiag_Warning : eDiag_Error;
2467  prefix + ": Adjacent intervals in SeqLoc [" +
2468  loc_lbl + "]", obj);
2469  }
2470
// A trans-spliced non-product location skips all remaining strand/order checks.
2471  if (trans_splice && !NStr::Equal(prefix, "Product")) {
2472  CSeq_loc_CI li(loc);
2473  ++li;
2474  if (!li) {
2475  PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadTranssplicedInterval, "Trans-spliced feature should have multiple intervals", obj);
2476  }
2477  return;
2478  }
2479
2480  bool ordered = true;
2481  bool circular = false;
2482  if ( seq &&
2483  seq.IsSetInst() && seq.GetInst().IsSetTopology() &&
2485  circular = true;
2486  }
2487  try {
2488  if (m_Scope && (!sfp || CSeqFeatData::RequireLocationIntervalsInBiologicalOrder(sfp->GetData().GetSubtype())) && !circular) {
2490  }
2491  } catch ( const CException& ex) {
2492  string label;
2493  loc.GetLabel(&label);
2495  "Exception caught while validating location " +
2496  label + ". Exception: " + ex.what(), obj);
2497  }
2498
2499  if (lc.mixed_strand || lc.unmarked_strand || !ordered) {
2500  if (loc_lbl.empty()) {
2501  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2502  }
2503  if (lc.mixed_strand) {
2504  if (IsSmallGenomeSet()) {
2506  prefix + ": Mixed strands in SeqLoc ["
2507  + loc_lbl + "] in small genome set - set trans-splicing exception if appropriate", obj);
2508  } else {
// Mixed strands are downgraded to a warning for Geneious submissions and pseudo features.
2509  EDiagSev sev = eDiag_Error;
2510  if (IsGeneious() || (sfp && sequence::IsPseudo(*sfp, *m_Scope))) {
2511  sev = eDiag_Warning;
2512  }
2514  prefix + ": Mixed strands in SeqLoc ["
2515  + loc_lbl + "]", obj);
2516  }
2517  } else if (lc.unmarked_strand) {
2519  prefix + ": Mixed plus and unknown strands in SeqLoc ["
2520  + loc_lbl + "]", obj);
2521  }
2522  if (!ordered && !circular_rna) {
2523  if (IsSmallGenomeSet()) {
2525  prefix + ": Intervals out of order in SeqLoc [" +
2526  loc_lbl + "]", obj);
2527  } else {
2529  prefix + ": Intervals out of order in SeqLoc [" +
2530  loc_lbl + "]", obj);
2531  }
2532  }
2533  return;
2534  }
2535
// The remaining checks run only for segmented bioseqs (or when `seq` is not set).
2536  if ( seq &&
2537  seq.IsSetInst_Repr() &&
2538  seq.GetInst_Repr() != CSeq_inst::eRepr_seg ) {
2539  return;
2540  }
2541
2542  // Check for intervals out of order on segmented Bioseq
2543  if ( seq && BadSeqLocSortOrder(seq, loc) && !circular_rna ) {
2544  if (loc_lbl.empty()) {
2545  loc.GetLabel(&loc_lbl);
2546  }
// NOTE(review): this message concatenates prefix + "Intervals ..." with no ": " separator,
// unlike every other message in this function - confirm whether that is intended.
2548  prefix + "Intervals out of order in SeqLoc [" +
2549  loc_lbl + "]", obj);
2550  }
2551
2552  // Check for mixed strand on segmented Bioseq
2553  if ( IsMixedStrands(loc) ) {
2554  if (loc_lbl.empty()) {
2555  loc.GetLabel(&loc_lbl);
2556  }
2558  prefix + ": Mixed strands in SeqLoc [" +
2559  loc_lbl + "]", obj);
2560  }
2561 }
2562 
2563 
// Remembers `seq` in m_BioseqWithNoSource unless SeqIsPatent(seq) is true.
// NOTE(review): the signature line (listing line 2564) is missing from this extract; the name
// is presumably AddBioseqWithNoBiosource - confirm.
2565 {
2566  if (!SeqIsPatent(seq)) {
2567  m_BioseqWithNoSource.push_back(CConstRef<CBioseq>(&seq));
2568  }
2569 }
2570 
2571 
// Reports a missing product name on the protein `seq` unless SeqIsPatent(seq) is true.
// NOTE(review): the signature line (listing line 2572) and the error call (listing line 2575)
// are missing from this extract; the name is presumably AddProtWithoutFullRef - confirm.
2573 {
2574  if (!SeqIsPatent (seq)) {
2576  "The product name is missing from this protein.", *(seq.GetCompleteBioseq()));
2577  }
2578 }
2579 
2580 
// Returns true when `seq` has a MolInfo descriptor with tech == wgs and the first "other" or
// gi Seq-id found on it is an "other" id.
// NOTE(review): the signature line (listing line 2581) is missing from this extract; the name
// is presumably IsWGSIntermediate - confirm.
2582 {
2583  bool wgs = false;
2584
// Step 1: look for a MolInfo descriptor whose tech is wgs.
2585  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2586  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2587  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2588  wgs = true;
2589  break;
2590  }
2591  }
2592  if (!wgs) {
2593  return false;
2594  }
2595
2596  bool is_other = false;
2597  bool has_gi = false;
2598
// Step 2: inspect the ids.
// NOTE(review): the loop stops at the first "other" or gi id, so `has_gi` can only be true when
// `is_other` is false; a gi listed after an "other" id is never seen and the `|| has_gi` test
// below has no effect - confirm whether the `break` statements are intended.
2599  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2600  if ((*it)->IsOther()) {
2601  is_other = true;
2602  break;
2603  } else if ((*it)->IsGi()) {
2604  has_gi = true;
2605  break;
2606  }
2607  }
2608  if (!is_other || has_gi) {
2609  return false;
2610  }
2611
2612  return true;
2613 }
2614 
2615 
// Returns true when `seq` has a MolInfo descriptor with tech == tsa and the first "other" or
// gi Seq-id found on it is an "other" id.
// NOTE(review): the signature line (listing line 2616) is missing from this extract; the name
// is presumably IsTSAIntermediate - confirm.
2617 {
2618  bool tsa = false;
2619
// Step 1: look for a MolInfo descriptor whose tech is tsa.
2620  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2621  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2622  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
2623  tsa = true;
2624  break;
2625  }
2626  }
2627  if (!tsa) {
2628  return false;
2629  }
2630
2631  bool is_other = false;
2632  bool has_gi = false;
2633
// Step 2: inspect the ids.
// NOTE(review): the loop stops at the first "other" or gi id, so `has_gi` can only be true when
// `is_other` is false and the `|| has_gi` test below has no effect - confirm whether the
// `break` statements are intended.
2634  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2635  if ((*it)->IsOther()) {
2636  is_other = true;
2637  break;
2638  } else if ((*it)->IsGi()) {
2639  has_gi = true;
2640  break;
2641  }
2642  }
2643  if (!is_other || has_gi) {
2644  return false;
2645  }
2646
2647  return true;
2648 }
2649 
2650 
// Reports missing source information for the entry `se` and a missing organism name for every
// bioseq collected in m_BioseqWithNoSource.
// NOTE(review): the signature line (listing line 2651) and both error calls (listing lines
// 2659, 2670) are missing from this extract; the name is presumably ReportMissingBiosource -
// confirm.
2652 {
// While preprocessing a huge file, patent/PDB status comes from the shared context and nothing
// is reported from the inner branch; otherwise it comes from the per-entry info object.
2653  if (GetContext().PreprocessHugeFile) {
2654  if (m_pEntryInfo->IsNoBioSource() && !GetContext().IsPatent && !GetContext().IsPDB) {
2655  return;
2656  }
2657  }
2658  else if (m_pEntryInfo->IsNoBioSource() && !m_pEntryInfo->IsPatent() && !m_pEntryInfo->IsPDB()) {
2660  "No source information included on this record.", se);
2661
// Outside huge-file postprocessing the per-bioseq reports below are skipped in this case.
2662  if (!GetContext().PostprocessHugeFile) {
2663  return;
2664  }
2665  }
2666
2667  size_t num_no_source = m_BioseqWithNoSource.size();
2668
2669  for ( size_t i = 0; i < num_no_source; ++i ) {
2671  "No organism name included in the source. Other qualifiers may exist.",
2672  *(m_BioseqWithNoSource[i]));
2673  }
2674 }
2675 
2676 
// Returns the first feature found by product for the bioseq handle `bsh`, or an empty
// reference when `bsh` is not valid or nothing is found.
// NOTE(review): the signature line (listing line 2677), the statement on line 2681 and the
// selector declarations (listing lines 2687, 2689, 2697) are missing from this extract, so the
// feature type being selected is not visible; the name GetmRNAGivenProduct is taken from the
// call GetmRNAGivenProduct(bsh) elsewhere in this file.
2678 {
2679  CConstRef<CSeq_feat> feat;
2680
2682
2683  if ( bsh ) {
2684  if ( IsNT() && m_TSE ) {
2685  // In case of a NT bioseq limit the search to features packaged on the
2686  // NT (we assume features have been pulled from the segments to the NT).
2688  sel.SetByProduct()
2690  CFeat_CI fi(bsh, sel);
2691  if ( fi ) {
2692  // return the first one (should be the one packaged on the
2693  // nuc-prot set).
2694  feat.Reset(&(fi->GetOriginalFeature()));
2695  }
2696  } else {
2698  sel.SetByProduct();
2699  CFeat_CI fi(bsh, sel);
2700  if ( fi ) {
2701  // return the first one (should be the one packaged on the
2702  // nuc-prot set).
2703  feat.Reset(&(fi->GetOriginalFeature()));
2704  }
2705  }
2706  }
2707
2708  return feat;
2709 }
2710 
2711 
// Thin wrapper that forwards to GetmRNAGivenProduct(bsh).
// NOTE(review): the signature line (listing line 2712) and the statement that produces `bsh`
// (listing line 2714) are missing from this extract.
2713 {
2715  return GetmRNAGivenProduct(bsh);
2716 }
2717 
2718 
// Returns the first feature found by product for the bioseq handle `bsh`, or an empty
// reference when `bsh` is not valid or nothing is found.
// NOTE(review): the signature line (listing line 2719) and the selector declarations (listing
// lines 2731, 2740) are missing from this extract, so the feature type being selected is not
// visible; the name is presumably GetCDSGivenProduct - confirm.
2720 {
2721  CConstRef<CSeq_feat> feat;
2722  if ( bsh ) {
2723  // In case of a NT bioseq limit the search to features packaged on the
2724  // NT (we assume features have been pulled from the segments to the NT).
2725  CSeq_entry_Handle limit;
2726  if ( IsNT() && m_TSE ) {
2727  limit = m_Scope->GetSeq_entryHandle(*m_TSE);
2728  }
2729
// With a valid limit handle the search is restricted to that TSE.
2730  if (limit) {
2732  sel.SetByProduct() .SetLimitTSE(limit);
2733  CFeat_CI fi(bsh, sel);
2734  if ( fi ) {
2735  // return the first one (should be the one packaged on the
2736  // nuc-prot set).
2737  feat.Reset(&(fi->GetOriginalFeature()));
2738  }
2739  } else {
2741  sel.SetByProduct();
2742  CFeat_CI fi(bsh, sel);
2743  if ( fi ) {
2744  // return the first one (should be the one packaged on the
2745  // nuc-prot set).
2746  feat.Reset(&(fi->GetOriginalFeature()));
2747  }
2748  }
2749  }
2750
2751  return feat;
2752 }
2753 
2754 
// Walks up the parent entries of `seq` and returns the nearest enclosing Seq-entry that is a
// set of class `clss`, or nullptr when no such ancestor exists.
// NOTE(review): the return type and name (listing line 2755) are missing from this extract;
// the name is presumably GetAncestor - confirm.
2756 (const CBioseq& seq,
2757  CBioseq_set::EClass clss)
2758 {
2759  const CSeq_entry* parent = nullptr;
// The loop ends either by `break` on a match or with parent == nullptr at the top.
2760  for ( parent = seq.GetParentEntry();
2761  parent;
2762  parent = parent->GetParentEntry() ) {
2763  if ( parent->IsSet() ) {
2764  const CBioseq_set& set = parent->GetSet();
2765  if ( set.IsSetClass() && set.GetClass() == clss ) {
2766  break;
2767  }
2768  }
2769  }
2770  return parent;
2771 }
2772 
2773 
2774 bool CValidError_imp::IsSerialNumberInComment(const string& comment)
2775 {
2776  size_t pos = comment.find('[', 0);
2777  while ( pos != string::npos ) {
2778  ++pos;
2779  bool okay = true;
2780  if ( isdigit((unsigned char) comment[pos]) ) {
2781  // skip if first character after bracket is 0
2782  if (comment[pos] == '0') {
2783  okay = false;
2784  }
2785  while ( isdigit((unsigned char) comment[pos]) ) {
2786  ++pos;
2787  }
2788  if ( comment[pos] == ']' && okay ) {
2789  return true;
2790  }
2791  }
2792 
2793  pos = comment.find('[', pos);
2794  }
2795  return false;
2796 }
2797 
2798 
// Returns false in three situations handled below (a RefSeq product id inside a genomic
// product set, a record that is a single bioseq, a standalone Seq-annot) and true otherwise.
// NOTE(review): the signature line (listing line 2799) is missing from this extract; the name
// is presumably RequireLocalProduct - confirm.
2800 {
2801  // okay to have far RefSeq product, but only if genomic product set
2802  if ( sid && sid->IsOther() ) {
2803  if ( IsGPS() ) {
2804  return false;
2805  }
2806  }
2807  // or just a bioseq
2808  if ( GetTSE().IsSeq() ) {
2809  return false;
2810  }
2811
2812  // or in a standalone Seq-annot
2813  if (IsStandaloneAnnot() ) {
2814  return false;
2815  }
2816  return true;
2817 }
2818 
2819 
2821  vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
2822  vector<string>& published_labels, vector<string>& unpublished_labels)
2823 {
2824  FOR_EACH_SEQDESC_ON_SEQENTRY (it, se) {
2825  if ((*it)->IsPub()) {
2826  CCleanup::GetPubdescLabels ((*it)->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2827  }
2828  }
2829 
2830  if (se.IsSet()) {
2831  FOR_EACH_SEQENTRY_ON_SEQSET (it, se.GetSet()) {
2832  s_CollectPubDescriptorLabels (**it, pmids, muids, serials, published_labels, unpublished_labels);
2833  }
2834  }
2835 }
2836 
2837 
2839 {
2840  vector<TEntrezId> pmids;
2841  vector<TEntrezId> muids;
2842  vector<int> serials;
2843  vector<string> published_labels;
2844  vector<string> unpublished_labels;
2845 
2846  // collect labels for pubs on record
2847  s_CollectPubDescriptorLabels (*(seh.GetCompleteSeq_entry()), pmids, muids, serials, published_labels, unpublished_labels);
2848 
2850  while (feat) {
2851  CCleanup::GetPubdescLabels (feat->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2852  ++feat;
2853  }
2854 
2855  // now examine citations to determine whether they match a pub on the record
2856  CFeat_CI f (seh);
2857  while (f) {
2858  if (f->IsSetCit() && f->GetCit().IsPub()) {
2859  ITERATE (CPub_set::TPub, cit_it, f->GetCit().GetPub()) {
2860  bool found = false;
2861 
2862  if ((*cit_it)->IsPmid()) {
2863  vector<TEntrezId>::iterator it = pmids.begin();
2864  while (it != pmids.end() && !found) {
2865  if (*it == (*cit_it)->GetPmid()) {
2866  found = true;
2867  }
2868  ++it;
2869  }
2870  if (!found) {
2872  "Citation on feature refers to uid ["
2873  + NStr::NumericToString((*cit_it)->GetPmid().Get())
2874  + "] not on a publication in the record",
2875  f->GetOriginalFeature());
2876  }
2877  } else if ((*cit_it)->IsMuid()) {
2878  vector<TEntrezId>::iterator it = muids.begin();
2879  while (it != muids.end() && !found) {
2880  if (*it == (*cit_it)->GetMuid()) {
2881  found = true;
2882  }
2883  ++it;
2884  }
2885  if (!found) {
2887  "Citation on feature refers to uid ["
2888  + NStr::NumericToString((*cit_it)->GetMuid())
2889  + "] not on a publication in the record",
2890  f->GetOriginalFeature());
2891  }
2892  } else if ((*cit_it)->IsEquiv()) {
2893  continue;
2894  } else {
2895  string label;
2896  (*cit_it)->GetLabel(&label, CPub::eContent, CPub::fLabel_Unique);
2897 
2898  if (NStr::EndsWith (label, ">")) {
2899  label = label.substr(0, label.length() - 2);
2900  }
2901  if(NStr::EndsWith (label, "|")) {
2902  label = label.substr(0, label.length() - 1);
2903  }
2904  if (NStr::EndsWith (label, " ")) {
2905  label = label.substr(0, label.length() - 1);
2906  }
2907  size_t len = label.length();
2908  vector<string>::iterator unpub_it = unpublished_labels.begin();
2909  while (unpub_it != unpublished_labels.end() && !found) {
2910  size_t it_len =(*unpub_it).length();
2911  if (NStr::EqualNocase (*unpub_it, 0, it_len > len ? len : it_len, label)) {
2912  found = true;
2913  }
2914  ++unpub_it;
2915  }
2916  vector<string>::iterator pub_it = published_labels.begin();
2917 
2918  while (pub_it != published_labels.end() && !found) {
2919  size_t it_len =(*pub_it).length();
2920  if (NStr::EqualNocase (*pub_it, 0, it_len > len ? len : it_len, label)) {
2922  "Citation on feature needs to be updated to published uid",
2923  f->GetOriginalFeature());
2924  found = true;
2925  }
2926  ++pub_it;
2927  }
2928  if (!found) {
2930  "Citation on feature refers to a publication not in the record",
2931  f->GetOriginalFeature());
2932  }
2933  }
2934  }
2935  }
2936  ++f;
2937  }
2938 }
2939 
2940 
2941 // =============================================================================
2942 // Private
2943 // =============================================================================
2944 
2945 
2946 
2948 {
2950  for( ; it; ++it) {
2951  const string& str = *it;
2952  FOR_EACH_CHAR_IN_STRING(c_it, str) {
2953  const char& ch = *c_it;
2954  unsigned char chu = ch;
2955  if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2957  "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in item", obj);
2958  break;
2959  }
2960  }
2961  }
2962 }
2963 
2964 
2966 {
2967  class CScriptTagTextFsm : public CTextFsm<int>
2968  {
2969  public:
2970  CScriptTagTextFsm() {
2971  const char * script_tags[] = {
2972  "<script", "<object", "<applet", "<embed", "<form",
2973  "javascript:", "vbscript:"};
2974  ITERATE_0_IDX(idx, ArraySize(script_tags)) {
2975  AddWord(script_tags[idx], true);
2976  }
2977  Prime();
2978  }
2979 
2980  // Returns true if the given string matches any of the strings
2981  // in the fsm anywhere.
2982  bool DoesStrHaveFsmHits(const string &str) {
2983  int state = GetInitialState();
2984  ITERATE(string, str_it, str) {
2985  state = GetNextState(state, *str_it);
2986  if( IsMatchFound(state) ) {
2987  return true;
2988  }
2989  }
2990 
2991  return false;
2992  }
2993  };
2994  static CScriptTagTextFsm s_ScriptTagFsm;
2995 
2996 
2998  for( ; it; ++it) {
2999  if (s_ScriptTagFsm.DoesStrHaveFsmHits(*it)) {
3001  "Script tag found in item", obj);
3002  return;
3003  }
3004 }
3005 }
3006 
3007 
3008 bool CValidError_imp::IsMixedStrands(const CSeq_loc& loc)
3009 {
3010  if ( SeqLocCheck(loc, m_Scope) == eSeqLocCheck_warning ) {
3011  return false;
3012  }
3013 
3014  CSeq_loc_CI curr(loc);
3015  if ( !curr ) {
3016  return false;
3017  }
3018  CSeq_loc_CI prev = curr;
3019  ++curr;
3020 
3021  while ( curr ) {
3022  ENa_strand curr_strand = curr.GetStrand();
3023  ENa_strand prev_strand = prev.GetStrand();
3024 
3025  if ( (prev_strand == eNa_strand_minus &&
3026  curr_strand != eNa_strand_minus) ||
3027  (prev_strand != eNa_strand_minus &&
3028  curr_strand == eNa_strand_minus) ) {
3029  return true;
3030  }
3031 
3032  prev = curr;
3033  ++curr;
3034  }
3035 
3036  return false;
3037 }
3038 
3039 
3040 static bool s_SeqLocHasGI (const CSeq_loc& loc)
3041 {
3042  bool rval = false;
3043 
3044  for ( CSeq_loc_CI it(loc); it && !rval; ++it ) {
3045  if (it.GetSeq_id().IsGi()) {
3046  rval = true;
3047  }
3048  }
3049  return rval;
3050 }
3051 
3052 
3054 {
3055  m_TSEH = seh;
3057  m_GeneCache.Clear();
3058 }
3059 
3060 
3062 {
3064  return true;
3065  } else {
3066  return false;
3067  }
3068 }
3069 
3070 
3072 {
3073  if (se.IsSeq()) {
3074  return 1;
3075  } else if (!se.IsSet()) {
3076  return 0;
3077  }
3078  if (se.GetSet().IsSetClass()) {
3081  return 1;
3082  }
3083  }
3084  size_t count = 0;
3085  if (se.GetSet().IsSetSeq_set()) {
3086  for (auto it = se.GetSet().GetSeq_set().begin(); it != se.GetSet().GetSeq_set().end(); it++) {
3087  count += s_CountTopSetSiblings(**it);
3088  }
3089  }
3090  return count;
3091 }
3092 
3093 
3095 {
3096  // "Save" the Seq-entry
3097  SetTSE(seh);
3098 
3101 
3102  // If no Pubs/BioSource in CSeq_entry, post only one error
3103  if (GetContext().PreprocessHugeFile) {
3104  x_SetEntryInfo().SetNoPubs(GetContext().NoPubsFound);
3105  x_SetEntryInfo().SetNoCitSubPubs(GetContext().NoCitSubsFound);
3106  x_SetEntryInfo().SetNoBioSource(GetContext().NoBioSource);
3107  } else {
3109  x_SetEntryInfo().SetNoPubs(!pub);
3110  while (pub && !pub->IsSub()) {
3111  ++pub;
3112  }
3116  }
3117 
3118 
3119  // Look for genomic product set
3121  if (si->IsSetClass ()) {
3122  if (si->GetClass () == CBioseq_set::eClass_gen_prod_set) {
3123  x_SetEntryInfo().SetGPS();
3124  }
3125  if (si->GetClass () == CBioseq_set::eClass_small_genome_set) {
3127  }
3128  }
3129  }
3130 
3131  // Examine all Seq-ids on Bioseqs
3132  for (CTypeConstIterator <CBioseq> bi (*m_TSE); bi; ++bi) {
3133  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, *bi) {
3134  const CSeq_id& sid = **sid_itr;
3135  const CTextseq_id* tsid = sid.GetTextseq_Id();
3136  CSeq_id::E_Choice typ = sid.Which();
3137  switch (typ) {
3138  case CSeq_id::e_not_set:
3139  break;
3140  case CSeq_id::e_Local:
3141  break;
3142  case CSeq_id::e_Gibbsq:
3143  break;
3144  case CSeq_id::e_Gibbmt:
3145  break;
3146  case CSeq_id::e_Giim:
3147  break;
3148  case CSeq_id::e_Genbank:
3151  x_SetEntryInfo().SetGED();
3152  break;
3153  case CSeq_id::e_Embl:
3155  x_SetEntryInfo().SetGED();
3156  x_SetEntryInfo().SetEmbl();
3157  break;
3158  case CSeq_id::e_Pir:
3159  break;
3160  case CSeq_id::e_Swissprot:
3161  break;
3162  case CSeq_id::e_Patent:
3164  break;
3165  case CSeq_id::e_Other:
3167  // and do RefSeq subclasses up front as well
3168  if (sid.GetOther().IsSetAccession()) {
3169  string acc = sid.GetOther().GetAccession().substr(0, 3);
3170  if (acc == "NC_") {
3171  m_IsNC = true;
3172  } else if (acc == "NG_") {
3173  m_IsNG = true;
3174  } else if (acc == "NM_") {
3175  m_IsNM = true;
3176  } else if (acc == "NP_") {
3177  m_IsNP = true;
3178  } else if (acc == "NR_") {
3179  m_IsNR = true;
3180  } else if (acc == "NZ_") {
3181  m_IsNZ = true;
3182  } else if (acc == "NS_") {
3183  m_IsNS = true;
3184  } else if (acc == "NT_") {
3185  m_IsNT = true;
3186  } else if (acc == "NW_") {
3187  m_IsNW = true;
3188  } else if (acc == "WP_") {
3189  m_IsWP = true;
3190  } else if (acc == "XR_") {
3191  m_IsXR = true;
3192  }
3193  }
3194  break;
3195  case CSeq_id::e_General:
3196  if ((*bi).IsAa() && !sid.GetGeneral().IsSkippable()) {
3198  }
3199  break;
3200  case CSeq_id::e_Gi:
3201  x_SetEntryInfo().SetGI();
3203  break;
3204  case CSeq_id::e_Ddbj:
3206  x_SetEntryInfo().SetGED();
3207  x_SetEntryInfo().SetDdbj();
3208  break;
3209  case CSeq_id::e_Prf:
3210  break;
3211  case CSeq_id::e_Pdb:
3212  x_SetEntryInfo().SetPDB();
3213  break;
3214  case CSeq_id::e_Tpg:
3216  break;
3217  case CSeq_id::e_Tpe:
3218  x_SetEntryInfo().SetTPE();
3220  break;
3221  case CSeq_id::e_Tpd:
3223  break;
3224  case CSeq_id::e_Gpipe:
3226  break;
3227  default:
3228  break;
3229  }
3230  if ( tsid && tsid->IsSetAccession() && tsid->IsSetVersion() && tsid->GetVersion() >= 1 ) {
3232  }
3233  if (typ != CSeq_id::e_Local && typ != CSeq_id::e_General) {
3235  }
3236  }
3237  }
3238 
3239  // search all source descriptors for genomic source
3240  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_Source);
3241  desc_ci && !m_pEntryInfo->IsGenomic();
3242  ++desc_ci) {
3243  if (desc_ci->GetSource().IsSetGenome()
3244  && desc_ci->GetSource().GetGenome() == CBioSource::eGenome_genomic) {
3246  }
3247  }
3248 
3249  // search genome build and annotation pipeline user object descriptors
3250  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_User);
3251  desc_ci && !m_pEntryInfo->IsGpipe();
3252  ++desc_ci) {
3253  if ( desc_ci->GetUser().IsSetType() ) {
3254  const CUser_object& obj = desc_ci->GetUser();
3255  const CObject_id& oi = obj.GetType();
3256  if ( ! oi.IsStr() ) continue;
3257  if ( NStr::CompareNocase(oi.GetStr(), "GenomeBuild") == 0 ) {
3259  } else if ( NStr::CompareNocase(oi.GetStr(), "StructuredComment") == 0 ) {
3260  ITERATE (CUser_object::TData, field, obj.GetData()) {
3261  if ((*field)->IsSetLabel() && (*field)->GetLabel().IsStr()) {
3262  if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "Annotation Pipeline")) {
3263  if (NStr::EqualNocase((*field)->GetData().GetStr(), "NCBI eukaryotic genome annotation pipeline")) {
3265  }
3266  }
3267  }
3268  }
3269  }
3270  }
3271  }
3272 
3273  // examine features for location gi, product gi, and locus tag
3274  for (CFeat_CI feat_ci (seh);
3276  ++feat_ci) {
3277  if (s_SeqLocHasGI(feat_ci->GetLocation())) {
3279  }
3280  if (feat_ci->IsSetProduct() && s_SeqLocHasGI(feat_ci->GetProduct())) {
3282  }
3283  if (feat_ci->IsSetData() && feat_ci->GetData().IsGene()
3284  && feat_ci->GetData().GetGene().IsSetLocus_tag()
3285  && !NStr::IsBlank (feat_ci->GetData().GetGene().GetLocus_tag())) {
3287  }
3288  }
3289 
3290  if ( m_PrgCallback ) {
3291  m_NumAlign = 0;
3292  for (CTypeConstIterator<CSeq_align> i(*m_TSE); i; ++i) {
3293  m_NumAlign++;
3294  }
3295  m_NumAnnot = 0;
3296  for (CTypeConstIterator<CSeq_annot> i(*m_TSE); i; ++i) {
3297  m_NumAnnot++;
3298  }
3299  m_NumBioseq = 0;
3300  for (CTypeConstIterator<CBioseq> i(*m_TSE); i; ++i) {
3301  m_NumBioseq++;
3302  }
3303  m_NumBioseq_set = 0;
3304  for (CTypeConstIterator<CBioseq_set> i(*m_TSE); i; ++i) {
3305  m_NumBioseq_set++;
3306  }
3307  m_NumDesc = 0;
3308  for (CTypeConstIterator<CSeqdesc> i(*m_TSE); i; ++i) {
3309  m_NumDesc++;
3310  }
3311  m_NumDescr = 0;
3312  for (CTypeConstIterator<CSeq_descr> i(*m_TSE); i; ++i) {
3313  m_NumDescr++;
3314  }
3315  m_NumFeat = 0;
3316  for (CTypeConstIterator<CSeq_feat> i(*m_TSE); i; ++i) {
3317  m_NumFeat++;
3318  }
3319  m_NumGraph = 0;
3320  for (CTypeConstIterator<CSeq_graph> i(*m_TSE); i; ++i) {
3321  m_NumGraph++;
3322  }
3325  m_NumGraph;
3326  }
3327 
3328  if (CNcbiApplication::Instance()->GetProgramDisplayName() == "table2asn") {
3329  m_IsTbl2Asn = true;
3330  }
3331 }
3332 
3333 
3335 {
3336  m_Scope.Reset(new CScope(*m_ObjMgr));
3337  m_Scope->AddTopLevelSeqEntry(*const_cast<CSeq_entry*>(&se));
3338  m_Scope->AddDefaults();
3339 }
3340 
3341 
3343 {
3344  m_IsStandaloneAnnot = true;
3345  if (! m_Scope) {
3346  m_Scope.Reset(& sah.GetScope());
3347  }
3349  m_TSE.Reset(new CSeq_entry); // set a dummy Seq-entry
3351 }
3352 
3353 
3355 {
3356  m_Scope.Reset(new CScope(*m_ObjMgr));
3357  CRef<CSeq_entry> tmp_entry(new CSeq_entry());
3358  tmp_entry->SetSeq().Assign(seq);
3359  m_TSE.Reset(tmp_entry);
3361  Setup(m_TSEH);
3362  return m_TSEH;
3363 }
3364 
3365 
3367 (const CSeq_loc& loc,
3368  const CSerialObject& obj)
3369 {
3370  for ( CSeq_loc_CI lit(loc); lit; ++lit ) {
3371  const CSeq_id& id1 = lit.GetSeq_id();
3372  CSeq_loc_CI lit2 = lit;
3373  for ( ++lit2; lit2; ++lit2 ) {
3374  const CSeq_id& id2 = lit2.GetSeq_id();
3375  if ( IsSameBioseq(id1, id2, m_Scope) && !id1.Match(id2) ) {
3378  "Two ids refer to the same bioseq but are of "
3379  "different type", obj);
3380  }
3381  }
3382  if (IsTemporary(id1)) {
3384  "Feature locations should not use Seq-ids that will be stripped during ID load", obj);
3385  }
3386  }
3389  "Feature location intervals should all be on the same sequence", obj);
3390  }
3391 }
3392 
3393 
3395 {
3396  return validator::IsInOrganelleSmallGenomeSet(id, scope);
3397 }
3398 
3399 
3400 // all ids in a location should point to the same sequence, unless the sequences are
3401 // in an organelle small genome set
3402 bool CValidError_imp::BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope)
3403 {
3404  return validator::BadMultipleSequenceLocation(loc, scope);
3405 }
3406 
3407 
3408 bool CValidError_imp::x_IsFarFetchFailure (const CSeq_loc& loc)
3409 {
3411  && IsFarLocation(loc, GetTSEH())) {
3412  return true;
3413  } else {
3414  return false;
3415  }
3416 }
3417 
3418 
3419 //LCOV_EXCL_START
3420 // not used by asnvalidate, used by external programs
3422 {
3423  bool rval = false;
3424  Setup(se);
3425  CValidError_bioseq bioseq_validator(*this);
3427  while (bi) {
3428  rval |= bioseq_validator.GetTSANStretchErrors(*(bi->GetCompleteBioseq()));
3429  ++bi;
3430  }
3431  return rval;
3432 }
3433 
3434 
3436 {
3437  CSeq_entry_Handle seh = Setup(seq);
3438  CValidError_bioseq bioseq_validator(*this);
3439  return bioseq_validator.GetTSANStretchErrors(*(seh.GetSeq().GetCompleteBioseq()));
3440 }
3441 
3442 
3444 {
3445  bool rval = false;
3446  Setup(se);
3447  CValidError_feat feat_validator(*this);
3448  CFeat_CI fi(se);
3449  while (fi) {
3450  CBioseq_Handle bsh = se.GetScope().GetBioseqHandle(fi->GetLocation());
3451  if (bsh) {
3452  rval |= feat_validator.GetTSACDSOnMinusStrandErrors(*(fi->GetSeq_feat()), *(bsh.GetCompleteBioseq()));
3453  }
3454  ++fi;
3455  }
3456 
3457  return rval;
3458 }
3459 
3460 
3462 {
3463  CSeq_entry_Handle seh = Setup(seq);
3464  CValidError_feat feat_validator(*this);
3465  return feat_validator.GetTSACDSOnMinusStrandErrors(f, *(seh.GetSeq().GetCompleteBioseq()));
3466 }
3467 
3468 
3470 {
3471  bool rval = false;
3472  Setup(se);
3473  CValidError_bioseq bioseq_validator(*this);
3475  while (bi) {
3476  rval |= bioseq_validator.GetTSAConflictingBiomolTechErrors(*(bi->GetCompleteBioseq()));
3477  ++bi;
3478  }
3479  return rval;
3480 }
3481 
3482 
3484 {
3485  CSeq_entry_Handle seh = Setup(seq);
3486  CValidError_bioseq bioseq_validator(*this);
3487  return bioseq_validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
3488 }
3489 //LCOV_EXCL_STOP
3490 
3491 const string kTooShort = "Too Short";
3492 const string kMissingPrimers = "Missing Primers";
3493 const string kMissingCountry = "Missing Country";
3494 const string kMissingVoucher = "Missing Voucher";
3495 const string kBadCollectionDate = "Bad Collection Date";
3496 const string kTooManyNs = "Too Many Ns";
3497 const string kMissingOrderAssignment = "Missing Order Assignment";
3498 const string kLowTrace = "Low Trace";
3499 const string kFrameShift = "Frame Shift";
3500 const string kStructuredVoucher = "Structured Voucher";
3501 
3502 #define ADD_BARCODE_ERR(TestName) \
3503  PostErr(eDiag_Warning, eErr_GENERIC_Barcode##TestName, k##TestName, sq); \
3504  if (!msg.empty()) { \
3505  msg += ","; \
3506  } \
3507  msg += k##TestName;
3508 
3510 {
3511  TBarcodeResults results = GetBarcodeValues(seh);
3512  for (auto r : results) {
3513  const CBioseq& sq = *(r.bsh.GetCompleteBioseq());
3514  if (BarcodeTestFails(r)){
3515  string msg;
3516  if (r.length) {
3517  ADD_BARCODE_ERR(TooShort)
3518  }
3519  if (r.primers) {
3520  ADD_BARCODE_ERR(MissingPrimers)
3521  }
3522  if (r.country) {
3523  ADD_BARCODE_ERR(MissingCountry)
3524  }
3525  if (r.voucher) {
3526  ADD_BARCODE_ERR(MissingVoucher)
3527  }
3528  if (!r.percent_n.empty()) {
3530  if (!msg.empty()) {
3531  msg += ",";
3532  }
3533  msg += kTooManyNs + ":" + r.percent_n;
3534  }
3535  if (r.collection_date) {
3536  ADD_BARCODE_ERR(BadCollectionDate)
3537  }
3538  if (r.order_assignment) {
3539  ADD_BARCODE_ERR(MissingOrderAssignment)
3540  }
3541  if (r.low_trace) {
3542  ADD_BARCODE_ERR(LowTrace)
3543  }
3544  if (r.frame_shift) {
3545  ADD_BARCODE_ERR(FrameShift)
3546  }
3547  if (!r.structured_voucher) {
3548  ADD_BARCODE_ERR(StructuredVoucher)
3549  }
3550  PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestFails, "FAIL (" + msg + ")", sq);
3551  } else {
3553  }
3554  }
3555 }
3556 
3557 
3561 bool CValidError_imp::IsGPS() const { return GetEntryInfo().IsGPS(); }
3562 bool CValidError_imp::IsGED() const { return GetEntryInfo().IsGED(); }
3563 bool CValidError_imp::IsPDB() const { return GetEntryInfo().IsPDB(); }
3566 bool CValidError_imp::IsEmbl() const { return GetEntryInfo().IsEmbl(); }
3567 bool CValidError_imp::IsDdbj() const { return GetEntryInfo().IsDdbj(); }
3568 bool CValidError_imp::IsTPE() const { return GetEntryInfo().IsTPE(); }
3569 bool CValidError_imp::IsNC() const { return m_IsNC; }
3570 bool CValidError_imp::IsNG() const { return m_IsNG; }
3571 bool CValidError_imp::IsNM() const { return m_IsNM; }
3572 bool CValidError_imp::IsNP() const { return m_IsNP; }
3573 bool CValidError_imp::IsNR() const { return m_IsNR; }
3574 bool CValidError_imp::IsNS() const { return m_IsNS; }
3575 bool CValidError_imp::IsNT() const { return m_IsNT; }
3576 bool CValidError_imp::IsNW() const { return m_IsNW; }
3577 bool CValidError_imp::IsNZ() const { return m_IsNZ; }
3578 bool CValidError_imp::IsWP() const { return m_IsWP; }
3579 bool CValidError_imp::IsXR() const { return m_IsXR; }
3580 bool CValidError_imp::IsGI() const { return GetEntryInfo().IsGI(); }
3582 bool CValidError_imp::IsGpipe() const { return GetEntryInfo().IsGpipe(); }
3595 
3596 
3597 
3598 // =============================================================================
3599 // CValidError_base Implementation
3600 // =============================================================================
3601 
3602 
3604  m_Imp(imp), m_Scope(imp.GetScope())
3605 {
3606 }
3607 
3608 
3610 {
3611 }
3612 
3613 
3615 (EDiagSev sv,
3616  EErrType et,
3617  const string& msg,
3618  const CSerialObject& obj)
3619 {
3620  m_Imp.PostErr(sv, et, msg, obj);
3621 }
3622 
3623 
3624 //void CValidError_base::PostErr
3625 //(EDiagSev sv,
3626 // EErrType et,
3627 // const string& msg,
3628 // TDesc ds)
3629 //{
3630 // m_Imp.PostErr(sv, et, msg, ds);
3631 //}
3632 
3633 
3635 (EDiagSev sv,
3636  EErrType et,
3637  const string& msg,
3638  const CSeq_feat& ft)
3639 {
3640  m_Imp.PostErr(sv, et, msg, ft);
3641 }
3642 
3643 
3645 (EDiagSev sv,
3646  EErrType et,
3647  const string& msg,
3648  const CBioseq& sq)
3649 {
3650  m_Imp.PostErr(sv, et, msg, sq);
3651 }
3652 
3653 
3655 (EDiagSev sv,
3656  EErrType et,
3657  const string& msg,
3658  const CSeq_entry& ctx,
3659  const CSeqdesc& ds)
3660 {
3661  m_Imp.PostErr(sv, et, msg, ctx, ds);
3662 }
3663 
3664 
3666 (EDiagSev sv,
3667  EErrType et,
3668  const string& msg,
3669  const CBioseq_set& set)
3670 {
3671  m_Imp.PostErr(sv, et, msg, set);
3672 }
3673 
3674 
3676 (EDiagSev sv,
3677  EErrType et,
3678  const string& msg,
3679  const CSeq_annot& annot)
3680 {
3681  m_Imp.PostErr(sv, et, msg, annot);
3682 }
3683 
3685 (EDiagSev sv,
3686  EErrType et,
3687  const string& msg,
3688  const CSeq_graph& graph)
3689 {
3690  m_Imp.PostErr(sv, et, msg, graph);
3691 }
3692 
3693 
3695 (EDiagSev sv,
3696  EErrType et,
3697  const string& msg,
3698  const CBioseq& sq,
3699  const CSeq_graph& graph)
3700 {
3701  m_Imp.PostErr(sv, et, msg, sq, graph);
3702 }
3703 
3704 
3706 (EDiagSev sv,
3707  EErrType et,
3708  const string& msg,
3709  const CSeq_align& align)
3710 {
3711  m_Imp.PostErr(sv, et, msg, align);
3712 }
3713 
3714 
3716 (EDiagSev sv,
3717  EErrType et,
3718  const string& msg,
3719  const CSeq_entry& entry)
3720 {
3721  m_Imp.PostErr(sv, et, msg, entry);
3722 }
3723 
3724 CCacheImpl&
3726 {
3727  return m_Imp.GetCache();
3728 }
3729 
3730 
3732 {
3733  CSeq_entry_Handle parent = seh.GetParentEntry();
3734  if (!parent || !parent.IsSet()) {
3735  return false;
3736  }
3738  if (!pset) {
3739  return false;
3740  }
3741  if (pset->IsSetSeq_set() && pset->GetSeq_set().size() > 10) {
3742  return true;
3743  } else {
3744  return s_HasTopSetSiblings(parent);
3745  }
3746 }
3747 
3748 
3750 {
3751  CSeq_entry_Handle appropriate_parent;
3752 
3753  CSeq_entry_Handle np;
3754  CSeq_entry_Handle gps;
3755  if (seh.IsSet() && seh.GetSet().IsSetClass()) {
3756  if (seh.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3757  np = seh;
3758  } else if (s_IsGoodTopSetClass(seh.GetSet().GetClass())) {
3759  gps = seh;
3760  }
3761  } else if (seh.IsSeq()) {
3763  if (p && p.IsSet() && p.GetSet().IsSetClass()) {
3765  np = p;
3766  } else if (s_IsGoodTopSetClass(p.GetSet().GetClass())) {
3767  gps = p;
3768  }
3769  }
3770  }
3771  if (gps) {
3772  appropriate_parent = gps;
3773  } else if (np) {
3775  if (gp && gp.IsSet() && gp.GetSet().IsSetClass() &&
3777  appropriate_parent = gp;
3778  } else {
3779  appropriate_parent = np;
3780  }
3781  } else {
3782  appropriate_parent = seh;
3783  }
3784  return appropriate_parent;
3785 }
3786 
3787 
3790  CConstRef<CPubdesc> pub)
3791 {
3792  // first, try to receive from cache
3794  m_pubdescCache.find(pub);
3795  if( find_iter != m_pubdescCache.end() ) {
3796  return *find_iter->second;
3797  }
3798 
3799  CRef<CPubdescInfo> pInfo(new CPubdescInfo);
3801  *pub, pInfo->m_pmids, pInfo->m_muids,
3802  pInfo->m_serials, pInfo->m_published_labels,
3803  pInfo->m_unpublished_labels);
3804  m_pubdescCache[pub] = pInfo;
3805  return *pInfo;
3806 }
3807 
3808 bool
3810  const SFeatKey & rhs) const
3811 {
3812  if( feat_type != rhs.feat_type ) {
3813  return feat_type < rhs.feat_type;
3814  } else if( feat_subtype != rhs.feat_subtype ) {
3815  return feat_subtype < rhs.feat_subtype;
3816  } else {
3817  return bioseq_h < rhs.bioseq_h;
3818  }
3819 }
3820 
3821 bool
3823  const SFeatKey & rhs) const
3824 {
3825  return (feat_type == rhs.feat_type) &&
3826  (feat_subtype == rhs.feat_subtype) && (bioseq_h == rhs.bioseq_h);
3827 }
3828 
3829 const CCacheImpl::TFeatValue &
3831  const CCacheImpl::SFeatKey & featKey)
3832 {
3833  // check common case where already in the cache
3834  TFeatCache::iterator find_iter = m_featCache.find(featKey);
3835  if( find_iter != m_featCache.end() ) {
3836  return find_iter->second;
3837  }
3838 
3839  // check if bioseq already processed, but had no entry requested above
3840  SFeatKey bioseq_check_key(
3842  TFeatCache::const_iterator bioseq_find_iter =
3843  m_featCache.find(bioseq_check_key);
3844  if( bioseq_find_iter != m_featCache.end() ) {
3845  // bioseq was already processed,
3846  // it just happened to not have an entry here
3847  return kEmptyFeatValue;
3848  }
3849 
3850  // bioseq never added to cache, so calculate that now
3851 
3852  // to avoid expensive constructions of CFeat_CI's,
3853  // we iterate through all the seqs on
3854  // the bioseq and load them into the cache.
3855  CFeat_CI feat_ci(featKey.bioseq_h);
3856  for( ; feat_ci; ++feat_ci ) {
3857  SFeatKey inner_feat_key(
3858  feat_ci->GetFeatType(), feat_ci->GetFeatSubtype(), featKey.bioseq_h);
3859 
3860  m_featCache[inner_feat_key].push_back(*feat_ci);
3861 
3862  // also add "don't care" entries for partial searches
3863  // (e.g. if caller just wants to search on type but not on
3864  // subtype they can set subtype to kAnyFeatSubtype)
3865  SFeatKey any_type_key = inner_feat_key;
3866  any_type_key.feat_type = kAnyFeatType;
3867  m_featCache[any_type_key].push_back(*feat_ci);
3868 
3869  SFeatKey any_subtype_key = inner_feat_key;
3870  any_subtype_key.feat_subtype = kAnyFeatSubtype;
3871  m_featCache[any_subtype_key].push_back(*feat_ci);
3872 
3873  // for when the caller wants all feats on a bioseq
3874  SFeatKey any_type_or_subtype_key = inner_feat_key;
3875  any_type_or_subtype_key.feat_type = kAnyFeatType;
3876  any_type_or_subtype_key.feat_subtype = kAnyFeatSubtype;
3877  m_featCache[any_type_or_subtype_key].push_back(*feat_ci);
3878  }
3879 
3880  // in case a bioseq has no features, we add a dummy key just to
3881  // remember that so we don't use CFeat_CI again on the same bioseq
3882  m_featCache[bioseq_check_key]; // gets default val
3883 
3884  return m_featCache[featKey];
3885 }
3886 
3889  const vector<SFeatKey> &featKeys)
3890 {
3891  if( featKeys.empty() ) {
3892  return new TFeatValue;
3893  }
3894 
3895  // all featKeys must have the same bioseq
3896  const CBioseq_Handle & bioseq_h = featKeys[0].bioseq_h;
3897  ITERATE(vector<SFeatKey>, feat_it, featKeys) {
3898  if( feat_it->bioseq_h != bioseq_h ) {
3899  throw runtime_error("GetFeatFromCacheMulti must be called with only 1 bioseq in its args");
3900  }
3901  }
3902 
3903  // set prevents dups
3904  set<TFeatValue::value_type> set_of_feats;
3905 
3906  // combine the answers from every key into the set
3907  ITERATE(vector<SFeatKey>, key_it, featKeys ) {
3908  const TFeatValue & feat_value = GetFeatFromCache(*key_it);
3909  copy(BEGIN_COMMA_END(feat_value), inserter(
3910  set_of_feats, set_of_feats.begin()));
3911  }
3912 
3913  // go through every feature on the bioseq and remember any that match what's in the set
3914  // (The purpose of this step is to return the feats in the same
3915  // order they were on the original bioseq. In the future, we may
3916  // consider adding a flag to avoid sorting for time purposes).
3917  AutoPtr<TFeatValue> answer(new TFeatValue);
3918  SFeatKey all_feats_key(
3919  kAnyFeatType, kAnyFeatSubtype, bioseq_h);
3920  const TFeatValue & all_feats_vec = GetFeatFromCache(all_feats_key);
3921  ITERATE(TFeatValue, feat_it, all_feats_vec) {
3922  if( set_of_feats.find(*feat_it) != set_of_feats.end() ) {
3923  answer->push_back(*feat_it);
3924  }
3925  }
3926 
3927  return answer;
3928 }
3929 
3930 
3931 //LCOV_EXCL_START
3932 //not used
3933 bool
3935 {
3936  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3937  return m_eFeatKeyStr < rhs.m_eFeatKeyStr;
3938  }
3939  if( m_bioseq != rhs.m_bioseq ) {
3940  return m_bioseq < rhs.m_bioseq;
3941  }
3942  return s_QuickStringLess(m_feat_str, rhs.m_feat_str);
3943 }
3944 
3945 
3946 bool
3948 {
3949  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3950  return false;
3951  }
3952  if( m_bioseq != rhs.m_bioseq ) {
3953  return false;
3954  }
3955  return (m_feat_str == rhs.m_feat_str);
3956 }
3957 
3958 
3959 const CCacheImpl::TFeatValue &
3961  const SFeatStrKey & feat_str_key, const CTSE_Handle & tse_arg)
3962 {
3963  const CBioseq_Handle & search_bsh = feat_str_key.m_bioseq;
3964 
3965  // caller must give us something to work with
3966  _ASSERT(search_bsh || tse_arg);
3967 
3968  const CTSE_Handle & tse = (tse_arg ? tse_arg : search_bsh.GetTSE_Handle());
3969 
3970  // load cache if empty
3972  // (for now just indexes genes, but more may be added in the future)
3974  AutoPtr<CFeat_CI> p_gene_ci;
3975  // if we have TSE, get all features on it; otherwise, just get
3976  // the features from the bioseq
3977  if( tse ) {
3978  p_gene_ci.reset(new CFeat_CI(tse, sel));
3979  } else {
3980  p_gene_ci.reset(new CFeat_CI(search_bsh, sel));
3981  }
3982  CFeat_CI & gene_ci = *p_gene_ci; // for convenience
3983 
3984  for( ; gene_ci; ++gene_ci ) {
3985  CBioseq_Handle bsh = tse.GetScope().GetBioseqHandle(gene_ci->GetLocation());
3986  string label;
3987  const CGene_ref & gene_ref = gene_ci->GetData().GetGene();
3988 
3989  // for each one, add an entry for using given Bioseq and the
3990  // kAnyBioseq (so users can search on any bioseq)
3991  gene_ref.GetLabel(&label);
3992  SFeatStrKey label_key(eFeatKeyStr_Label, bsh, label);
3993  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
3994  if( bsh ) {
3995  label_key.m_bioseq = kAnyBioseq;
3996  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
3997  }
3998 
3999  const string & locus_tag = (
4000  gene_ref.IsSetLocus_tag() ? gene_ref.GetLocus_tag() :
4001  kEmptyStr);
4002  SFeatStrKey locus_tag_key(eFeatKeyStr_LocusTag, bsh, locus_tag);
4003  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
4004  if( bsh ) {
4005  locus_tag_key.m_bioseq = kAnyBioseq;
4006  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
4007  }
4008  }
4009  }
4010 
4011  // get from cache, if possible
4013  m_featStrKeyToFeatsCache.find(feat_str_key);
4014  if( find_iter != m_featStrKeyToFeatsCache.end() ) {
4015  return find_iter->second;
4016  } else {
4017  // nothing found
4018  return kEmptyFeatValue;
4019  }
4020 }
4021 
4022 
4025  const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,
4026  const CTSE_Handle & tse)
4027 {
4028  // load cache if empty
4029  if( m_featToBioseqCache.empty() ) {
4030  CBioseq_CI bioseq_ci(tse);
4031  for( ; bioseq_ci; ++bioseq_ci ) {
4032  CFeat_CI feat_ci(*bioseq_ci);
4033  for( ; feat_ci; ++feat_ci ) {
4034  m_featToBioseqCache[*feat_ci].insert(*bioseq_ci);
4035  }
4036  }
4037  }
4038 
4039  // we're being given the map to a feature, so we should've loaded
4040  // at least one feature when we loaded the cache
4042 
4043  // load from the cache
4045  m_featToBioseqCache.find(feat_to_bioseq_key);
4046  if( find_iter != m_featToBioseqCache.end() ) {
4047  return find_iter->second;
4048  } else {
4049  const static TFeatToBioseqValue kEmptyFeatToBioseqCache;
4050  return kEmptyFeatToBioseqCache;
4051  }
4052 }
4053 //LCOV_EXCL_STOP
4054 
4058  const CTSE_Handle & tse)
4059 {
4060  _ASSERT(tse);
4061 
4062  // load cache if empty
4063  if( m_IdToBioseqCache.empty() ) {
4064  CBioseq_CI bioseq_ci(tse);
4065  for( ; bioseq_ci; ++bioseq_ci ) {
4066  const CBioseq_Handle::TId & ids = bioseq_ci->GetId();
4067  ITERATE(CBioseq_Handle::TId, id_it, ids) {
4068  m_IdToBioseqCache[id_it->GetSeqId()] = *bioseq_ci;
4069  }
4070  }
4071  }
4072 
4073  // there should be at least one Bioseq otherwise there wouldn't
4074  // be anything to validate.
4076 
4078  if( find_iter != m_IdToBioseqCache.end() ) {
4079  return find_iter->second;
4080  } else {
4081  static const TIdToBioseqValue s_EmptyResult;
4082  return s_EmptyResult;
4083  }
4084 }
4085 
4088  CScope *scope, const CSeq_loc& loc, const CTSE_Handle & tse)
4089 {
4090  _ASSERT(scope || tse);
4091  if( ! tse || (!tse.GetTopLevelEntry().IsSet() && !tse.GetTopLevelEntry().IsSeq())) {
4092  // fall back on old style
4093  return BioseqHandleFromLocation(scope, loc);
4094  }
4095 
4096 
4097  for ( CSeq_loc_CI citer (loc); citer; ++citer) {
4098  CConstRef<CSeq_id> id(&citer.GetSeq_id());
4099  const TIdToBioseqValue & bioseq = GetIdToBioseq(id, tse);
4100  if( bioseq ) {
4101  return bioseq;
4102  }
4103  }
4104 
4105  // nothing found, so fall back on old style if possible
4106  if( scope ) {
4107  return BioseqHandleFromLocation(scope, loc);
4108  } else {
4109  return kEmptyBioseqHandle;
4110  }
4111 }
4112 
4113 
4115 {
4117  m_featCache.clear();
4121 }
4122 
4123 
4124 
4125 
4126 
4127 END_SCOPE(validator)
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
EErrType
@ eErr_SEQ_FEAT_WrongQualOnImpFeat
@ eErr_SEQ_DESCR_ObsoleteSourceQual
@ eErr_SEQ_DESCR_ObsoleteSourceLocation
@ eErr_SEQ_INST_FarFetchFailure
@ eErr_SEQ_FEAT_WholeLocation
@ eErr_SEQ_INST_ShortSeq
@ eErr_GENERIC_MissingPubRequirement
@ eErr_SEQ_FEAT_EcNumberProblem
@ eErr_SEQ_FEAT_DuplicateAnticodonInterval
@ eErr_SEQ_INST_CompleteGenomeHasGaps
@ eErr_SEQ_FEAT_CDShasTooManyXs
@ eErr_SEQ_FEAT_TranslExceptPhase
@ eErr_SEQ_FEAT_MinusStrandProtein
@ eErr_SEQ_INST_CompleteTitleProblem
@ eErr_SEQ_PKG_EmptySet
@ eErr_SEQ_DESCR_UnwantedCompleteFlag
@ eErr_SEQ_FEAT_GeneXrefWithoutLocus
@ eErr_SEQ_FEAT_BadLocation
@ eErr_SEQ_FEAT_GenesInconsistent
@ eErr_SEQ_INST_HighNContentStretch
@ eErr_SEQ_PKG_NoBioseqFound
@ eErr_SEQ_FEAT_PseudoRnaHasProduct
@ eErr_SEQ_DESCR_InconsistentBioSources
@ eErr_GENERIC_PastReleaseDate
@ eErr_SEQ_DESCR_BioSourceDbTagConflict
@ eErr_SEQ_FEAT_UnknownImpFeatQual
@ eErr_SEQ_FEAT_DuplicateExonInterval
@ eErr_GENERIC_UnnecessaryPubEquiv
@ eErr_SEQ_DESCR_BioSourceOnProtein
@ eErr_SEQ_DESCR_LatLonRange
@ eErr_SEQ_FEAT_UnnecessaryTranslExcept
@ eErr_SEQ_GRAPH_GraphBioseqId
@ eErr_SEQ_FEAT_MixedStrand
@ eErr_SEQ_FEAT_BadRRNAcomponentOrder
@ eErr_SEQ_DESCR_DuplicatePCRPrimerSequence
@ eErr_SEQ_FEAT_BadGeneOntologyFormat
@ eErr_SEQ_DESCR_LatLonCountry
@ eErr_SEQ_PKG_NucProtSetHasTitle
@ eErr_SEQ_FEAT_IllegalDbXref
@ eErr_GENERIC_SgmlPresentInText
@ eErr_SEQ_FEAT_BadAnticodonAA
@ eErr_SEQ_FEAT_MissingCDSproduct
@ eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap
@ eErr_SEQ_FEAT_TranslExceptAndRnaEditing
@ eErr_GENERIC_BarcodeTooManyNs
@ eErr_SEQ_PKG_BioseqSetClassNotSet
@ eErr_SEQ_DESCR_NoOrgFound
@ eErr_SEQ_FEAT_MissingProteinName
@ eErr_SEQ_DESCR_BadPCRPrimerSequence
@ eErr_SEQ_FEAT_GeneXrefWithoutGene
@ eErr_SEQ_DESCR_TransgenicProblem
@ eErr_SEQ_PKG_MissingSetTitle
@ eErr_SEQ_FEAT_InvalidQualifierValue
@ eErr_SEQ_FEAT_GeneOntologyTermMissingGOID
@ eErr_SEQ_FEAT_ProtRefHasNoData
@ eErr_SEQ_GRAPH_GraphSeqLocLen
@ eErr_SEQ_DESCR_InvalidForType
@ eErr_SEQ_DESCR_LatLonValue
@ eErr_SEQ_FEAT_TransLen
@ eErr_SEQ_FEAT_FeatureCitationProblem
@ eErr_SEQ_DESCR_IdenticalInstitutionCode
@ eErr_SEQ_PKG_ImproperlyNestedSets
@ eErr_SEQ_INST_UnknownLengthGapNot100
@ eErr_SEQ_FEAT_WrongQualOnFeature
@ eErr_SEQ_FEAT_MultipleProtRefs
@ eErr_SEQ_FEAT_MultipleEquivPublications
@ eErr_SEQ_PKG_SeqSubmitWithWgsSet
@ eErr_SEQ_PKG_InconsistentMoltypeSet
@ eErr_SEQ_INST_ConflictingBiomolTech
@ eErr_SEQ_FEAT_MissingQualOnImpFeat
@ eErr_SEQ_PKG_INSDRefSeqPackaging
@ eErr_SEQ_FEAT_LocusCollidesWithLocusTag
@ eErr_SEQ_PKG_GPSnonGPSPackaging
@ eErr_SEQ_DESCR_BadCollectionDate
@ eErr_SEQ_FEAT_MultipleEquivBioSources
@ eErr_SEQ_FEAT_CDSwithNoMRNAOverlap
@ eErr_SEQ_DESCR_BadInstitutionCode
@ eErr_SEQ_FEAT_PeptideFeatOutOfFrame
@ eErr_SEQ_FEAT_ProteinNameHasPMID
@ eErr_SEQ_FEAT_RepeatRegionNeedsNote
@ eErr_SEQ_DESCR_BadAltitude
@ eErr_SEQ_FEAT_GeneXrefStrandProblem
@ eErr_SEQ_FEAT_MissingTrnaAA
@ eErr_GENERIC_NonAsciiAsn
@ eErr_SEQ_FEAT_CDSwithMultipleMRNAs
@ eErr_SEQ_FEAT_CollidingFeatureIDs
@ eErr_SEQ_DESCR_IncorrectlyFormattedVoucherID
@ eErr_SEQ_FEAT_OrfCdsHasProduct
@ eErr_SEQ_FEAT_ImproperBondLocation
@ eErr_SEQ_PKG_GraphPackagingProblem
@ eErr_SEQ_INST_OverlappingDeltaRange
@ eErr_SEQ_FEAT_BadTranssplicedInterval
@ eErr_SEQ_INST_SeqLocLength
@ eErr_SEQ_DESCR_MultipleTaxonIDs
@ eErr_SEQ_DESCR_BadKeyword
@ eErr_SEQ_FEAT_UnknownImpFeatKey
@ eErr_SEQ_DESCR_Inconsistent
@ eErr_SEQ_PKG_ArchaicFeatureLocation
@ eErr_GENERIC_BadDate
@ eErr_GENERIC_BarcodeTestFails
@ eErr_SEQ_FEAT_NestedSeqLocMix
@ eErr_SEQ_FEAT_ShortIntron
@ eErr_SEQ_FEAT_UnknownFeatureQual
@ eErr_SEQ_DESCR_MultipleChromosomes
@ eErr_SEQ_FEAT_Range
@ eErr_SEQ_FEAT_InconsistentGeneOntologyTermAndId
@ eErr_SEQ_PKG_MisplacedMolInfo
@ eErr_GENERIC_EmbeddedScript
@ eErr_GENERIC_BarcodeTestPasses
@ eErr_SEQ_GRAPH_GraphAbove
@ eErr_SEQ_FEAT_FeatureInsideGap
@ eErr_SEQ_FEAT_DifferntIdTypesInSeqLoc
@ eErr_SEQ_FEAT_BadFullLengthFeature
@ eErr_SEQ_FEAT_RNAtype0
@ eErr_SEQ_FEAT_BadCharInAuthorName
@ eErr_SEQ_FEAT_FarLocation
@ eErr_SEQ_INST_BadHTGSeq
@ eErr_SEQ_FEAT_InvalidFuzz
@ eErr_SEQ_FEAT_InvalidInferenceValue
@ eErr_SEQ_FEAT_GeneXrefNeeded
@ eErr_SEQ_INST_UnexpectedIdentifierChange
@ eErr_SEQ_FEAT_InconsistentRRNAstrands
@ eErr_SEQ_PKG_ArchaicFeatureProduct
@ eErr_SEQ_DESCR_MultipleSourceQualifiers
@ eErr_SEQ_FEAT_BadRRNAcomponentOverlap
@ eErr_SEQ_FEAT_BadTrailingCharacter
@ eErr_SEQ_DESCR_WrongVoucherType
@ eErr_SEQ_INST_ProteinsHaveGeneralID
@ eErr_SEQ_GRAPH_GraphOutOfOrder
@ eErr_SEQ_FEAT_BadInternalCharacter
@ eErr_SEQ_DESCR_NoSourceDescriptor
@ eErr_SEQ_DESCR_BadCollectionCode
@ eErr_SEQ_FEAT_BadProteinName
@ eErr_SEQ_FEAT_FeatureProductInconsistency
@ eErr_GENERIC_PublicationInconsistency
@ eErr_GENERIC_BadSubmissionAuthorName
@ eErr_GENERIC_CollidingSerialNumbers
@ eErr_SEQ_PKG_ComponentMissingTitle
@ eErr_SEQ_DESCR_DBLinkMissingUserObject
@ eErr_SEQ_PKG_InternalGenBankSet
@ eErr_SEQ_DESCR_BioSourceMissing
@ eErr_SEQ_FEAT_BadAnticodonCodon
@ eErr_SEQ_FEAT_BadTrailingHyphen
@ eErr_SEQ_FEAT_OldLocusTagMismtach
@ eErr_SEQ_DESCR_MolInfoConflictsWithBioSource
@ eErr_SEQ_FEAT_UTRdoesNotAbutCDS
@ eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct
@ eErr_SEQ_FEAT_ConflictFlagSet
@ eErr_SEQ_FEAT_StrandOther
@ eErr_SEQ_PKG_FeaturePackagingProblem
@ eErr_SEQ_DESCR_MultipleNames
@ eErr_SEQ_INST_BadSeqIdFormat
@ eErr_SEQ_PKG_GenomicProductPackagingProblem
@ eErr_INTERNAL_Exception
@ eErr_SEQ_FEAT_BadEcNumberFormat
@ eErr_SEQ_FEAT_CDSproductPackagingProblem
@ eErr_SEQ_FEAT_RedundantFields
@ eErr_SEQ_INST_InternalNsInSeqRaw
@ eErr_SEQ_DESCR_BadOrgMod
@ eErr_SEQ_INST_TerminalNs
@ eErr_SEQ_DESCR_BadOrganelleLocation
@ eErr_SEQ_FEAT_NoNameForProtein
@ eErr_SEQ_FEAT_RptUnitRangeProblem
@ eErr_SEQ_FEAT_SeqLocOrder
@ eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem
@ eErr_SEQ_FEAT_CDSmRNAXrefLocationProblem
@ eErr_SEQ_PKG_SingleItemSet
@ eErr_SEQ_DESCR_BioSourceNeedsChromosome
@ eErr_SEQ_FEAT_VectorContamination
@ eErr_SEQ_FEAT_AbuttingIntervals
@ eErr_SEQ_FEAT_CDSrange
@ eErr_SEQ_FEAT_LocusTagProblem
@ eErr_SEQ_DESCR_BioSourceInconsistency
@ eErr_SEQ_FEAT_OnlyGeneXrefs
@ eErr_SEQ_FEAT_TranslExcept
@ eErr_SEQ_INST_InternalGapsInSeqRaw
@ eErr_SEQ_FEAT_GeneRefHasNoData
@ eErr_SEQ_INST_DuplicateSegmentReferences
@ eErr_SEQ_FEAT_TooManyInferenceAccessions
@ eErr_SEQ_FEAT_TerminalXDiscrepancy
@ eErr_SEQ_FEAT_MiscFeatureNeedsNote
@ eErr_SEQ_DESCR_CollidingPublications
@ eErr_SEQ_FEAT_GenomeSetMixedStrand
@ eErr_SEQ_FEAT_BadCharInAuthorLastName
@ eErr_SEQ_FEAT_HypotheticalProteinMismatch
@ eErr_SEQ_INST_TpaAssemblyProblem
@ eErr_SEQ_FEAT_MissingGeneXref
AutoPtr –.
Definition: ncbimisc.hpp:401
CAlign_CI –.
Definition: align_ci.hpp:63
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
CAuthor –.
Definition: Author.hpp:59
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
static void GetPubdescLabels(const CPubdesc &pd, vector< TEntrezId > &pmids, vector< TEntrezId > &muids, vector< int > &serials, vector< string > &published_labels, vector< string > &unpublished_labels)
For Publication Citations Get labels for a pubdesc.
Definition: cleanup.cpp:3140
Definition: Dbtag.hpp:53
bool GetDBFlags(bool &is_refseq, bool &is_src, string &correct_caps) const
Definition: Dbtag.cpp:327
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
CFeat_CI –.
Definition: feat_ci.hpp:64
void Clear()
Definition: gene_cache.hpp:89
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
Definition: gene_cache.cpp:106
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
CGraph_CI –.
Definition: graph_ci.hpp:234
CMappedFeat –.
Definition: mapped_feat.hpp:59
@Name_std.hpp User-defined methods of the data storage class.
Definition: Name_std.hpp:56
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CObjectManager –.
const string & GetDivision(void) const
Definition: Org_ref.cpp:164
bool IsSetDivision(void) const
Definition: Org_ref.cpp:159
@ eContent
Definition: Pub.hpp:66
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
static bool RequireLocationIntervalsInBiologicalOrder(ESubtype subtype)
static bool AllowAdjacentIntervals(ESubtype subtype)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
@ eContent
Definition: Seq_entry.hpp:93
void GetLabel(string *label, ELabelType type) const
Definition: Seq_entry.cpp:274
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
CSubmit_block –.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
Definition: tse_handle.cpp:205
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:604
CScope & GetScope(void) const
Returns scope.
Definition: tse_handle.hpp:325
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Thrown on an attempt to write unassigned data member.
Definition: exception.hpp:84
static string GetFeatureBioseqLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorContent(const CSeqdesc &ds)
static string GetFeatureLocationLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetFeatureProductLocLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorLabel(const CSeqdesc &ds, const CSeq_entry &ctx, CRef< CScope > scope, bool suppress_context)
static string GetFeatureContentLabel(const CSeq_feat &feat, CRef< CScope > scope)
static string GetFeatureIdLabel(const CSeq_feat &ft)
static string GetBioseqSetLabel(const CBioseq_set &st, CRef< CScope > scope, bool suppress_context)
void ValidateSeqAlign(const CSeq_align &align, int order=-1)
void ValidateSeqAnnot(const CSeq_annot_Handle &annot)
CCacheImpl & GetCache()
virtual ~CValidError_base()
static CSeq_entry_Handle GetAppropriateXrefParent(CSeq_entry_Handle seh)
CValidError_imp & m_Imp
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
CValidError_base(CValidError_imp &imp)
void ValidateBioseq(const CBioseq &seq)
bool GetTSAConflictingBiomolTechErrors(const CBioseq &seq)
bool GetTSANStretchErrors(const CBioseq &seq)
void ValidateBioseqSet(const CBioseq_set &seqset)
void ValidateSeqDesc(const CSeqdesc &desc, const CSeq_entry &ctx)
Validate descriptors as stand alone objects (no context)
void SetScope(CScope &scope)
void SetTSE(CSeq_entry_Handle seh)
bool GetTSACDSOnMinusStrandErrors(const CSeq_feat &feat, const CBioseq &seq)
static bool GetPrefixAndAccessionFromInferenceAccession(string inf_accession, string &prefix, string &accession)
void ValidateSeqFeat(const CSeq_feat &feat)
static vector< string > GetAccessionsFromInferenceString(string inference, string &prefix, string &remainder, bool &same_species)
void ValidateSeqGraph(const CSeq_graph &graph)
void x_ReportInvalidFuzz(const CPacked_seqint &packed_int, const CSerialObject &obj)
CRef< CObjectManager > m_ObjMgr
bool IsGED() const
void SetScope(const CSeq_entry &se)
void FindCollidingSerialNumbers(const CSerialObject &obj)
Definition: valid_pub.cpp:1323
const CSeq_entry_Handle & GetTSEH()
static bool BadMultipleSequenceLocation(const CSeq_loc &loc, CScope &scope)
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:372
static bool IsTSAIntermediate(const CBioseq &seq)
void x_CheckPackedInt(const CPacked_seqint &packed_int, SLocCheck &lc, const CSerialObject &obj)
static bool IsInOrganelleSmallGenomeSet(const CSeq_id &id, CScope &scope)
bool IsNC() const
const CBioSourceKind & BioSourceKind() const
bool IsNS() const
CRef< CScope > m_Scope
bool HasGiOrAccnVer() const
void SetTSE(const CSeq_entry_Handle &seh)
const SValidatorContext & GetContext() const
Definition: validatorp.cpp:209
CValidator::TProgressCallback m_PrgCallback
IValidError * m_ErrRepository
bool IsPDB() const
CConstRef< CSeq_feat > GetmRNAGivenProduct(const CBioseq &seq)
bool IsValidateAlignments() const
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id)
Definition: validatorp.cpp:253
void ValidateCitations(const CSeq_entry_Handle &seh)
bool DoesAnyFeatLocHaveGI() const
void FindNonAsciiText(const CSerialObject &obj)
void AddBioseqWithNoBiosource(const CBioseq &seq)
void ValidateSeqLocIds(const CSeq_loc &loc, const CSerialObject &obj)
bool GenerateGoldenFile() const
bool IsStandaloneAnnot() const
void x_DoBarcodeTests(CSeq_entry_Handle seh)
CConstRef< CSeq_annot > m_SeqAnnot
TSuppressed & SetSuppressed()
Definition: validatorp.cpp:194
bool IsNM() const
bool DoesAnyProductLocHaveGI() const
bool GetTSAConflictingBiomolTechErrors(const CSeq_entry_Handle &se)
bool x_IsSuppressed(CValidErrItem::TErrIndex errType) const
Definition: validatorp.cpp:365
void x_AddValidErrItem(EDiagSev sev, EErrType type, const string &msg, const string &desc, const CSerialObject &obj, const string &accession, const int version)
unique_ptr< CValidatorEntryInfo > m_pEntryInfo
bool IsNT() const
bool IsGenbank() const
void PostObjErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
TSuppressed m_SuppressedErrors
bool IsNZ() const
void Setup(const CSeq_entry_Handle &seh)
bool Validate(const CSeq_entry &se, const CCit_sub *cs=nullptr, CScope *scope=nullptr)
static bool IsWGSIntermediate(const CBioseq &seq)
CValidator::CProgressInfo m_PrgInfo
void ValidateDbxref(const CDbtag &xref, const CSerialObject &obj, bool biosource=false, const CSeq_entry *ctx=nullptr)
bool IsSerialNumberInComment(const string &comment)
bool IsGenomic() const
void ValidateTaxonomy(const CSeq_entry &se)
bool IsFarSequence(const CSeq_id &id)
Definition: validatorp.cpp:239
const CTSE_Handle & GetTSE_Handle()
size_t m_NumMisplacedFeatures
void FindEmbeddedScript(const CSerialObject &obj)
bool IsHugeFileMode() const
Definition: validatorp.cpp:216
void ValidateCitSub(const CCit_sub &cs, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
Definition: valid_pub.cpp:1078
void SetOptions(Uint4 options)
Definition: validatorp.cpp:275
bool m_ValidateInferenceAccessions
void ValidateSubmitBlock(const CSubmit_block &block, const CSeq_submit &ss)
bool IsNoCitSubPubs() const
bool IsNP() const
vector< CConstRef< CBioseq > > m_BioseqWithNoSource
void ValidateAffil(const CAffil::TStd &std, const CSerialObject &obj, const CSeq_entry *ctx)
Definition: valid_pub.cpp:988
CConstRef< CSeq_feat > GetCDSGivenProduct(const CBioseq &seq)
CBioseq_Handle GetLocalBioseqHandle(const CSeq_id &id)
Definition: validatorp.cpp:262
bool IsRefSeq() const
bool IsGPS() const
bool IsINSDInSep() const
bool IsNG() const