NCBI C++ ToolKit
validatorp.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validatorp.cpp 103011 2024-08-21 17:32:06Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27  *
28  * File Description:
29  * Implementation of private parts of the validator
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
38 
49 
50 #include <serial/iterator.hpp>
51 #include <serial/enumvalues.hpp>
52 
56 
58 
61 
62 #include <objects/seq/Bioseq.hpp>
64 #include <objects/seq/Seqdesc.hpp>
66 #include <objects/seq/Pubdesc.hpp>
67 #include <objects/seq/MolInfo.hpp>
74 
79 
81 
84 
85 #include <objmgr/bioseq_ci.hpp>
86 #include <objmgr/seqdesc_ci.hpp>
87 #include <objmgr/graph_ci.hpp>
88 #include <objmgr/seq_annot_ci.hpp>
89 #include <objmgr/util/feature.hpp>
90 #include <objmgr/util/sequence.hpp>
91 
92 #include <objmgr/feat_ci.hpp>
93 #include <objmgr/align_ci.hpp>
94 #include <objmgr/seq_vector.hpp>
95 #include <objmgr/scope.hpp>
96 
97 #include <objects/pub/Pub.hpp>
99 
100 #include <objects/biblio/Author.hpp>
111 #include <objects/biblio/Title.hpp>
113 #include <objects/biblio/Affil.hpp>
116 #include <objects/taxon3/taxon3.hpp>
118 
125 
126 #include <objtools/error_codes.hpp>
132 #include <util/sgml_entity.hpp>
133 #include <util/line_reader.hpp>
134 #include <util/util_misc.hpp>
135 #include <util/static_set.hpp>
136 
137 #include <algorithm>
138 
139 
140 #include <serial/iterator.hpp>
141 
142 #define NCBI_USE_ERRCODE_X Objtools_Validator
143 
146 BEGIN_SCOPE(validator)
147 using namespace sequence;
148 
149 namespace {
150  // File-local comparator object, shared so that a new PQuickStringLess
 // does not have to be created for every comparison.
151  PQuickStringLess s_QuickStringLess;
152 };
153 
154 
155 // =============================================================================
156 // CValidError_imp Public
157 // =============================================================================
158 
164 
168 
170 (CObjectManager& objmgr,
171  shared_ptr<SValidatorContext> pContext,
172  IValidError* errs,
173  Uint4 options) :
174  m_ObjMgr{&objmgr},
175  m_ErrRepository{errs},
176  m_pContext{pContext}
177 {
 // Stores the object manager, error repository and shared context, then
 // applies the option bits and seeds the per-run state from the context.
 // NOTE(review): pContext is dereferenced here without a null check -
 // confirm that callers never pass an empty pointer.
178  x_Init(options, pContext->CumulativeInferenceCount, pContext->NotJustLocalOrGeneral, pContext->HasRefSeq);
179 }
180 
// Common initialization: applies the option bit mask via SetOptions() and
// then resets the per-run state via Reset(), passing the inference count and
// the two flags through unchanged.
181 void CValidError_imp::x_Init(Uint4 options, size_t initialInferenceCount, bool notJustLocalOrGeneral, bool hasRefSeq)
182 {
183  SetOptions(options);
184  Reset(initialInferenceCount, notJustLocalOrGeneral, hasRefSeq);
185
187 }
188 
189 // Destructor - the body is empty; no explicit cleanup is performed here.
191 {
192 }
193 
194 
196 {
 // Hands out the m_SuppressedErrors container that x_IsSuppressed() consults.
197  return m_SuppressedErrors;
198 }
199 
201 {
 // Lazy creation of the context is disabled (kept below for reference);
 // m_pContext is dereferenced as-is.
202  // if (!m_pContext) {
203  // m_pContext = make_shared<SValidatorContext>();
204  // }
206  return *m_pContext;
207 }
208 
209 
211 {
 // Returns the validator context by dereferencing m_pContext.
213  return *m_pContext;
214 }
215 
216 
218 {
 // True when the context has either the PreprocessHugeFile or the
 // PostprocessHugeFile flag set.
219  const auto& context = GetContext();
220  return context.PreprocessHugeFile ||
221  context.PostprocessHugeFile;
222 }
223 
224 
225 bool CValidError_imp::IsHugeSet(const CBioseq_set& bioseqSet) const
226 {
227  if (bioseqSet.IsSetClass()) {
228  return IsHugeSet(bioseqSet.GetClass());
229  }
230  return false;
231 }
232 
233 
235 {
 // Delegates the decision to edit::CHugeAsnReader::IsHugeSet().
236  return edit::CHugeAsnReader::IsHugeSet(setClass);
237 }
238 
239 
240 bool CValidError_imp::IsFarSequence(const CSeq_id& id) // const
241 {
242  if (IsHugeFileMode() && GetContext().IsIdInBlob) {
243  return !GetContext().IsIdInBlob(id);
244  }
245 
246  _ASSERT(m_Scope);
247  if (GetBioseqHandleFromTSE(id)) {
248  return false;
249  }
250  return true;
251 }
252 
253 
255 {
 // Without a scope there is nothing to look up, so an empty handle is returned.
 // NOTE(review): the statement inside the m_Scope branch is not visible in
 // this listing - confirm against the full source.
256  if (m_Scope) {
258  }
259  return CBioseq_Handle();
260 }
261 
262 
264 {
 // Outside huge-file mode the lookup is restricted to the TSE.
265  if (!IsHugeFileMode()) {
266  return GetBioseqHandleFromTSE(id);
267  }
268  // Huge-file mode
 // Only ids that are not far are resolved through the scope; far ids yield
 // an empty handle.
269  if (!IsFarSequence(id)) {
270  return m_Scope->GetBioseqHandle(id);
271  }
272  return CBioseq_Handle();
273 }
274 
275 
277 {
 // Each member flag is set to whether the corresponding CValidator::eVal_*
 // bit is present in the options mask.
278  m_NonASCII = (options & CValidator::eVal_non_ascii) != 0;
281  m_ValidateExons = (options & CValidator::eVal_val_exons) != 0;
282  m_OvlPepErr = (options & CValidator::eVal_ovl_pep_err) != 0;
285  m_RemoteFetch = (options & CValidator::eVal_remote_fetch) != 0;
291  m_UseEntrez = (options & CValidator::eVal_use_entrez) != 0;
306 }
307 
308 
309 //LCOV_EXCL_START
310 //not used by asnvalidate
312 {
 // Replaces the error repository that posted errors are added to.
313  m_ErrRepository = errors;
314 }
315 //LCOV_EXCL_STOP
316 
317 
// Returns the validator to its initial per-run state.
// The three arguments are stored as the starting values of
// m_CumulativeInferenceCount, m_NotJustLocalOrGeneral and m_HasRefSeq;
// everything else touched here is cleared, zeroed or set to false.
318 void CValidError_imp::Reset(size_t prevCumulativeInferenceCount, bool notJustLocalOrGeneral, bool hasRefSeq)
319 {
 // Drop references to the record currently being validated.
320  m_Scope = nullptr;
321  m_TSE = nullptr;
322  m_IsStandaloneAnnot = false;
323  m_SeqAnnot.Reset();
324
 // Start with a fresh entry-info object.
325  m_pEntryInfo.reset(new CValidatorEntryInfo());
326
327  m_CumulativeInferenceCount = prevCumulativeInferenceCount;
328  m_NotJustLocalOrGeneral = notJustLocalOrGeneral;
329  m_HasRefSeq = hasRefSeq;
330
 // Clear the m_IsNC ... m_IsXR flags.
331  m_IsNC = false;
332  m_IsNG = false;
333  m_IsNM = false;
334  m_IsNP = false;
335  m_IsNR = false;
336  m_IsNZ = false;
337  m_IsNS = false;
338  m_IsNT = false;
339  m_IsNW = false;
340  m_IsWP = false;
341  m_IsXR = false;
342
 // Remove the progress callback and zero the m_Num* counters.
343  m_PrgCallback = nullptr;
344  m_NumAlign = 0;
345  m_NumAnnot = 0;
346  m_NumBioseq = 0;
347  m_NumBioseq_set = 0;
349  m_NumDesc = 0;
350  m_NumDescr = 0;
351  m_NumFeat = 0;
352  m_NumGraph = 0;
356  m_NumGenes = 0;
357  m_NumGeneXrefs = 0;
360  m_NumPseudo = 0;
361  m_NumPseudogene = 0;
362  m_FarFetchFailure = false;
363  m_IsTbl2Asn = false;
364
 // No error types are suppressed after a reset.
365  SetSuppressed().clear();
366 }
367 
369 {
 // An error type is suppressed when it is present in m_SuppressedErrors.
370  return (m_SuppressedErrors.find(errType) != m_SuppressedErrors.end());
371 }
372 
373 // Error post methods
375 (EDiagSev sv,
376  EErrType et,
377  const string& msg,
378  const CSerialObject& obj)
379 {
 // Generic entry point: looks at the runtime type info of obj and forwards
 // to the PostErr overload for that concrete type. Suppressed error types
 // are dropped first; unrecognized object types are only logged.
380  if (x_IsSuppressed(et)) {
381  return;
382  }
383
384  const CTypeInfo* type_info = obj.GetThisTypeInfo();
385  if (type_info == CSeqdesc::GetTypeInfo()) {
 // No context entry is supplied for the descriptor here, so the TSE is
 // passed as its context and a warning is logged.
386  const CSeqdesc* desc = dynamic_cast < const CSeqdesc* > (&obj);
387  ERR_POST_X(1, Warning << "Seqdesc validation error using default context.");
388  PostErr (sv, et, msg, GetTSE(), *desc);
389  } else if (type_info == CSeq_feat::GetTypeInfo()) {
390  const CSeq_feat* feat = dynamic_cast < const CSeq_feat* > (&obj);
391  PostErr (sv, et, msg, *feat);
392  } else if (type_info == CBioseq::GetTypeInfo()) {
393  const CBioseq* seq = dynamic_cast < const CBioseq* > (&obj);
394  PostErr (sv, et, msg, *seq);
395  } else if (type_info == CBioseq_set::GetTypeInfo()) {
396  const CBioseq_set* set = dynamic_cast < const CBioseq_set* > (&obj);
397  PostErr (sv, et, msg, *set);
398  } else if (type_info == CSeq_annot::GetTypeInfo()) {
399  const CSeq_annot* annot = dynamic_cast < const CSeq_annot* > (&obj);
400  PostErr (sv, et, msg, *annot);
401  } else if (type_info == CSeq_graph::GetTypeInfo()) {
402  const CSeq_graph* graph = dynamic_cast < const CSeq_graph* > (&obj);
403  PostErr (sv, et, msg, *graph);
404  } else if (type_info == CSeq_align::GetTypeInfo()) {
405  const CSeq_align* align = dynamic_cast < const CSeq_align* > (&obj);
406  PostErr (sv, et, msg, *align);
407  } else if (type_info == CSeq_entry::GetTypeInfo()) {
408  const CSeq_entry* entry = dynamic_cast < const CSeq_entry* > (&obj);
409  PostErr (sv, et, msg, *entry);
410  } else if (type_info == CBioSource::GetTypeInfo()) {
411  const CBioSource* src = dynamic_cast < const CBioSource* > (&obj);
412  PostErr (sv, et, msg, *src);
413  } else if (type_info == COrg_ref::GetTypeInfo()) {
414  const COrg_ref* org = dynamic_cast < const COrg_ref* > (&obj);
415  PostErr (sv, et, msg, *org);
416  } else if (type_info == CPubdesc::GetTypeInfo()) {
417  const CPubdesc* pd = dynamic_cast < const CPubdesc* > (&obj);
418  PostErr (sv, et, msg, *pd);
419  } else if (type_info == CSeq_submit::GetTypeInfo()) {
420  const CSeq_submit* ss = dynamic_cast < const CSeq_submit* > (&obj);
421  PostErr (sv, et, msg, *ss);
422  } else {
423  ERR_POST_X(1, Warning << "Unknown data type in PostErr.");
424  }
425 }
426 
427 
428 /*
429 void CValidError_imp::PostErr
430 (EDiagSev sv,
431  EErrType et,
432  const string& msg,
433  TDesc ds)
434 {
435  // Append Descriptor label
436  string desc = "DESCRIPTOR: ";
437  ds.GetLabel (&desc, CSeqdesc::eBoth);
438  desc += ", NO Descriptor Context";
439  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
440 }
441 */
442 
443 static const EErrType sc_ValidGenomeRaise[] = {
601 };
602 
604 
610 };
611 
613 
614 
617 };
618 
620 
621 
623  EErrType et
624 )
625
626 {
 // Returns true when the error type is listed in one of the three
 // genome-raise sets, subject to per-set exemptions:
 //  - first set: no raise for EMBL, DDBJ or RefSeq records;
 //  - second set: no raise for EMBL or DDBJ records;
 //  - third set: always raise.
 // The sets are checked in this order and the first match decides.
627  if (sc_GenomeRaiseExceptEmblDdbjRefSeqArray.find(et) != sc_GenomeRaiseExceptEmblDdbjRefSeqArray.end()) {
628  if (IsEmbl() || IsDdbj() || IsRefSeq()) {
629  return false;
630  } else {
631  return true;
632  }
633  }
634  if (sc_GenomeRaiseExceptEmblDdbjArray.find(et) != sc_GenomeRaiseExceptEmblDdbjArray.end()) {
635  if (IsEmbl() || IsDdbj()) {
636  return false;
637  } else {
638  return true;
639  }
640  }
641  if (sc_GenomeRaiseArray.find (et) != sc_GenomeRaiseArray.end()) {
642  return true;
643  }
644  return false;
645 }
646 
648 (EDiagSev sv,
649  EErrType et,
650  const string& msg,
651  TFeat ft)
652 {
 // Posts an error against a feature. An error item is filled in field by
 // field: severity, error index, message, the feature object, and then the
 // labels (content, feature id, bioseq, location, product), accession and
 // locus tag that can be derived from the feature.
 // NOTE(review): several statements of this function (including the
 // creation of `item` and the final submission) are not visible in this
 // listing.
653
654  if (x_IsSuppressed(et)) {
655  return;
656  }
657
659
660  // Adjust severity
662  sv = eDiag_Error;
663  }
664
665  item->SetSev(sv);
666  item->SetErrIndex(et);
667  item->SetMsg(msg);
668  item->SetObject(ft);
669
 // In golden-file mode the function returns before any labels are computed.
670  if (GenerateGoldenFile()) {
672  return;
673  }
674
675  string content_label = CValidErrorFormat::GetFeatureContentLabel(ft, m_Scope);
676  item->SetObj_content(content_label);
677
678  string feature_id = CValidErrorFormat::GetFeatureIdLabel(ft);
679  if (!NStr::IsBlank(feature_id)) {
680  item->SetFeatureId(feature_id);
681  }
682
684  if (!NStr::IsBlank(bioseq_label)) {
685  item->SetBioseq(bioseq_label);
686  }
687
688  // Calculate sequence offset
689  TSeqPos offset = 0;
690  string location;
691  if (ft.IsSetLocation()) {
694  if (!NStr::IsBlank(loc_label)) {
695  item->SetLocation(loc_label);
696  }
697  item->SetSeqOffset(offset);
698  }
699
700
702  if (!NStr::IsBlank(product_label)) {
703  item->SetProduct_loc(product_label);
704  }
705
 // The accession can only be looked up when a scope is available; otherwise
 // it stays empty and the version stays 0.
706  int version = 0;
707  string accession;
708  if (m_Scope) {
709  accession = GetAccessionFromObjects(&ft, nullptr, *m_Scope, &version);
710  }
711  item->SetAccession(accession);
 // accession.version is only composed for a positive version.
712  if (version > 0) {
713  item->SetAccnver(accession + "." + NStr::IntToString(version));
714  item->SetVersion(version);
715  } else {
716  item->SetAccnver(accession);
717  }
718
 // Locus tag: taken from the feature itself when it is a gene; for other
 // features it is taken from `gene`, and only when m_CollectLocusTags is on.
719  if (ft.IsSetData()) {
720  if (ft.GetData().IsGene()) {
721  if (ft.GetData().GetGene().IsSetLocus_tag() &&
723  item->SetLocus_tag(ft.GetData().GetGene().GetLocus_tag());
724  }
725  } else {
726  if (m_CollectLocusTags) {
727  // TODO: this should be part of post-processing
729  if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
730  !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
731  item->SetLocus_tag(gene->GetData().GetGene().GetLocus_tag());
732  }
733  }
734  }
735  }
736
737  item->SetFeatureObjDescFromFields();
739 }
740 
741 
743 (EDiagSev sv,
744  EErrType et,
745  const string& msg,
746  TBioseq sq)
747 {
 // Posts an error against a Bioseq; the accession and version are obtained
 // from the Bioseq itself via GetAccessionFromBioseq().
 // NOTE(review): the severity condition, the golden-file statement and the
 // statement that fills `desc` are not visible in this listing.
748  if (x_IsSuppressed(et)) {
749  return;
750  }
751
752  // Adjust severity
754  sv = eDiag_Error;
755  }
756
757  if (GenerateGoldenFile()) {
759  return;
760  }
761
762  // Append bioseq label
763  string desc;
765  int version = 0;
766  const string& accession = GetAccessionFromBioseq(sq, &version);
767  // GetAccessionFromObjects(&sq, nullptr, *m_Scope, &version);
768  x_AddValidErrItem(sv, et, msg, desc, sq, accession, version);
769 }
770 
771 
773 (EDiagSev sv,
774  EErrType et,
775  const string& msg,
776  TSet st)
777 {
 // Posts an error against a Bioseq-set.
 // NOTE(review): the severity condition and the golden-file statement are
 // not visible in this listing.
778  if (x_IsSuppressed(et)) {
779  return;
780  }
781
782  // Adjust severity
784  sv = eDiag_Error;
785  }
786
787  if (GenerateGoldenFile()) {
789  return;
790  }
791
792  // Append Bioseq_set label
793
794  const auto isSetClass = st.IsSetClass();
795
 // While preprocessing a huge file, a huge set is reported under the
 // context's HugeSetId with version 0 instead of a computed accession.
796  if (isSetClass && GetContext().PreprocessHugeFile) {
797  if (auto setClass = st.GetClass(); IsHugeSet(setClass)) {
798  string desc =
800  x_AddValidErrItem(sv, et, msg, desc, st, GetContext().HugeSetId, 0);
801  return;
802  }
803  }
804
 // Regular case: when the set has no class, eClass_not_set is used for the
 // label and `true` is passed in place of m_SuppressContext.
805  int version = 0;
806  const string& accession = GetAccessionFromBioseqSet(st, &version);
807  //string desc = CValidErrorFormat::GetBioseqSetLabel(st, m_SuppressContext);
808  string desc = CValidErrorFormat::GetBioseqSetLabel(accession,
809  isSetClass ? st.GetClass() : CBioseq_set::eClass_not_set,
810  isSetClass ? m_SuppressContext : true);
811  x_AddValidErrItem(sv, et, msg, desc, st, accession, version);
812 }
813 
814 
816 (EDiagSev sv,
817  EErrType et,
818  const string& msg,
819  TEntry ctx,
820  TDesc ds)
821 {
 // Posts an error against a descriptor, using the Seq-entry `ctx` as its
 // context. Items are added to m_ErrRepository directly rather than through
 // x_AddValidErrItem().
 // NOTE(review): the severity condition, the golden-file statement and the
 // declaration of `desc` for the regular path are not visible in this
 // listing.
822  if (x_IsSuppressed(et)) {
823  return;
824  }
825
826  // Adjust severity
828  sv = eDiag_Error;
829  }
830
831  if (GenerateGoldenFile()) {
833  return;
834  }
835
836
 // While preprocessing a huge file, a descriptor on a huge set is labelled
 // with the descriptor content plus the set id from the context. The class
 // prefix is "genbank: " for genbank sets and "wgs-set: " for every other
 // huge class, and is omitted when m_SuppressContext is set.
837  if (GetContext().PreprocessHugeFile &&
838  ctx.IsSet() && ctx.GetSet().IsSetClass()) {
839  if (auto setClass = ctx.GetSet().GetClass(); IsHugeSet(setClass)) {
840  string desc{"DESCRIPTOR: "};
841  desc += CValidErrorFormat::GetDescriptorContent(ds) + " ";
842  desc += "BIOSEQ-SET: ";
843  if (!m_SuppressContext) {
844  if (setClass == CBioseq_set::eClass_genbank) {
845  desc += "genbank: ";
846  }
847  else {
848  desc += "wgs-set: ";
849  }
850  }
851  desc += GetContext().HugeSetId;
852  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, GetContext().HugeSetId, 0);
853  return;
854  }
855  }
856
857  // Append Descriptor label
 // NOTE(review): m_Scope is dereferenced below without a null check -
 // confirm a scope is always set when this overload is reached.
859  int version = 0;
860  const string& accession = GetAccessionFromObjects(&ds, &ctx, *m_Scope, &version);
861  m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, ctx, accession, version);
862 }
863 
864 
865 //void CValidError_imp::PostErr
866 //(EDiagSev sv,
867 // EErrType et,
868 // const string& msg,
869 // TBioseq sq,
870 // TDesc ds)
871 //{
872 // // Append Descriptor label
873 // string desc("DESCRIPTOR: ");
874 // ds.GetLabel(&desc, CSeqdesc::eBoth);
875 //
876 // s_AppendBioseqLabel(desc, sq, m_SuppressContext);
877 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, ds, *m_Scope);
878 // //PostErr(sv, et, msg, sq);
879 //}
880 
881 
882 //void CValidError_imp::PostErr
883 //(EDiagSev sv,
884 // EErrType et,
885 // const string& msg,
886 // TSet st,
887 // TDesc ds)
888 //{
889 // // Append Descriptor label
890 // string desc = " DESCRIPTOR: ";
891 // ds.GetLabel(&desc, CSeqdesc::eBoth);
892 // s_AppendSetLabel(desc, st, m_SuppressContext);
893 // m_ErrRepository->AddValidErrItem(sv, et, msg, desc, st, *m_Scope);
894 //
895 //}
896 
897 
899 (EDiagSev sv,
900  EErrType et,
901  const string& msg,
902  TAnnot an)
903 {
 // Posts an error against a Seq-annot. The description is just the fixed
 // "ANNOTATION: " prefix; nothing annotation-specific is appended.
 // NOTE(review): the severity condition and the golden-file statement are
 // not visible in this listing; m_Scope is dereferenced without a null check.
904  if (x_IsSuppressed(et)) {
905  return;
906  }
907
908  // Adjust severity
910  sv = eDiag_Error;
911  }
912
913  if (GenerateGoldenFile()) {
915  return;
916  }
917
918  // Append Annotation label
919  string desc = "ANNOTATION: ";
920
921  // !!! need to decide on the message
922
923  int version = 0;
924  const string& accession = GetAccessionFromObjects(&an, nullptr, *m_Scope, &version);
925  x_AddValidErrItem(sv, et, msg, desc, an, accession, version);
926 }
927 
928 
930 (EDiagSev sv,
931  EErrType et,
932  const string& msg,
933  TGraph graph)
934 {
 // Posts an error against a Seq-graph. The description is
 // "GRAPH: <title or <Unnamed>> <location label>".
 // NOTE(review): the severity condition and the golden-file statement are
 // not visible in this listing; m_Scope is dereferenced without a null check.
935
936  if (x_IsSuppressed(et)) {
937  return;
938  }
939
940  // Adjust severity
942  sv = eDiag_Error;
943  }
944
945  if (GenerateGoldenFile()) {
947  return;
948  }
949
950  // Append Graph label
951  string desc = "GRAPH: ";
952  if (graph.IsSetTitle()) {
953  desc += graph.GetTitle();
954  } else {
955  desc += "<Unnamed>";
956  }
957  desc += " ";
958  graph.GetLoc().GetLabel(&desc);
959
960  int version = 0;
961  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
962  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
963 }
964 
965 
967 (EDiagSev sv,
968  EErrType et,
969  const string& msg,
970  TBioseq sq,
971  TGraph graph)
972 {
 // Posts an error against a Seq-graph, with a Bioseq also supplied by the
 // caller. The visible code builds the same "GRAPH: ..." description as the
 // graph-only overload and reports against the graph object.
 // NOTE(review): one statement after the label is built is not visible in
 // this listing (it may be where `sq` is used); the severity condition and
 // the golden-file statement are not visible either.
973
974  if (x_IsSuppressed(et)) {
975  return;
976  }
977
978  // Adjust severity
980  sv = eDiag_Error;
981  }
982
983  if (GenerateGoldenFile()) {
985  return;
986  }
987
988  // Append Graph label
989  string desc("GRAPH: ");
990  if ( graph.IsSetTitle() ) {
991  desc += graph.GetTitle();
992  } else {
993  desc += "<Unnamed>";
994  }
995  desc += " ";
996  graph.GetLoc().GetLabel(&desc);
998  int version = 0;
999  const string& accession = GetAccessionFromObjects(&graph, nullptr, *m_Scope, &version);
1000  x_AddValidErrItem(sv, et, msg, desc, graph, accession, version);
1001 }
1002 
1003 
1005 (EDiagSev sv,
1006  EErrType et,
1007  const string& msg,
1008  TAlign align)
1009 {
 // Posts an error against a Seq-align. When a Bioseq handle can be obtained
 // for the alignment, the error is re-posted against that Bioseq instead;
 // otherwise a description is assembled from the alignment's type,
 // dimension and segment kind.
 // NOTE(review): the statements that obtain `id` and `bsh`, and the
 // golden-file statement, are not visible in this listing.
1010
1011  if (x_IsSuppressed(et)) {
1012  return;
1013  }
1014
1015  // Adjust severity
 // Severity is only ever raised to Error, never lowered.
1016  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1017  sv = eDiag_Error;
1018  }
1019
1020  if (GenerateGoldenFile()) {
1022  return;
1023  }
1024
1026  if (id) {
1028  if (bsh) {
1029  PostErr(sv, et, msg, *(bsh.GetCompleteBioseq()));
1030  return;
1031  }
1032  }
1033
1034  // Can't get bioseq for reporting, use other Alignment label
1035  string desc = "ALIGNMENT: ";
1036  if (align.IsSetType()) {
1037  desc += align.ENUM_METHOD_NAME(EType)()->FindName(align.GetType(), true);
1038  }
 // A CUnassignedMember exception from GetDim() is caught and reported as
 // "dim=UNASSIGNED" in the description.
1039  try {
1040  CSeq_align::TDim dim = align.GetDim();
1041  desc += ", dim=" + NStr::NumericToString(dim);
1042  } catch ( const CUnassignedMember& ) {
1043  desc += ", dim=UNASSIGNED";
1044  }
1045
1046  if (align.IsSetSegs()) {
1047  desc += " SEGS: ";
1048  desc += align.GetSegs().SelectionName(align.GetSegs().Which());
1049  }
1050
1051  int version = 0;
1052  const string& accession = GetAccessionFromObjects(&align, nullptr, *m_Scope, &version);
1053  x_AddValidErrItem(sv, et, msg, desc, align, accession, version);
1054 }
1055 
1056 
1058 (EDiagSev sv,
1059  EErrType et,
1060  const string& msg,
1061  TEntry entry)
1062 {
 // Posts an error against a Seq-entry. An entry holding a Bioseq or a
 // Bioseq-set is forwarded to the overload for that type; only an entry that
 // is neither is reported here with a "SEQ-ENTRY: " description.
 // NOTE(review): the golden-file statement is not visible in this listing.
1063  if (x_IsSuppressed(et)) {
1064  return;
1065  }
1066
1067  // Adjust severity
 // Severity is only ever raised to Error, never lowered.
1068  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1069  sv = eDiag_Error;
1070  }
1071
1072  if (GenerateGoldenFile()) {
1074  return;
1075  }
1076
1077  if (entry.IsSeq()) {
1078  PostErr(sv, et, msg, entry.GetSeq());
1079  } else if (entry.IsSet()) {
1080  PostErr(sv, et, msg, entry.GetSet());
1081  } else {
1082  string desc = "SEQ-ENTRY: ";
1083  entry.GetLabel(&desc, CSeq_entry::eContent);
1084
1085  int version = 0;
1086  const string& accession = GetAccessionFromObjects(&entry, nullptr, *m_Scope, &version);
1087  x_AddValidErrItem(sv, et, msg, desc, entry, accession, version);
1088  }
1089 }
1090 
1091 
1093 (EDiagSev sv,
1094  EErrType et,
1095  const string& msg,
1096  const CBioSource& src)
1097 {
 // Posts an error against a BioSource. No accession is looked up: the item
 // is added with an empty accession and version 0.
 // NOTE(review): the golden-file statement is not visible in this listing.
1098
1099  if (x_IsSuppressed(et)) {
1100  return;
1101  }
1102
1103  // Adjust severity
1104  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1105  sv = eDiag_Error;
1106  }
1107
1108  if (GenerateGoldenFile()) {
1110  return;
1111  }
1112
1113  string desc = "BioSource: ";
1114  x_AddValidErrItem(sv, et, msg, desc, src, "", 0);
1115 }
1116 
1117 
1119 (EDiagSev sv,
1120  EErrType et,
1121  const string& msg,
1122  const COrg_ref& org)
1123 {
 // Posts an error against an Org-ref. No accession is looked up: the item
 // is added with an empty accession and version 0.
 // NOTE(review): the golden-file statement is not visible in this listing.
1124
1125  if (x_IsSuppressed(et)) {
1126  return;
1127  }
1128
1129  // Adjust severity
1130  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1131  sv = eDiag_Error;
1132  }
1133
1134  if (GenerateGoldenFile()) {
1136  return;
1137  }
1138
1139  string desc = "Org-ref: ";
1140  x_AddValidErrItem(sv, et, msg, desc, org, "", 0);
1141 }
1142 
1143 
1145 (EDiagSev sv,
1146  EErrType et,
1147  const string& msg,
1148  const CPubdesc& pd)
1149 {
 // Posts an error against a Pubdesc. No accession is looked up: the item
 // is added with an empty accession and version 0.
 // NOTE(review): the golden-file statement is not visible in this listing.
1150  if (x_IsSuppressed(et)) {
1151  return;
1152  }
1153
1154  // Adjust severity
1155  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1156  sv = eDiag_Error;
1157  }
1158
1159  if (GenerateGoldenFile()) {
1161  return;
1162  }
1163
1164  string desc = "Pubdesc: ";
1165  x_AddValidErrItem(sv, et, msg, desc, pd, "", 0);
1166 }
1167 
1168 
1170 (EDiagSev sv,
1171  EErrType et,
1172  const string& msg,
1173  const CSeq_submit& ss)
1174 {
 // Posts an error against a Seq-submit. No accession is looked up: the item
 // is added with an empty accession and version 0.
 // NOTE(review): the golden-file statement is not visible in this listing.
1175  if (x_IsSuppressed(et)) {
1176  return;
1177  }
1178
1179  // Adjust severity
1180  if (m_genomeSubmission && RaiseGenomeSeverity(et) && sv < eDiag_Error) {
1181  sv = eDiag_Error;
1182  }
1183
1184  if (GenerateGoldenFile()) {
1186  return;
1187  }
1188
1189  string desc = "Seq-submit: ";
1190  x_AddValidErrItem(sv, et, msg, desc, ss, "", 0);
1191 }
1192 
1193 
1195  EDiagSev sev,
1196  EErrType type,
1197  const string& msg,
1198  const string& desc,
1199  const CSerialObject& obj,
1200  const string& accession,
1201  const int version)
1202 {
 // Single funnel through which most PostErr overloads add an item to the
 // error repository. In huge-file mode the item is added without the
 // object argument; otherwise obj is passed along with it.
1203  if (IsHugeFileMode()) {
1204  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, accession, version);
1205  return;
1206  }
1207  m_ErrRepository->AddValidErrItem(sev, type, msg, desc, obj, accession, version);
1208 }
1209 
1210 
1212 (EDiagSev sv,
1213  EErrType et,
1214  const string& msg,
1215  const CSerialObject& obj,
1216  const CSeq_entry *ctx)
1217 {
 // Posts an error for obj with an optional context entry. The context is
 // only used when it is non-null and obj is a Seqdesc; every other
 // combination goes through the generic PostErr dispatcher.
1218  if (!ctx) {
1219  PostErr (sv, et, msg, obj);
1220  } else if (obj.GetThisTypeInfo() == CSeqdesc::GetTypeInfo()) {
1221  PostErr(sv, et, msg, *ctx, *(dynamic_cast <const CSeqdesc*> (&obj)));
1222  } else {
1223  PostErr(sv, et, msg, obj);
1224  }
1225
1226 }
1227 
1228 
1230 (EDiagSev sv,
1231  const string& msg,
1232  int flags,
1233  const CSerialObject& obj,
1234  const CSeq_entry *ctx)
1235 {
 // Posts an eErr_GENERIC_BadDate error whose text is
 // "<msg> - <description of flags>", where the description comes from
 // GetDateErrorDescription() with surrounding spaces trimmed.
1236  string reasons = GetDateErrorDescription(flags);
1237
1238  NStr::TruncateSpacesInPlace (reasons);
1239  reasons = msg + " - " + reasons;
1240
1241  PostObjErr (sv, eErr_GENERIC_BadDate, reasons, obj, ctx);
1242 }
1243 
1244 
1246 (const CSeq_entry& se,
1247  const CCit_sub* cs,
1248  CScope* scope)
1249 {
 // Validates a Seq-entry given as an object: obtains a handle for it in the
 // scope and forwards to the handle-based Validate().
 // NOTE(review): scope is dereferenced without a null check - confirm that
 // callers always pass a valid scope.
1250  CSeq_entry_Handle seh;
 // A CException from the lookup is deliberately ignored; the empty handle
 // then triggers the add-to-scope fallback below.
1251  try {
1252  seh = scope->GetSeq_entryHandle(se);
1253  } catch (const CException& ) { ; }
1254  if (! seh) {
1255  seh = scope->AddTopLevelSeqEntry(se);
1256  if (!seh) {
1257  return false;
1258  }
1259  }
1260
1261  return Validate(seh, cs);
1262 }
1263 
1264 static bool s_IsPhage(const COrg_ref& org)
1265 {
1266  if (org.IsSetDivision() && NStr::Equal(org.GetDivision(), "PHG")) {
1267  return true;
1268  } else {
1269  return false;
1270  }
1271 }
1272 
1273 
1275 {
 // Scans the source descriptors of every Bioseq under seh and collects the
 // positive ids found in "taxon" db entries. Ids on organisms for which
 // s_IsPhage() is true are tracked separately (phage_id); the first other
 // id is remembered in first_id. The condition fires when two different
 // non-phage ids are seen, or when both a phage id and a non-phage id are
 // present.
1276  bool has_mult = false;
1277  int first_id = 0;
1278  int phage_id = 0;
1279
1280  for (CBioseq_CI bi(seh); bi; ++bi) {
 // The descriptor loop stops early once has_mult is set; the outer Bioseq
 // loop still runs to the end.
1281  for (CSeqdesc_CI desc_ci(*bi, CSeqdesc::e_Source);
1282  desc_ci && !has_mult;
1283  ++desc_ci) {
1284  if (desc_ci->GetSource().IsSetOrg()) {
1285  const COrg_ref& org = desc_ci->GetSource().GetOrg();
1286  if (org.IsSetDb()) {
1287  ITERATE(COrg_ref::TDb, it, org.GetDb()) {
 // Only db entries named "taxon" (case-insensitive) with an integer
 // tag are considered.
1288  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "taxon") &&
1289  (*it)->IsSetTag() && (*it)->GetTag().IsId()) {
1290  int this_id = (*it)->GetTag().GetId();
1291  if (this_id > 0) {
1292  if (s_IsPhage(org)) {
1293  phage_id = this_id;
1294  } else if (first_id == 0) {
1295  first_id = this_id;
1296  } else if (first_id != this_id) {
1297  has_mult = true;
1298  }
1299  }
1300  }
1301  }
1302  }
1303  }
1304  }
1305  }
 // NOTE(review): the first line of the call inside this branch is not
 // visible in this listing; the visible arguments are the message and *m_TSE.
1306  if (has_mult || (phage_id > 0 && first_id > 0)) {
1308  "There are multiple taxonIDs in this RefSeq record.",
1309  *m_TSE);
1310  }
1311 }
1312 
1313 
1315 {
 // Returns the current entry-info object by dereferencing m_pEntryInfo;
 // unlike x_SetEntryInfo(), nothing is created here.
1316  return *m_pEntryInfo;
1317 }
1318 
1319 
1321 {
 // Creates the entry-info object on first use, then returns it.
1322  if (!m_pEntryInfo) {
1323  m_pEntryInfo.reset(new CValidatorEntryInfo());
1324  }
1325
1326  return *m_pEntryInfo;
1327 }
1328 
1329 
1331 (const CSeq_entry_Handle& seh,
1332  const CCit_sub* cs)
1333 {
1334  _ASSERT(seh);
1335 
1336  if ( m_PrgCallback ) {
1338  if ( m_PrgCallback(&m_PrgInfo) ) {
1339  return false;
1340  }
1341  }
1342 
1343  // Check that CSeq_entry has data
1344  if (seh.Which() == CSeq_entry::e_not_set) {
1345  ERR_POST_X(2, Warning << "Seq_entry not set");
1346  return false;
1347  }
1348 
1349  Setup(seh);
1350 
1351  // Seq-submit has submission citation
1352  if (cs) {
1353  x_SetEntryInfo().SetNoPubs(false);
1355  }
1356 
1357  // Get first CBioseq object pointer for PostErr below.
1359  if (!seq) {
1361  "No Bioseqs in this entire record.", seh.GetCompleteSeq_entry()->GetSet());
1362  return true;
1363  }
1364 
1365  // If m_NonASCII is true, then this flag was set by the caller
1366  // of validate to indicate that a non ascii character had been
1367  // read from a file being used to create a CSeq_entry, that the
1368  // error had been corrected, but that the error needs to be reported
1369  // by Validate. Note, Validate is not doing anything other than
1370  // reporting an error if m_NonASCII is true;
1371  if (m_NonASCII) {
1373  "Non-ascii chars in input ASN.1 strings", *seq);
1374  // Only report the error once
1375  m_NonASCII = false;
1376  }
1377 
1378  // Iterate thru components of record and validate each
1379 
1380  // also want to know if we have gi
1381  bool has_gi = false;
1382  // also want to know if there are any nucleotide sequences
1383  bool has_nucleotide_sequence = false;
1384 
1386  bi && (!IsINSDInSep() || !has_gi || !has_nucleotide_sequence);
1387  ++bi) {
1388  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1389  if ((*it)->IsGi()) {
1390  has_gi = true;
1391  }
1392  }
1393  if (bi->IsSetInst_Mol() && bi->IsNa()) {
1394  has_nucleotide_sequence = true;
1395  }
1396  }
1397 
1398  if (IsINSDInSep() && m_pEntryInfo->IsRefSeq()) {
1399  // NOTE: We use m_IsRefSeq to indicate the actual presence of RefSeq IDs in
1400  // the record, rather than IsRefSeq(), which indicates *either* RefSeq IDs are
1401  // present *OR* the refseq flag has been used
1403  "INSD and RefSeq records should not be present in the same set", *m_TSE);
1404  }
1405 
1406 #if 0
1407  // disabled for now
1408  // look for long IDs that would collide if truncated at 30 characters
1409  vector<string> id_strings;
1411  bi;
1412  ++bi) {
1413  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bi->GetCompleteBioseq())) {
1414  if (!IsNCBIFILESeqId(**it)) {
1415  string label;
1416  (*it)->GetLabel(&label);
1417  id_strings.push_back(label);
1418  }
1419  }
1420  }
1421  stable_sort (id_strings.begin(), id_strings.end());
1422  for (vector<string>::iterator id_str_it = id_strings.begin();
1423  id_str_it != id_strings.end();
1424  ++id_str_it) {
1425  string pattern = (*id_str_it).substr(0, 30);
1426  string first_id = *id_str_it;
1427  vector<string>::iterator cmp_it = id_str_it;
1428  ++cmp_it;
1429  while (cmp_it != id_strings.end() && NStr::StartsWith(*cmp_it, pattern)) {
1430  CRef<CSeq_id> id(new CSeq_id(*cmp_it));
1433  "First 30 characters of " + first_id + " and " +
1434  *cmp_it + " are identical", *(bsh.GetCompleteBioseq()));
1435  ++id_str_it;
1436  ++cmp_it;
1437  }
1438  }
1439 #endif
1440 
1441  // look for colliding feature IDs
1442  vector < int > feature_ids;
1443  for (CFeat_CI fi(GetTSEH()); fi; ++fi) {
1444  const CSeq_feat& sf = fi->GetOriginalFeature();
1445  if (sf.IsSetId() && sf.GetId().IsLocal() && sf.GetId().GetLocal().IsId()) {
1446  feature_ids.push_back(sf.GetId().GetLocal().GetId());
1447  }
1448  }
1449 
1450  if (feature_ids.size() > 0) {
1451  const CTSE_Handle& tse = seh.GetTSE_Handle ();
1452  stable_sort (feature_ids.begin(), feature_ids.end());
1453  vector <int>::iterator it = feature_ids.begin();
1454  int id = *it;
1455  ++it;
1456  while (it != feature_ids.end()) {
1457  if (*it == id) {
1458  vector<CSeq_feat_Handle> handles = tse.GetFeaturesWithId(CSeqFeatData::e_not_set, id);
1459  ITERATE( vector<CSeq_feat_Handle>, feat_it, handles ) {
1461  "Colliding feature ID " + NStr::NumericToString (id), *(feat_it->GetSeq_feat()));
1462  }
1463  while (it != feature_ids.end() && *it == id) {
1464  ++it;
1465  }
1466  if (it != feature_ids.end()) {
1467  id = *it;
1468  ++it;
1469  }
1470  } else {
1471  id = *it;
1472  ++it;
1473  }
1474  }
1475  }
1476 
1477  // look for mixed gps and non-gps sets
1478  bool has_nongps = false;
1479  bool has_gps = false;
1480 
1481  for (CTypeConstIterator<CBioseq_set> si(*m_TSE); si && (!has_nongps || !has_gps); ++si) {
1482  if (si->IsSetClass()) {
1483  if (si->GetClass() == CBioseq_set::eClass_mut_set
1484  || si->GetClass() == CBioseq_set::eClass_pop_set
1485  || si->GetClass() == CBioseq_set::eClass_phy_set
1486  || si->GetClass() == CBioseq_set::eClass_eco_set
1487  || si->GetClass() == CBioseq_set::eClass_wgs_set
1488  || si->GetClass() == CBioseq_set::eClass_small_genome_set) {
1489  has_nongps = true;
1490  } else if (si->GetClass() == CBioseq_set::eClass_gen_prod_set) {
1491  has_gps = true;
1492  }
1493  }
1494  }
1495 
1496  if (has_nongps && has_gps) {
1498  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set",
1499  *m_TSE);
1500  }
1501 
1502  // count inference accessions - if there are too many, WAS temporarily disable inference checking
1503  // now disable inference checking for rest of this validator run
1504  bool old_inference_acc_check = m_ValidateInferenceAccessions;
1506  m_IgnoreInferences = true;
1507  }
1508  if (! m_IgnoreInferences) {
1509  CFeat_CI feat_inf(seh);
1510  while (feat_inf && ! m_IgnoreInferences) {
1511  FOR_EACH_GBQUAL_ON_FEATURE (qual, *feat_inf) {
1512  if (! m_IgnoreInferences && (*qual)->IsSetQual() && (*qual)->IsSetVal() && NStr::Equal((*qual)->GetQual(), "inference")) {
1515  // disable inference checking for remainder of run
1516  m_IgnoreInferences = true;
1517 
1518  // warn about too many inferences
1520  "Skipping validation of remaining /inference qualifiers",
1521  *m_TSE);
1522  }
1523 
1525  string prefix, remainder;
1526  bool same_species;
1527  size_t num_accessions = 0;
1528  vector<string> accessions = CValidError_feat::GetAccessionsFromInferenceString ((*qual)->GetVal(), prefix, remainder, same_species);
1529  for (size_t i = 0; i < accessions.size(); i++) {
1530  NStr::TruncateSpacesInPlace (accessions[i]);
1531  string acc_prefix, accession;
1532  if (CValidError_feat::GetPrefixAndAccessionFromInferenceAccession (accessions[i], acc_prefix, accession)) {
1533  num_accessions++;
1535  }
1536  }
1537  if (num_accessions > 0) {
1538  m_CumulativeInferenceCount += num_accessions;
1540  // disable inference checking for remainder of run
1541  m_IgnoreInferences = true;
1542 
1543  // warn about too many inferences
1545  "Skipping validation of remaining /inference qualifiers",
1546  *m_TSE);
1547  }
1548  }
1549  }
1550  }
1551  }
1552  ++feat_inf;
1553  }
1554  }
1555 
1556  // validate the main data
1557  if (seh.IsSeq()) {
1558  const CBioseq& seq2 = seh.GetCompleteSeq_entry()->GetSeq();
1559  CValidError_bioseq bioseq_validator(*this);
1560  try {
1561  bioseq_validator.ValidateBioseq(seq2);
1562  } catch ( const exception& e ) {
1564  string("Exception while validating bioseq. EXCEPTION: ") +
1565  e.what(), seq2);
1566  return true;
1567  }
1568  } else if (seh.IsSet()) {
1569  const CBioseq_set& set = seh.GetCompleteSeq_entry()->GetSet();
1570  CValidError_bioseqset bioseqset_validator(*this);
1571 
1572  try {
1573  bioseqset_validator.ValidateBioseqSet(set);
1574 
1575  } catch ( const exception& e ) {
1577  string("Exception while validating bioseq set. EXCEPTION: ") +
1578  e.what(), set);
1579  return true;
1580  }
1581  }
1582 
1583  // put flag for validating inference accessions back to original value
1584  m_ValidateInferenceAccessions = old_inference_acc_check;
1585 
1586  // validation from data collected during previous step
1587 
1588  if (!GetContext().PreprocessHugeFile) {
1589  if ( m_NumTpaWithHistory > 0 &&
1590  m_NumTpaWithoutHistory > 0 ) {
1592  "There are " +
1594  " TPAs with history and " +
1596  " without history in this record.", *seq);
1597  }
1598  if ( m_NumTpaWithoutHistory > 0 && has_gi) {
1600  "There are " +
1602  " TPAs without history in this record, but the record has a gi number assignment.", *m_TSE);
1603  }
1604  }
1605 
1606  if (IsIndexerVersion() && DoesAnyProteinHaveGeneralID() && !IsRefSeq() && has_nucleotide_sequence) {
1607  call_once(SetContext().ProteinHaveGeneralIDOnceFlag,
1608  [](CValidError_imp* imp, CSeq_entry_Handle seh2) {
1610  "INDEXER_ONLY - Protein bioseqs have general seq-id.",
1611  *(seh2.GetCompleteSeq_entry()));
1612  }, this, seh);
1613  }
1614 
1615  ReportMissingPubs(*m_TSE, cs);
1617 
1618  if (m_NumMisplacedFeatures > 1) {
1620  "There are " + NStr::SizetToString (m_NumMisplacedFeatures) + " mispackaged features in this record.",
1621  *(seh.GetCompleteSeq_entry()));
1622  } else if (m_NumMisplacedFeatures == 1) {
1624  "There is 1 mispackaged feature in this record.",
1625  *(seh.GetCompleteSeq_entry()));
1626  }
1627  if (m_NumSmallGenomeSetMisplaced > 1) {
1629  "There are " + NStr::SizetToString (m_NumSmallGenomeSetMisplaced) + " mispackaged features in this small genome set record.",
1630  *(seh.GetCompleteSeq_entry()));
1631  } else if (m_NumSmallGenomeSetMisplaced == 1) {
1633  "There is 1 mispackaged feature in this small genome set record.",
1634  *(seh.GetCompleteSeq_entry()));
1635  }
1636  if ( !GetContext().PreprocessHugeFile ) {
1637  if ( m_NumGenes == 0 && m_NumGeneXrefs > 0 ) {
1639  "There are " + NStr::SizetToString(m_NumGeneXrefs) +
1640  " gene xrefs and no gene features in this record.", *m_TSE);
1641  }
1642  }
1643  ValidateCitations (seh);
1644 
1645 
1646  if ( m_NumMisplacedGraphs > 0 ) {
1649  string("There ") + ((m_NumMisplacedGraphs > 1) ? "are " : "is ") + num +
1650  " mispackaged graph" + ((m_NumMisplacedGraphs > 1) ? "s" : "") + " in this record.",
1651  *m_TSE);
1652  }
1653 
1654  if ( IsRefSeq() && ! IsWP() ) {
1656  }
1657 
1658 
1661  if (!GetContext().PreprocessHugeFile) {
1663  }
1664 
1665  if (m_FarFetchFailure) {
1667  "Far fetch failures caused some validator tests to be bypassed",
1668  *m_TSE);
1669  }
1670 
1671  if (m_DoTaxLookup) {
1673  }
1674 
1675  // validate cit-sub
1676  if (cs) {
1678  }
1679 
1680  // optional barcode tests
1681  if (m_DoBarcodeTests) {
1682  x_DoBarcodeTests(seh);
1683  }
1684  return true;
1685 }
1686 
1687 
1689 {
1690  if (block.IsSetHup() && block.GetHup() && block.IsSetReldate() &&
1691  IsDateInPast(block.GetReldate())) {
1693  "Record release date has already passed", ss);
1694  }
1695 
1696  if (block.IsSetContact() && block.GetContact().IsSetContact()) {
1697  const CAuthor& author = block.GetContact().GetContact();
1698  if (author.IsSetAffil() && author.GetAffil().IsStd()) {
1699  ValidateAffil(author.GetAffil().GetStd(), ss, nullptr);
1700  }
1701  const CPerson_id& pid = author.GetName();
1702  if (pid.IsName()) {
1703  const CName_std& nstd = pid.GetName();
1704  string first = "";
1705  string last = "";
1706  if (nstd.IsSetLast()) {
1707  last = nstd.GetLast();
1710  "Bad last name '" + last + "'", ss);
1711  }
1712  }
1713  if (nstd.IsSetFirst()) {
1714  first = nstd.GetFirst();
1717  "Bad first name '" + first + "'", ss);
1718  }
1719  }
1720  if (first != "" && last != "" && NStr::EqualNocase(last, "last") && NStr::EqualNocase(first, "first")) {
1722  "Bad first and last name", ss);
1723  }
1724  }
1725  }
1726  if (block.IsSetCit()) {
1727  const CCit_sub& sub = block.GetCit();
1728  if (sub.IsSetAuthors()) {
1729  const CAuth_list& auth_list = sub.GetAuthors();
1730  const CAuth_list::TNames& names = auth_list.GetNames();
1731  if (names.IsStd()) {
1732  ITERATE ( CAuth_list::C_Names::TStd, name, names.GetStd() ) {
1733  if ( (*name)->GetName().IsName() ) {
1734  const CName_std& nstd = (*name)->GetName().GetName();
1735  string first = "";
1736  string last = "";
1737  if (nstd.IsSetLast()) {
1738  last = nstd.GetLast();
1741  "Bad last name '" + last + "'", ss);
1742  }
1743  }
1744  if (nstd.IsSetFirst()) {
1745  first = nstd.GetFirst();
1748  "Bad first name '" + first + "'", ss);
1749  }
1750  }
1751  if (first != "" && last != "" && NStr::EqualNocase(last, "last") && NStr::EqualNocase(first, "first")) {
1753  "Bad first and last name", ss);
1754  }
1755  }
1756  }
1757  }
1758  }
1759  }
1760 }
1761 
1762 
1764  const CSeq_submit& ss, CScope* scope)
1765 {
1766  // Check that ss is type e_Entrys
1767  if ( ss.GetData().Which() != CSeq_submit::C_Data::e_Entrys ) {
1768  return;
1769  }
1770 
1772  if (ss.IsSetSub()) {
1773  if (IsHugeFileMode()) {
1774  call_once(SetContext().SubmitBlockOnceFlag,
1775  [this, &ss](){ ValidateSubmitBlock(ss.GetSub(), ss); });
1776  }
1777  else {
1778  ValidateSubmitBlock(ss.GetSub(), ss);
1779  }
1780  }
1781 
1782  // Get CCit_sub pointer
1783  const CCit_sub* cs = &ss.GetSub().GetCit();
1784 
1785  if (ss.IsSetSub() && ss.GetSub().IsSetTool() && NStr::StartsWith(ss.GetSub().GetTool(), "Geneious")) {
1787  }
1788 
1789  // Just loop thru CSeq_entrys
1790  FOR_EACH_SEQENTRY_ON_SEQSUBMIT (se_itr, ss) {
1791  const CSeq_entry& se = **se_itr;
1792  if(se.IsSet())
1793  {
1794  const CBioseq_set &set = se.GetSet();
1795  if(set.IsSetClass() &&
1796  set.GetClass() == CBioseq_set::eClass_wgs_set)
1797  {
1799  CSeq_entry_Handle seh;
1800  seh = scope->GetSeq_entryHandle(se);
1801  Setup(seh);
1802  call_once(SetContext().WgsSetInSeqSubmitOnceFlag,
1803  [this, seh]() {
1805  "File was created as a wgs-set, but should be a batch submission instead.",
1806  seh.GetCompleteSeq_entry()->GetSet());
1807  });
1808  } else {
1809  CSeq_entry_Handle seh;
1810  seh = scope->GetSeq_entryHandle(se);
1811  Setup(seh);
1813  "File was created as a wgs-set, but should be a batch submission instead.",
1814  seh.GetCompleteSeq_entry()->GetSet());
1815  }
1816  }
1817  }
1818  Validate (se, cs, scope);
1819  }
1820 }
1821 
1822 
1824  const CSeq_annot_Handle& sah)
1825 {
1826  Setup(sah);
1827 
1828  // Iterate thru components of record and validate each
1829 
1830  CValidError_annot annot_validator(*this);
1831  annot_validator.ValidateSeqAnnot(sah);
1832 
1833  switch (sah.Which()) {
1835  {
1836  CValidError_feat feat_validator(*this);
1837  for (CFeat_CI fi (sah); fi; ++fi) {
1838  const CSeq_feat& sf = fi->GetOriginalFeature();
1839  feat_validator.ValidateSeqFeat(sf);
1840  }
1841  }
1842  break;
1843 
1845  {
1846  if (IsValidateAlignments()) {
1847  CValidError_align align_validator(*this);
1848  int order = 1;
1849  for (CAlign_CI ai(sah); ai; ++ai) {
1850  const CSeq_align& sa = ai.GetOriginalSeq_align();
1851  align_validator.ValidateSeqAlign(sa, order++);
1852  }
1853  }
1854  }
1855  break;
1856 
1858  {
1859  CValidError_graph graph_validator(*this);
1860  // for (CTypeConstIterator <CSeq_graph> gi (sa); gi; ++gi) {
1861  for (CGraph_CI gi(sah); gi; ++gi) {
1862  const CSeq_graph& sg = gi->GetOriginalGraph();
1863  graph_validator.ValidateSeqGraph(sg);
1864  }
1865  }
1866  break;
1867  default:
1868  break;
1869  }
1873 }
1874 
1875 
// Validates a single stand-alone Seq-feat.
//   feat  - the feature to validate
//   scope - optional scope; when non-null it is installed as m_Scope, otherwise
//           a new CScope on m_ObjMgr is created if m_Scope is not set
// NOTE(review): this listing omits original lines 1892, 1898 and 1903, so the
// declaration of `empty`, the body of the IsSetOrg() branch and the last
// statement of the function are not visible here.
1876 void CValidError_imp::Validate(const CSeq_feat& feat, CScope* scope)
1877 {
1878  // automatically restores m_Scope to its old value when we leave
1879  // the function
1880  CScopeRestorer scopeRestorer( m_Scope );
1881 
1882  if( scope ) {
1883  m_Scope.Reset(scope);
1884  }
1885  if (!m_Scope) {
1886  // set up a temporary local scope if there is no scope set already
1887  m_Scope.Reset(new CScope(*m_ObjMgr));
1888  }
1889 
// run the feature validator against the scope chosen above
1890  CValidError_feat feat_validator(*this);
1891  feat_validator.SetScope(*m_Scope);
1893  feat_validator.SetTSE(empty);
1894  feat_validator.ValidateSeqFeat(feat);
// BioSource features get an additional check when an Org-ref is set
1895  if (feat.IsSetData() && feat.GetData().IsBiosrc()) {
1896  const CBioSource& src = feat.GetData().GetBiosrc();
1897  if (src.IsSetOrg()) {
1899  }
1900  }
// text-level scans over the whole feature
1901  FindEmbeddedScript(feat);
1902  FindNonAsciiText(feat);
1904 }
1905 
1906 
1908 {
1909  // automatically restores m_Scope to its old value when we leave
1910  // the function
1911  CScopeRestorer scopeRestorer( m_Scope );
1912 
1913  if( scope ) {
1914  m_Scope.Reset(scope);
1915  }
1916  if (!m_Scope) {
1917  // set up a temporary local scope if there is no scope set already
1918  m_Scope.Reset(new CScope(*m_ObjMgr));
1919  }
1920 
1921  ValidateBioSource(src, src);
1922  if (src.IsSetOrg()) {
1924  }
1925  FindEmbeddedScript(src);
1926  FindNonAsciiText(src);
1928 }
1929 
1930 
1931 void CValidError_imp::Validate(const CPubdesc& pubdesc, CScope* scope)
1932 {
1933  // automatically restores m_Scope to its old value when we leave
1934  // the function
1935  CScopeRestorer scopeRestorer( m_Scope );
1936 
1937  if( scope ) {
1938  m_Scope.Reset(scope);
1939  }
1940  if (!m_Scope) {
1941  // set up a temporary local scope if there is no scope set already
1942  m_Scope.Reset(new CScope(*m_ObjMgr));
1943  }
1944 
1945  ValidatePubdesc(pubdesc, pubdesc);
1946  FindEmbeddedScript(pubdesc);
1947  FindNonAsciiText(pubdesc);
1948  FindCollidingSerialNumbers(pubdesc);
1949 }
1950 
1952 {
1953  CValidError_desc seqdesc_validator(*this);
1954  m_Scope.Reset(new CScope(*m_ObjMgr));
1956  seqdesc_validator.ValidateSeqDesc(desc,ctx);
1957 }
1958 
1959 
1962  void* user_data)
1963 {
1964  m_PrgCallback = callback;
1965  m_PrgInfo.m_UserData = user_data;
1966 }
1967 
1968 
1970 (const CDbtag& xref,
1971  const CSerialObject& obj,
1972  bool biosource,
1973  const CSeq_entry *ctx)
1974 {
1975  bool refseq_or_gps = IsRefSeq() || IsGPS();
1977  refseq_or_gps);
1978 
1979  const string& db = xref.IsSetDb() ? xref.GetDb() : kEmptyStr;
1980 
1983  "dbxref value " + xref.GetTag().GetStr() + " has SGML",
1984  obj, ctx);
1985  }
1988  "dbxref value " + xref.GetTag().GetStr() + " contains space character",
1989  obj, ctx);
1990  }
1991  if (flags & CValidator::eDbHasSgml) {
1993  "dbxref database " + db + " has SGML",
1994  obj, ctx);
1995  }
1996 
1997  bool isStr = false;
1998  string dbv;
1999  if (xref.IsSetTag() && xref.GetTag().IsStr()) {
2000  dbv = xref.GetTag().GetStr();
2001  isStr = true;
2002  } else if (xref.IsSetTag() && xref.GetTag().IsId()) {
2003  dbv = NStr::NumericToString(xref.GetTag().GetId());
2004  }
2005 
2008  "Illegal db_xref type " + db + " (" + dbv + ")", obj, ctx);
2009  }
2011  // capitalization is bad
2012  bool refseq_db = false, src_db = false;
2013  string correct_caps;
2014  xref.GetDBFlags(refseq_db, src_db, correct_caps);
2015  string message = "Illegal db_xref type " + db + " (" + dbv + "), legal capitalization is " + correct_caps;
2017  message += ", but should not be used on an OrgRef";
2018  } else if (flags & CValidator::eOnlyForSource) {
2019  message += ", but should only be used on an OrgRef";
2020  }
2021 
2023  } else {
2027  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on a non-RefSeq OrgRef",
2028  obj, ctx);
2029  } else {
2031  "db_xref type " + db + " (" + dbv + ") is only legal for RefSeq",
2032  obj, ctx);
2033  }
2034  } else if (flags & CValidator::eNotForSource) {
2037  "RefSeq-specific db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
2038  obj, ctx);
2039  } else {
2041  "db_xref type " + db + " (" + dbv + ") should not be used on an OrgRef",
2042  obj, ctx);
2043  }
2044  } else if (flags & CValidator::eOnlyForSource) {
2046  "db_xref type " + db + " (" + dbv + ") should only be used on an OrgRef",
2047  obj, ctx);
2048  }
2049  }
2050 
2051  if (isStr && db == "GeneID") {
2053  "db_xref type " + db + " (" + dbv + ") is required to be an integer",
2054  obj, ctx);
2055  }
2056 }
2057 
2058 
2060 (TDbtags& xref_list,
2061  const CSerialObject& obj,
2062  bool biosource,
2063  const CSeq_entry *ctx)
2064 {
2065  string last_db;
2066 
2067  ITERATE( TDbtags, xref, xref_list) {
2068  if (biosource
2069  && (*xref)->IsSetDb()) {
2070  if (!NStr::IsBlank(last_db)
2071  && NStr::EqualNocase((*xref)->GetDb(), last_db)) {
2073  "BioSource uses db " + last_db + " multiple times",
2074  obj, ctx);
2075  }
2076  last_db = (*xref)->GetDb();
2077  }
2078  ValidateDbxref(**xref, obj, biosource, ctx);
2079  }
2080 }
2081 
2082 
2084 (const CPacked_seqint& packed_int,
2085  SLocCheck& lc,
2086  const CSerialObject& obj)
2087 {
2088  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
2089  lc.int_cur = (*it);
2090  lc.chk &= x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur);
2091 
2093 
2094  lc.id_prv = lc.id_cur;
2095  lc.strand_prv = lc.strand_cur;
2096  lc.int_prv = lc.int_cur;
2097  }
2098 }
2099 
2100 
2102  CConstRef<CSeq_id>& id_cur,
2103  const CSeq_interval* int_cur,
2104  ENa_strand& strand_cur)
2105 {
2106  strand_cur = int_cur->IsSetStrand() ?
2107  int_cur->GetStrand() : eNa_strand_unknown;
2108  id_cur = &int_cur->GetId();
2109  bool chk = IsValid(*int_cur, m_Scope);
2110  return chk;
2111 }
2112 
2113 
2115 {
2116  ITERATE(CPacked_seqint::Tdata, it, packed_int.Get()) {
2117  x_ReportInvalidFuzz(**it, obj);
2118  }
2119 }
2120 
2121 
// Message texts about 'space to left' / 'space to right' limits at the ends of
// a non-circular sequence.
// NOTE(review): presumably posted by the fuzz checks that follow; the lines that
// reference these constants are not visible in this listing - confirm.
2122 static const string kSpaceLeftFirst = "Should not specify 'space to left' at first position of non-circular sequence";
2123 static const string kSpaceRightLast = "Should not specify 'space to right' at last position of non-circular sequence";
2124 
// Message texts about 'circle' limits on a circular sequence. The identifiers
// say "Space" while the texts say "circle".
2125 static const string kSpaceLeftCircle = "Should not specify 'circle to left' except at first position of circular sequence";
2126 static const string kSpaceRightCircle = "Should not specify 'circle to right' except at last position of circular sequence";
2127 
2129 {
2132  bool has_fuzz_from = false;
2133  bool has_fuzz_to = false;
2134 
2135  if (interval.IsSetFuzz_from() && interval.GetFuzz_from().IsLim()) {
2136  fuzz_from = interval.GetFuzz_from().GetLim();
2137  has_fuzz_from = true;
2138  }
2139  if (interval.IsSetFuzz_to() && interval.GetFuzz_to().IsLim()) {
2140  fuzz_to = interval.GetFuzz_to().GetLim();
2141  has_fuzz_to = true;
2142  }
2143  if (! has_fuzz_from && ! has_fuzz_to) {
2144  return;
2145  }
2146 
2147  // check for invalid fuzz on both ends of Interval
2148  if (has_fuzz_from && has_fuzz_to && fuzz_from == fuzz_to) {
2149  if (fuzz_from == CInt_fuzz::eLim_tl) {
2152  "Should not specify 'space to left' for both ends of interval", obj);
2153  }
2154  else if (fuzz_from == CInt_fuzz::eLim_tr) {
2157  "Should not specify 'space to right' for both ends of interval", obj);
2158  }
2159  else if (fuzz_from == CInt_fuzz::eLim_circle) {
2162  "Should not specify 'origin of circle' for both ends of interval", obj);
2163  }
2164  }
2165 
2166  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(interval.GetId());
2167  if (! bsh) {
2168  return;
2169  }
2170 
2172  if (bsh.IsSetInst_Topology()) {
2173  top = bsh.GetInst_Topology();
2174  }
2175 
2176  if (top != CSeq_inst::eTopology_circular) {
2177 
2178  // VR-15
2179  // look for space to left at beginning of sequence or space to right at end
2180  if (fuzz_from == CInt_fuzz::eLim_tl && interval.IsSetFrom() && interval.GetFrom() == 0) {
2182  }
2183  if (fuzz_to == CInt_fuzz::eLim_tr && interval.IsSetTo() && interval.GetTo() == bsh.GetBioseqLength() - 1) {
2185  }
2186 
2187  } else if (fuzz_from == CInt_fuzz::eLim_circle || fuzz_to == CInt_fuzz::eLim_circle) {
2188 
2189  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2190  const CSeq_feat* sfp = dynamic_cast<const CSeq_feat*>(&obj);
2191  if (sfp && sfp->IsSetExcept() && sfp->CanGetExcept_text() && NStr::FindNoCase(sfp->GetExcept_text(), "ribosomal slippage") != NPOS) {
2192  return;
2193  }
2194  }
2195 
2196  // VR-832
2197  if (fuzz_from == CInt_fuzz::eLim_circle && interval.IsSetFrom() && interval.GetFrom() != 0) {
2199  }
2200  if (fuzz_to == CInt_fuzz::eLim_circle && interval.IsSetTo() && interval.GetTo() != bsh.GetBioseqLength() - 1) {
2202  }
2203  }
2204 }
2205 
2206 
2208 {
2209  // VR-15
2210  if (!point.IsSetFuzz() || !point.GetFuzz().IsLim() ||
2211  (point.GetFuzz().GetLim() != CInt_fuzz::eLim_tl && point.GetFuzz().GetLim() != CInt_fuzz::eLim_tr) ||
2212  !point.IsSetId() || !point.IsSetPoint()) {
2213  return;
2214  }
2215  CBioseq_Handle bsh = m_Scope->GetBioseqHandle(point.GetId());
2216  if (!bsh) {
2217  return;
2218  }
2220  return;
2221  }
2222  if (point.GetPoint() == 0 && point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl) {
2224  }
2225  if (point.GetPoint() == bsh.GetBioseqLength() - 1) {
2227  }
2228 }
2229 
2230 
// Walks a Seq-loc with the iterator `lit` and forwards each interval,
// packed-interval and point component to the matching x_ReportInvalidFuzz
// overload; every other component type is ignored.
//   loc - location whose components are inspected
//   obj - object passed through to the overloads
// NOTE(review): the declaration of `lit` (original line 2233) and the case label
// in front of the GetPacked_int() call (original line 2240) are omitted from
// this listing.
2231 void CValidError_imp::x_ReportInvalidFuzz(const CSeq_loc& loc, const CSerialObject& obj)
2232 {
2234  for (; lit; ++lit) {
2235  CSeq_loc::E_Choice loc_choice = lit->Which();
2236  switch (loc_choice) {
2237  case CSeq_loc::e_Int:
2238  x_ReportInvalidFuzz(lit->GetInt(), obj);
2239  break;
2241  x_ReportInvalidFuzz(lit->GetPacked_int(), obj);
2242  break;
2243  case CSeq_loc::e_Pnt:
2244  x_ReportInvalidFuzz(lit->GetPnt(), obj);
2245  break;
2246  default:
// nothing to check for the remaining location types
2247  break;
2248  }
2249  }
2250 }
2251 
2252 
// Returns how many of the elements visited by the iterator `lit` report
// IsMix() == true.
// NOTE(review): the declaration of `lit` (original line 2256) is omitted from
// this listing, so how it is built from `loc` cannot be confirmed here.
2253 unsigned int s_CountMix(const CSeq_loc& loc)
2254 {
2255  unsigned int num_mix = 0;
2257  for (; lit; ++lit) {
2258  if (lit->IsMix()) {
2259  num_mix++;
2260  }
2261  }
2262  return num_mix;
2263 }
2264 
2265 
2266 void CValidError_imp::x_InitLocCheck(SLocCheck& lc, const string& prefix)
2267 {
2268  lc.chk = true;
2269  lc.unmarked_strand = false;
2270  lc.mixed_strand = false;
2271  lc.has_other = false;
2272  lc.has_not_other = false;
2273  lc.id_cur = nullptr;
2274  lc.id_prv = nullptr;
2275  lc.int_cur = nullptr;
2276  lc.int_prv = nullptr;
2277  lc.strand_cur = eNa_strand_unknown;
2278  lc.strand_prv = eNa_strand_unknown;
2279  lc.prefix = prefix;
2280 }
2281 
2283 {
2284  if (lc.strand_prv != eNa_strand_other &&
2285  lc.strand_cur != eNa_strand_other) {
2286  if (lc.id_cur && lc.id_prv &&
2287  IsSameBioseq(*lc.id_cur, *lc.id_prv, m_Scope)) {
2288  if (lc.strand_prv != lc.strand_cur) {
2289  if ((lc.strand_prv == eNa_strand_plus &&
2290  lc.strand_cur == eNa_strand_unknown) ||
2291  (lc.strand_prv == eNa_strand_unknown &&
2292  lc.strand_cur == eNa_strand_plus)) {
2293  lc.unmarked_strand = true;
2294  } else {
2295  lc.mixed_strand = true;
2296  }
2297  }
2298  }
2299  }
2300  if (lc.strand_cur == eNa_strand_other) {
2301  lc.has_other = true;
2302  } else if (lc.strand_cur == eNa_strand_minus || lc.strand_cur == eNa_strand_plus) {
2303  lc.has_not_other = true;
2304  }
2305 
2306 }
2307 
// Checks one Seq-loc component and records what was seen in `lc`; a mix is
// handled by recursing into each of its sub-locations.
//   loc      - location (or sub-location) to check
//   obj      - object passed along with the reported messages
//   lc       - running state: current/previous id, interval and strand, the
//              result of the last range check (lc.chk) and the strand flags
//   lowerSev - when true, a failed check uses eDiag_Error instead of
//              eDiag_Critical; the e_Int branch may switch it back off
// Exceptions derived from `exception` are caught inside the function.
// NOTE(review): original lines 2340, 2350, 2358, 2373, 2378 and 2385 are omitted
// from this listing (two case labels and several calls), so the comments below
// describe only the statements that are visible.
2308 void CValidError_imp::x_CheckLoc(const CSeq_loc& loc, const CSerialObject& obj, SLocCheck& lc, bool lowerSev)
2309 {
2310  try {
2311  switch (loc.Which()) {
2312  case CSeq_loc::e_Int:
// single interval: x_CheckSeqInt fills in id/strand and returns the check result
2313  lc.int_cur = &loc.GetInt();
2314  lc.chk = x_CheckSeqInt(lc.id_cur, lc.int_cur, lc.strand_cur);
2315  if (lc.strand_cur == eNa_strand_other) {
2316  lc.has_other = true;
2317  }
// the lower severity is kept only when the interval starts inside the
// sequence and ends at or beyond its length
2318  if ((!lc.chk) && lowerSev) {
2319  TSeqPos length = GetLength(loc.GetInt().GetId(), m_Scope);
2320  TSeqPos fr = loc.GetInt().GetFrom();
2321  TSeqPos to = loc.GetInt().GetTo();
2322  if (fr < length && to >= length) {
2323  // RefSeq variation feature with dbSNP xref and interval flanking the length is ERROR
2324  } else {
2325  // otherwise keep severity at REJECT
2326  lowerSev = false;
2327  }
2328  }
2329  break;
2330  case CSeq_loc::e_Pnt:
// single point: strand defaults to unknown when not set
2331  lc.strand_cur = loc.GetPnt().IsSetStrand() ?
2332  loc.GetPnt().GetStrand() : eNa_strand_unknown;
2333  if (lc.strand_cur == eNa_strand_other) {
2334  lc.has_other = true;
2335  }
2336  lc.id_cur = &loc.GetPnt().GetId();
2337  lc.chk = IsValid(loc.GetPnt(), m_Scope);
2338  lc.int_prv = nullptr;
2339  break;
// packed points (the case label itself is not visible in this listing)
2341  lc.strand_cur = loc.GetPacked_pnt().IsSetStrand() ?
2342  loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
2343  if (lc.strand_cur == eNa_strand_other) {
2344  lc.has_other = true;
2345  }
2346  lc.id_cur = &loc.GetPacked_pnt().GetId();
2347  lc.chk = IsValid(loc.GetPacked_pnt(), m_Scope);
2348  lc.int_prv = nullptr;
2349  break;
// packed intervals (the case label itself is not visible in this listing)
2351  x_CheckPackedInt(loc.GetPacked_int(), lc, obj);
2352  break;
2353  case CSeq_loc::e_Null:
2354  break;
2355  case CSeq_loc::e_Mix:
// recurse into every sub-location, sharing the same `lc` state
2356  for (auto l : loc.GetMix().Get()) {
2357  x_CheckLoc(*l, obj, lc, lowerSev);
2359  }
2360  break;
2361  default:
// any other location type: strand 'other', no id, no previous interval
2362  lc.strand_cur = eNa_strand_other;
2363  lc.id_cur = nullptr;
2364  lc.int_prv = nullptr;
2365  break;
2366  }
// failed check: build the "out of range" message text with the chosen severity
2367  if (!lc.chk) {
2368  string lbl = GetValidatorLocationLabel (loc, *m_Scope);
2369  EDiagSev sev = eDiag_Critical;
2370  if (lowerSev) {
2371  sev = eDiag_Error;
2372  }
2374  lc.prefix + ": SeqLoc [" + lbl + "] out of range", obj);
2375  }
2376 
// everything except a Null location becomes the new "previous" strand and id
2377  if (loc.Which() != CSeq_loc::e_Null) {
2379 
2380  lc.strand_prv = lc.strand_cur;
2381  lc.id_prv = lc.id_cur;
2382  }
2383  } catch( const exception& e ) {
// the exception text goes into the message and the "current" state is reset
2384  string label = GetValidatorLocationLabel(loc, *m_Scope);
2386  "Exception caught while validating location " +
2387  label + ". Exception: " + e.what(), obj);
2388 
2389  lc.strand_cur = eNa_strand_other;
2390  lc.id_cur = nullptr;
2391  lc.int_prv = nullptr;
2392  }
2393 }
2394 
2396 (const CSeq_loc& loc,
2397  const CBioseq_Handle& seq,
2398  bool report_abutting,
2399  const string& prefix,
2400  const CSerialObject& obj,
2401  bool lowerSev)
2402 {
2403  SLocCheck lc;
2404 
2405  x_InitLocCheck(lc, prefix);
2406 
2407  x_CheckLoc(loc, obj, lc, lowerSev);
2408 
2409  if (lc.has_other && lc.has_not_other) {
2410  string label = GetValidatorLocationLabel(loc, *m_Scope);
2412  prefix + ": Inconsistent use of other strand SeqLoc [" + label + "]", obj);
2413  } else if (lc.has_other && NStr::Equal(prefix, "Location")) {
2416  "Strand 'other' in location", obj);
2417  }
2418 
2419  x_ReportInvalidFuzz(loc, obj);
2420 
2424  "Duplicate exons in location", obj);
2425  }
2426 
2427  if (s_CountMix(loc) > 1) {
2428  string label;
2429  loc.GetLabel(&label);
2431  prefix + ": SeqLoc [" + label + "] has nested SEQLOC_MIX elements",
2432  obj);
2433  }
2434 
2435  // Warn if different parts of a seq-loc refer to the same bioseq using
2436  // different id types (e.g. gi and accession)
2437  ValidateSeqLocIds(loc, obj);
2438 
2439  bool trans_splice = false;
2440  bool circular_rna = false;
2441  bool exception = false;
2442  const CSeq_feat* sfp = nullptr;
2443  if (obj.GetThisTypeInfo() == CSeq_feat::GetTypeInfo()) {
2444  sfp = dynamic_cast<const CSeq_feat*>(&obj);
2445  }
2446  if (sfp) {
2447  // primer_bind intervals MAY be on opposite strands
2449  lc.mixed_strand = false;
2450  lc.unmarked_strand = false;
2451  }
2452 
2453  exception = sfp->IsSetExcept() ? sfp->GetExcept() : false;
2454  if (exception && sfp->CanGetExcept_text()) {
2455  if (NStr::FindNoCase(sfp->GetExcept_text(), "trans-splicing") != NPOS) {
2456  // trans splicing exception turns off both mixed_strand and
2457  // out_of_order messages
2458  trans_splice = true;
2459  } else if (NStr::FindNoCase(sfp->GetExcept_text(), "circular RNA") != NPOS) {
2460  // circular RNA exception turns off out_of_order message
2461  circular_rna = true;
2462  }
2463  }
2464  }
2465 
2466  string loc_lbl;
2467  if (report_abutting && (!sfp || !CSeqFeatData::AllowAdjacentIntervals(sfp->GetData().GetSubtype())) &&
2469  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2470 
2471  EDiagSev sev = exception ? eDiag_Warning : eDiag_Error;
2473  prefix + ": Adjacent intervals in SeqLoc [" +
2474  loc_lbl + "]", obj);
2475  }
2476 
2477  if (trans_splice && !NStr::Equal(prefix, "Product")) {
2478  CSeq_loc_CI li(loc);
2479  ++li;
2480  if (!li) {
2481  PostErr(eDiag_Warning, eErr_SEQ_FEAT_BadTranssplicedInterval, "Trans-spliced feature should have multiple intervals", obj);
2482  }
2483  return;
2484  }
2485 
2486  bool ordered = true;
2487  bool circular = false;
2488  if ( seq &&
2489  seq.IsSetInst() && seq.GetInst().IsSetTopology() &&
2491  circular = true;
2492  }
2493  try {
2494  if (m_Scope && (!sfp || CSeqFeatData::RequireLocationIntervalsInBiologicalOrder(sfp->GetData().GetSubtype())) && !circular) {
2496  }
2497  } catch ( const CException& ex) {
2498  string label;
2499  loc.GetLabel(&label);
2501  "Exception caught while validating location " +
2502  label + ". Exception: " + ex.what(), obj);
2503  }
2504 
2505  if (lc.mixed_strand || lc.unmarked_strand || !ordered) {
2506  if (loc_lbl.empty()) {
2507  loc_lbl = GetValidatorLocationLabel(loc, *m_Scope);
2508  }
2509  if (lc.mixed_strand) {
2510  if (IsSmallGenomeSet()) {
2512  prefix + ": Mixed strands in SeqLoc ["
2513  + loc_lbl + "] in small genome set - set trans-splicing exception if appropriate", obj);
2514  } else {
2515  EDiagSev sev = eDiag_Error;
2516  if (IsGeneious() || (sfp && sequence::IsPseudo(*sfp, *m_Scope))) {
2517  sev = eDiag_Warning;
2518  }
2520  prefix + ": Mixed strands in SeqLoc ["
2521  + loc_lbl + "]", obj);
2522  }
2523  } else if (lc.unmarked_strand) {
2525  prefix + ": Mixed plus and unknown strands in SeqLoc ["
2526  + loc_lbl + "]", obj);
2527  }
2528  if (!ordered && !circular_rna) {
2529  if (IsSmallGenomeSet()) {
2531  prefix + ": Intervals out of order in SeqLoc [" +
2532  loc_lbl + "]", obj);
2533  } else {
2535  prefix + ": Intervals out of order in SeqLoc [" +
2536  loc_lbl + "]", obj);
2537  }
2538  }
2539  return;
2540  }
2541 
2542  if ( seq &&
2543  seq.IsSetInst_Repr() &&
2544  seq.GetInst_Repr() != CSeq_inst::eRepr_seg ) {
2545  return;
2546  }
2547 
2548  // Check for intervals out of order on segmented Bioseq
2549  if ( seq && BadSeqLocSortOrder(seq, loc) && !circular_rna ) {
2550  if (loc_lbl.empty()) {
2551  loc.GetLabel(&loc_lbl);
2552  }
2554  prefix + "Intervals out of order in SeqLoc [" +
2555  loc_lbl + "]", obj);
2556  }
2557 
2558  // Check for mixed strand on segmented Bioseq
2559  if ( IsMixedStrands(loc) ) {
2560  if (loc_lbl.empty()) {
2561  loc.GetLabel(&loc_lbl);
2562  }
2564  prefix + ": Mixed strands in SeqLoc [" +
2565  loc_lbl + "]", obj);
2566  }
2567 }
2568 
2569 
2571 {
2572  if (!SeqIsPatent(seq)) {
2573  m_BioseqWithNoSource.push_back(CConstRef<CBioseq>(&seq));
2574  }
2575 }
2576 
2577 
2579 {
2580  if (!SeqIsPatent (seq)) {
2582  "The product name is missing from this protein.", *(seq.GetCompleteBioseq()));
2583  }
2584 }
2585 
2586 
2588 {
2589  bool wgs = false;
2590 
2591  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2592  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2593  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_wgs) {
2594  wgs = true;
2595  break;
2596  }
2597  }
2598  if (!wgs) {
2599  return false;
2600  }
2601 
2602  bool is_other = false;
2603  bool has_gi = false;
2604 
2605  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2606  if ((*it)->IsOther()) {
2607  is_other = true;
2608  break;
2609  } else if ((*it)->IsGi()) {
2610  has_gi = true;
2611  break;
2612  }
2613  }
2614  if (!is_other || has_gi) {
2615  return false;
2616  }
2617 
2618  return true;
2619 }
2620 
2621 
2623 {
2624  bool tsa = false;
2625 
2626  FOR_EACH_DESCRIPTOR_ON_BIOSEQ (it, seq) {
2627  if ((*it)->IsMolinfo() && (*it)->GetMolinfo().IsSetTech()
2628  && (*it)->GetMolinfo().GetTech() == CMolInfo::eTech_tsa) {
2629  tsa = true;
2630  break;
2631  }
2632  }
2633  if (!tsa) {
2634  return false;
2635  }
2636 
2637  bool is_other = false;
2638  bool has_gi = false;
2639 
2640  FOR_EACH_SEQID_ON_BIOSEQ (it, seq) {
2641  if ((*it)->IsOther()) {
2642  is_other = true;
2643  break;
2644  } else if ((*it)->IsGi()) {
2645  has_gi = true;
2646  break;
2647  }
2648  }
2649  if (!is_other || has_gi) {
2650  return false;
2651  }
2652 
2653  return true;
2654 }
2655 
2656 
2658 {
2659  if (GetContext().PreprocessHugeFile) {
2660  if (m_pEntryInfo->IsNoBioSource() && !GetContext().IsPatent && !GetContext().IsPDB) {
2661  return;
2662  }
2663  }
2664  else if (m_pEntryInfo->IsNoBioSource() && !m_pEntryInfo->IsPatent() && !m_pEntryInfo->IsPDB()) {
2666  "No source information included on this record.", se);
2667 
2668  if (!GetContext().PostprocessHugeFile) {
2669  return;
2670  }
2671  }
2672 
2673  size_t num_no_source = m_BioseqWithNoSource.size();
2674 
2675  for ( size_t i = 0; i < num_no_source; ++i ) {
2677  "No organism name included in the source. Other qualifiers may exist.",
2678  *(m_BioseqWithNoSource[i]));
2679  }
2680 }
2681 
2682 
2684 {
2685  CConstRef<CSeq_feat> feat;
2686 
2688 
2689  if ( bsh ) {
2690  if ( IsNT() && m_TSE ) {
2691  // In case of a NT bioseq limit the search to features packaged on the
2692  // NT (we assume features have been pulled from the segments to the NT).
2694  sel.SetByProduct()
2696  CFeat_CI fi(bsh, sel);
2697  if ( fi ) {
2698  // return the first one (should be the one packaged on the
2699  // nuc-prot set).
2700  feat.Reset(&(fi->GetOriginalFeature()));
2701  }
2702  } else {
2704  sel.SetByProduct();
2705  CFeat_CI fi(bsh, sel);
2706  if ( fi ) {
2707  // return the first one (should be the one packaged on the
2708  // nuc-prot set).
2709  feat.Reset(&(fi->GetOriginalFeature()));
2710  }
2711  }
2712  }
2713 
2714  return feat;
2715 }
2716 
2717 
2719 {
2721  return GetmRNAGivenProduct(bsh);
2722 }
2723 
2724 
2726 {
2727  CConstRef<CSeq_feat> feat;
2728  if ( bsh ) {
2729  // In case of a NT bioseq limit the search to features packaged on the
2730  // NT (we assume features have been pulled from the segments to the NT).
2731  CSeq_entry_Handle limit;
2732  if ( IsNT() && m_TSE ) {
2733  limit = m_Scope->GetSeq_entryHandle(*m_TSE);
2734  }
2735 
2736  if (limit) {
2738  sel.SetByProduct() .SetLimitTSE(limit);
2739  CFeat_CI fi(bsh, sel);
2740  if ( fi ) {
2741  // return the first one (should be the one packaged on the
2742  // nuc-prot set).
2743  feat.Reset(&(fi->GetOriginalFeature()));
2744  }
2745  } else {
2747  sel.SetByProduct();
2748  CFeat_CI fi(bsh, sel);
2749  if ( fi ) {
2750  // return the first one (should be the one packaged on the
2751  // nuc-prot set).
2752  feat.Reset(&(fi->GetOriginalFeature()));
2753  }
2754  }
2755  }
2756 
2757  return feat;
2758 }
2759 
2760 
2762 (const CBioseq& seq,
2763  CBioseq_set::EClass clss)
2764 {
2765  const CSeq_entry* parent = nullptr;
2766  for ( parent = seq.GetParentEntry();
2767  parent;
2768  parent = parent->GetParentEntry() ) {
2769  if ( parent->IsSet() ) {
2770  const CBioseq_set& set = parent->GetSet();
2771  if ( set.IsSetClass() && set.GetClass() == clss ) {
2772  break;
2773  }
2774  }
2775  }
2776  return parent;
2777 }
2778 
2779 
2780 bool CValidError_imp::IsSerialNumberInComment(const string& comment)
2781 {
2782  size_t pos = comment.find('[', 0);
2783  while ( pos != string::npos ) {
2784  ++pos;
2785  bool okay = true;
2786  if ( isdigit((unsigned char) comment[pos]) ) {
2787  // skip if first character after bracket is 0
2788  if (comment[pos] == '0') {
2789  okay = false;
2790  }
2791  while ( isdigit((unsigned char) comment[pos]) ) {
2792  ++pos;
2793  }
2794  if ( comment[pos] == ']' && okay ) {
2795  return true;
2796  }
2797  }
2798 
2799  pos = comment.find('[', pos);
2800  }
2801  return false;
2802 }
2803 
2804 
2806 {
2807  // okay to have far RefSeq product, but only if genomic product set
2808  if ( sid && sid->IsOther() ) {
2809  if ( IsGPS() ) {
2810  return false;
2811  }
2812  }
2813  // or just a bioseq
2814  if ( GetTSE().IsSeq() ) {
2815  return false;
2816  }
2817 
2818  // or in a standalone Seq-annot
2819  if (IsStandaloneAnnot() ) {
2820  return false;
2821  }
2822  return true;
2823 }
2824 
2825 
2827  vector<TEntrezId>& pmids, vector<TEntrezId>& muids, vector<int>& serials,
2828  vector<string>& published_labels, vector<string>& unpublished_labels)
2829 {
2830  FOR_EACH_SEQDESC_ON_SEQENTRY (it, se) {
2831  if ((*it)->IsPub()) {
2832  CCleanup::GetPubdescLabels ((*it)->GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2833  }
2834  }
2835 
2836  if (se.IsSet()) {
2837  FOR_EACH_SEQENTRY_ON_SEQSET (it, se.GetSet()) {
2838  s_CollectPubDescriptorLabels (**it, pmids, muids, serials, published_labels, unpublished_labels);
2839  }
2840  }
2841 }
2842 
2843 
2845 {
2846  vector<TEntrezId> pmids;
2847  vector<TEntrezId> muids;
2848  vector<int> serials;
2849  vector<string> published_labels;
2850  vector<string> unpublished_labels;
2851 
2852  // collect labels for pubs on record
2853  s_CollectPubDescriptorLabels (*(seh.GetCompleteSeq_entry()), pmids, muids, serials, published_labels, unpublished_labels);
2854 
2856  while (feat) {
2857  CCleanup::GetPubdescLabels (feat->GetData().GetPub(), pmids, muids, serials, published_labels, unpublished_labels);
2858  ++feat;
2859  }
2860 
2861  // now examine citations to determine whether they match a pub on the record
2862  CFeat_CI f (seh);
2863  while (f) {
2864  if (f->IsSetCit() && f->GetCit().IsPub()) {
2865  ITERATE (CPub_set::TPub, cit_it, f->GetCit().GetPub()) {
2866  bool found = false;
2867 
2868  if ((*cit_it)->IsPmid()) {
2869  vector<TEntrezId>::iterator it = pmids.begin();
2870  while (it != pmids.end() && !found) {
2871  if (*it == (*cit_it)->GetPmid()) {
2872  found = true;
2873  }
2874  ++it;
2875  }
2876  if (!found) {
2878  "Citation on feature refers to uid ["
2879  + NStr::NumericToString((*cit_it)->GetPmid().Get())
2880  + "] not on a publication in the record",
2881  f->GetOriginalFeature());
2882  }
2883  } else if ((*cit_it)->IsMuid()) {
2884  vector<TEntrezId>::iterator it = muids.begin();
2885  while (it != muids.end() && !found) {
2886  if (*it == (*cit_it)->GetMuid()) {
2887  found = true;
2888  }
2889  ++it;
2890  }
2891  if (!found) {
2893  "Citation on feature refers to uid ["
2894  + NStr::NumericToString((*cit_it)->GetMuid())
2895  + "] not on a publication in the record",
2896  f->GetOriginalFeature());
2897  }
2898  } else if ((*cit_it)->IsEquiv()) {
2899  continue;
2900  } else {
2901  string label;
2902  (*cit_it)->GetLabel(&label, CPub::eContent, CPub::fLabel_Unique);
2903 
2904  if (NStr::EndsWith (label, ">")) {
2905  label = label.substr(0, label.length() - 2);
2906  }
2907  if(NStr::EndsWith (label, "|")) {
2908  label = label.substr(0, label.length() - 1);
2909  }
2910  if (NStr::EndsWith (label, " ")) {
2911  label = label.substr(0, label.length() - 1);
2912  }
2913  size_t len = label.length();
2914  vector<string>::iterator unpub_it = unpublished_labels.begin();
2915  while (unpub_it != unpublished_labels.end() && !found) {
2916  size_t it_len =(*unpub_it).length();
2917  if (NStr::EqualNocase (*unpub_it, 0, it_len > len ? len : it_len, label)) {
2918  found = true;
2919  }
2920  ++unpub_it;
2921  }
2922  vector<string>::iterator pub_it = published_labels.begin();
2923 
2924  while (pub_it != published_labels.end() && !found) {
2925  size_t it_len =(*pub_it).length();
2926  if (NStr::EqualNocase (*pub_it, 0, it_len > len ? len : it_len, label)) {
2928  "Citation on feature needs to be updated to published uid",
2929  f->GetOriginalFeature());
2930  found = true;
2931  }
2932  ++pub_it;
2933  }
2934  if (!found) {
2936  "Citation on feature refers to a publication not in the record",
2937  f->GetOriginalFeature());
2938  }
2939  }
2940  }
2941  }
2942  ++f;
2943  }
2944 }
2945 
2946 
2947 // =============================================================================
2948 // Private
2949 // =============================================================================
2950 
2951 
2952 
2954 {
2956  for( ; it; ++it) {
2957  const string& str = *it;
2958  FOR_EACH_CHAR_IN_STRING(c_it, str) {
2959  const char& ch = *c_it;
2960  unsigned char chu = ch;
2961  if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
2963  "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in item", obj);
2964  break;
2965  }
2966  }
2967  }
2968 }
2969 
2970 
2972 {
2973  class CScriptTagTextFsm : public CTextFsm<int>
2974  {
2975  public:
2976  CScriptTagTextFsm() {
2977  const char * script_tags[] = {
2978  "<script", "<object", "<applet", "<embed", "<form",
2979  "javascript:", "vbscript:"};
2980  ITERATE_0_IDX(idx, ArraySize(script_tags)) {
2981  AddWord(script_tags[idx], true);
2982  }
2983  Prime();
2984  }
2985 
2986  // Returns true if the given string matches any of the strings
2987  // in the fsm anywhere.
2988  bool DoesStrHaveFsmHits(const string &str) {
2989  int state = GetInitialState();
2990  ITERATE(string, str_it, str) {
2991  state = GetNextState(state, *str_it);
2992  if( IsMatchFound(state) ) {
2993  return true;
2994  }
2995  }
2996 
2997  return false;
2998  }
2999  };
3000  static CScriptTagTextFsm s_ScriptTagFsm;
3001 
3002 
3004  for( ; it; ++it) {
3005  if (s_ScriptTagFsm.DoesStrHaveFsmHits(*it)) {
3007  "Script tag found in item", obj);
3008  return;
3009  }
3010 }
3011 }
3012 
3013 
3014 bool CValidError_imp::IsMixedStrands(const CSeq_loc& loc)
3015 {
3016  if ( SeqLocCheck(loc, m_Scope) == eSeqLocCheck_warning ) {
3017  return false;
3018  }
3019 
3020  CSeq_loc_CI curr(loc);
3021  if ( !curr ) {
3022  return false;
3023  }
3024  CSeq_loc_CI prev = curr;
3025  ++curr;
3026 
3027  while ( curr ) {
3028  ENa_strand curr_strand = curr.GetStrand();
3029  ENa_strand prev_strand = prev.GetStrand();
3030 
3031  if ( (prev_strand == eNa_strand_minus &&
3032  curr_strand != eNa_strand_minus) ||
3033  (prev_strand != eNa_strand_minus &&
3034  curr_strand == eNa_strand_minus) ) {
3035  return true;
3036  }
3037 
3038  prev = curr;
3039  ++curr;
3040  }
3041 
3042  return false;
3043 }
3044 
3045 
3046 static bool s_SeqLocHasGI (const CSeq_loc& loc)
3047 {
3048  bool rval = false;
3049 
3050  for ( CSeq_loc_CI it(loc); it && !rval; ++it ) {
3051  if (it.GetSeq_id().IsGi()) {
3052  rval = true;
3053  }
3054  }
3055  return rval;
3056 }
3057 
3058 
3060 {
3061  m_TSEH = seh;
3063  m_GeneCache.Clear();
3064 }
3065 
3066 
3068 {
3070  return true;
3071  } else {
3072  return false;
3073  }
3074 }
3075 
3076 
3078 {
3079  if (se.IsSeq()) {
3080  return 1;
3081  } else if (!se.IsSet()) {
3082  return 0;
3083  }
3084  if (se.GetSet().IsSetClass()) {
3087  return 1;
3088  }
3089  }
3090  size_t count = 0;
3091  if (se.GetSet().IsSetSeq_set()) {
3092  for (auto it = se.GetSet().GetSeq_set().begin(); it != se.GetSet().GetSeq_set().end(); it++) {
3093  count += s_CountTopSetSiblings(**it);
3094  }
3095  }
3096  return count;
3097 }
3098 
3099 
3101 {
3102  // "Save" the Seq-entry
3103  SetTSE(seh);
3104 
3107 
3108  // If no Pubs/BioSource in CSeq_entry, post only one error
3109  if (GetContext().PreprocessHugeFile) {
3110  x_SetEntryInfo().SetNoPubs(GetContext().NoPubsFound);
3111  x_SetEntryInfo().SetNoCitSubPubs(GetContext().NoCitSubsFound);
3112  x_SetEntryInfo().SetNoBioSource(GetContext().NoBioSource);
3113  } else {
3115  x_SetEntryInfo().SetNoPubs(!pub);
3116  while (pub && !pub->IsSub()) {
3117  ++pub;
3118  }
3122  }
3123 
3124 
3125  // Look for genomic product set
3127  if (si->IsSetClass ()) {
3128  if (si->GetClass () == CBioseq_set::eClass_gen_prod_set) {
3129  x_SetEntryInfo().SetGPS();
3130  }
3131  if (si->GetClass () == CBioseq_set::eClass_small_genome_set) {
3133  }
3134  }
3135  }
3136 
3137  // Examine all Seq-ids on Bioseqs
3138  for (CTypeConstIterator <CBioseq> bi (*m_TSE); bi; ++bi) {
3139  FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, *bi) {
3140  const CSeq_id& sid = **sid_itr;
3141  const CTextseq_id* tsid = sid.GetTextseq_Id();
3142  CSeq_id::E_Choice typ = sid.Which();
3143  switch (typ) {
3144  case CSeq_id::e_not_set:
3145  break;
3146  case CSeq_id::e_Local:
3147  break;
3148  case CSeq_id::e_Gibbsq:
3149  break;
3150  case CSeq_id::e_Gibbmt:
3151  break;
3152  case CSeq_id::e_Giim:
3153  break;
3154  case CSeq_id::e_Genbank:
3157  x_SetEntryInfo().SetGED();
3158  break;
3159  case CSeq_id::e_Embl:
3161  x_SetEntryInfo().SetGED();
3162  x_SetEntryInfo().SetEmbl();
3163  break;
3164  case CSeq_id::e_Pir:
3165  break;
3166  case CSeq_id::e_Swissprot:
3167  break;
3168  case CSeq_id::e_Patent:
3170  break;
3171  case CSeq_id::e_Other:
3173  // and do RefSeq subclasses up front as well
3174  if (sid.GetOther().IsSetAccession()) {
3175  string acc = sid.GetOther().GetAccession().substr(0, 3);
3176  if (acc == "NC_") {
3177  m_IsNC = true;
3178  } else if (acc == "NG_") {
3179  m_IsNG = true;
3180  } else if (acc == "NM_") {
3181  m_IsNM = true;
3182  } else if (acc == "NP_") {
3183  m_IsNP = true;
3184  } else if (acc == "NR_") {
3185  m_IsNR = true;
3186  } else if (acc == "NZ_") {
3187  m_IsNZ = true;
3188  } else if (acc == "NS_") {
3189  m_IsNS = true;
3190  } else if (acc == "NT_") {
3191  m_IsNT = true;
3192  } else if (acc == "NW_") {
3193  m_IsNW = true;
3194  } else if (acc == "WP_") {
3195  m_IsWP = true;
3196  } else if (acc == "XR_") {
3197  m_IsXR = true;
3198  }
3199  }
3200  break;
3201  case CSeq_id::e_General:
3202  if ((*bi).IsAa() && !sid.GetGeneral().IsSkippable()) {
3204  }
3205  break;
3206  case CSeq_id::e_Gi:
3207  x_SetEntryInfo().SetGI();
3209  break;
3210  case CSeq_id::e_Ddbj:
3212  x_SetEntryInfo().SetGED();
3213  x_SetEntryInfo().SetDdbj();
3214  break;
3215  case CSeq_id::e_Prf:
3216  break;
3217  case CSeq_id::e_Pdb:
3218  x_SetEntryInfo().SetPDB();
3219  break;
3220  case CSeq_id::e_Tpg:
3222  break;
3223  case CSeq_id::e_Tpe:
3224  x_SetEntryInfo().SetTPE();
3226  break;
3227  case CSeq_id::e_Tpd:
3229  break;
3230  case CSeq_id::e_Gpipe:
3232  break;
3233  default:
3234  break;
3235  }
3236  if ( tsid && tsid->IsSetAccession() && tsid->IsSetVersion() && tsid->GetVersion() >= 1 ) {
3238  }
3239  if (typ != CSeq_id::e_Local && typ != CSeq_id::e_General) {
3241  }
3242  }
3243  }
3244 
3245  // search all source descriptors for genomic source
3246  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_Source);
3247  desc_ci && !m_pEntryInfo->IsGenomic();
3248  ++desc_ci) {
3249  if (desc_ci->GetSource().IsSetGenome()
3250  && desc_ci->GetSource().GetGenome() == CBioSource::eGenome_genomic) {
3252  }
3253  }
3254 
3255  // search genome build and annotation pipeline user object descriptors
3256  for (CSeqdesc_CI desc_ci (seh, CSeqdesc::e_User);
3257  desc_ci && !m_pEntryInfo->IsGpipe();
3258  ++desc_ci) {
3259  if ( desc_ci->GetUser().IsSetType() ) {
3260  const CUser_object& obj = desc_ci->GetUser();
3261  const CObject_id& oi = obj.GetType();
3262  if ( ! oi.IsStr() ) continue;
3263  if ( NStr::CompareNocase(oi.GetStr(), "GenomeBuild") == 0 ) {
3265  } else if ( NStr::CompareNocase(oi.GetStr(), "StructuredComment") == 0 ) {
3266  ITERATE (CUser_object::TData, field, obj.GetData()) {
3267  if ((*field)->IsSetLabel() && (*field)->GetLabel().IsStr()) {
3268  if (NStr::EqualNocase((*field)->GetLabel().GetStr(), "Annotation Pipeline")) {
3269  if (NStr::EqualNocase((*field)->GetData().GetStr(), "NCBI eukaryotic genome annotation pipeline")) {
3271  }
3272  }
3273  }
3274  }
3275  }
3276  }
3277  }
3278 
3279  // examine features for location gi, product gi, and locus tag
3280  for (CFeat_CI feat_ci (seh);
3282  ++feat_ci) {
3283  if (s_SeqLocHasGI(feat_ci->GetLocation())) {
3285  }
3286  if (feat_ci->IsSetProduct() && s_SeqLocHasGI(feat_ci->GetProduct())) {
3288  }
3289  if (feat_ci->IsSetData() && feat_ci->GetData().IsGene()
3290  && feat_ci->GetData().GetGene().IsSetLocus_tag()
3291  && !NStr::IsBlank (feat_ci->GetData().GetGene().GetLocus_tag())) {
3293  }
3294  }
3295 
3296  if ( m_PrgCallback ) {
3297  m_NumAlign = 0;
3298  for (CTypeConstIterator<CSeq_align> i(*m_TSE); i; ++i) {
3299  m_NumAlign++;
3300  }
3301  m_NumAnnot = 0;
3302  for (CTypeConstIterator<CSeq_annot> i(*m_TSE); i; ++i) {
3303  m_NumAnnot++;
3304  }
3305  m_NumBioseq = 0;
3306  for (CTypeConstIterator<CBioseq> i(*m_TSE); i; ++i) {
3307  m_NumBioseq++;
3308  }
3309  m_NumBioseq_set = 0;
3310  for (CTypeConstIterator<CBioseq_set> i(*m_TSE); i; ++i) {
3311  m_NumBioseq_set++;
3312  }
3313  m_NumDesc = 0;
3314  for (CTypeConstIterator<CSeqdesc> i(*m_TSE); i; ++i) {
3315  m_NumDesc++;
3316  }
3317  m_NumDescr = 0;
3318  for (CTypeConstIterator<CSeq_descr> i(*m_TSE); i; ++i) {
3319  m_NumDescr++;
3320  }
3321  m_NumFeat = 0;
3322  for (CTypeConstIterator<CSeq_feat> i(*m_TSE); i; ++i) {
3323  m_NumFeat++;
3324  }
3325  m_NumGraph = 0;
3326  for (CTypeConstIterator<CSeq_graph> i(*m_TSE); i; ++i) {
3327  m_NumGraph++;
3328  }
3331  m_NumGraph;
3332  }
3333 
3334  if (CNcbiApplication::Instance()->GetProgramDisplayName() == "table2asn") {
3335  m_IsTbl2Asn = true;
3336  }
3337 }
3338 
3339 
3341 {
3342  m_Scope.Reset(new CScope(*m_ObjMgr));
3343  m_Scope->AddTopLevelSeqEntry(*const_cast<CSeq_entry*>(&se));
3344  m_Scope->AddDefaults();
3345 }
3346 
3347 
3349 {
3350  m_IsStandaloneAnnot = true;
3351  if (! m_Scope) {
3352  m_Scope.Reset(& sah.GetScope());
3353  }
3355  m_TSE.Reset(new CSeq_entry); // set a dummy Seq-entry
3357 }
3358 
3359 
3361 {
3362  m_Scope.Reset(new CScope(*m_ObjMgr));
3363  CRef<CSeq_entry> tmp_entry(new CSeq_entry());
3364  tmp_entry->SetSeq().Assign(seq);
3365  m_TSE.Reset(tmp_entry);
3367  Setup(m_TSEH);
3368  return m_TSEH;
3369 }
3370 
3371 
3373 (const CSeq_loc& loc,
3374  const CSerialObject& obj)
3375 {
3376  for ( CSeq_loc_CI lit(loc); lit; ++lit ) {
3377  const CSeq_id& id1 = lit.GetSeq_id();
3378  CSeq_loc_CI lit2 = lit;
3379  for ( ++lit2; lit2; ++lit2 ) {
3380  const CSeq_id& id2 = lit2.GetSeq_id();
3381  if ( IsSameBioseq(id1, id2, m_Scope) && !id1.Match(id2) ) {
3384  "Two ids refer to the same bioseq but are of "
3385  "different type", obj);
3386  }
3387  }
3388  if (IsTemporary(id1)) {
3390  "Feature locations should not use Seq-ids that will be stripped during ID load", obj);
3391  }
3392  }
3395  "Feature location intervals should all be on the same sequence", obj);
3396  }
3397 }
3398 
3399 
3401 {
3402  return validator::IsInOrganelleSmallGenomeSet(id, scope);
3403 }
3404 
3405 
3406 // all ids in a location should point to the same sequence, unless the sequences are
3407 // in an organelle small genome set
3408 bool CValidError_imp::BadMultipleSequenceLocation(const CSeq_loc& loc, CScope& scope)
3409 {
3410  return validator::BadMultipleSequenceLocation(loc, scope);
3411 }
3412 
3413 
3414 bool CValidError_imp::x_IsFarFetchFailure (const CSeq_loc& loc)
3415 {
3417  && IsFarLocation(loc, GetTSEH())) {
3418  return true;
3419  } else {
3420  return false;
3421  }
3422 }
3423 
3424 
3425 //LCOV_EXCL_START
3426 // not used by asnvalidate, used by external programs
3428 {
3429  bool rval = false;
3430  Setup(se);
3431  CValidError_bioseq bioseq_validator(*this);
3433  while (bi) {
3434  rval |= bioseq_validator.GetTSANStretchErrors(*(bi->GetCompleteBioseq()));
3435  ++bi;
3436  }
3437  return rval;
3438 }
3439 
3440 
3442 {
3443  CSeq_entry_Handle seh = Setup(seq);
3444  CValidError_bioseq bioseq_validator(*this);
3445  return bioseq_validator.GetTSANStretchErrors(*(seh.GetSeq().GetCompleteBioseq()));
3446 }
3447 
3448 
3450 {
3451  bool rval = false;
3452  Setup(se);
3453  CValidError_feat feat_validator(*this);
3454  CFeat_CI fi(se);
3455  while (fi) {
3457  if (bsh) {
3458  rval |= feat_validator.GetTSACDSOnMinusStrandErrors(*(fi->GetSeq_feat()), *(bsh.GetCompleteBioseq()));
3459  }
3460  ++fi;
3461  }
3462 
3463  return rval;
3464 }
3465 
3466 
3468 {
3469  CSeq_entry_Handle seh = Setup(seq);
3470  CValidError_feat feat_validator(*this);
3471  return feat_validator.GetTSACDSOnMinusStrandErrors(f, *(seh.GetSeq().GetCompleteBioseq()));
3472 }
3473 
3474 
3476 {
3477  bool rval = false;
3478  Setup(se);
3479  CValidError_bioseq bioseq_validator(*this);
3481  while (bi) {
3482  rval |= bioseq_validator.GetTSAConflictingBiomolTechErrors(*(bi->GetCompleteBioseq()));
3483  ++bi;
3484  }
3485  return rval;
3486 }
3487 
3488 
3490 {
3491  CSeq_entry_Handle seh = Setup(seq);
3492  CValidError_bioseq bioseq_validator(*this);
3493  return bioseq_validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
3494 }
3495 //LCOV_EXCL_STOP
3496 
// Message texts for the individual barcode test failures.  The
// ADD_BARCODE_ERR macro below picks one of these by pasting the test name
// onto the "k" prefix, so each constant's name must stay "k" + test name.
const string kTooShort("Too Short");
const string kMissingPrimers("Missing Primers");
const string kMissingCountry("Missing Country");
const string kMissingVoucher("Missing Voucher");
const string kBadCollectionDate("Bad Collection Date");
const string kTooManyNs("Too Many Ns");
const string kMissingOrderAssignment("Missing Order Assignment");
const string kLowTrace("Low Trace");
const string kFrameShift("Frame Shift");
const string kStructuredVoucher("Structured Voucher");
3507 
// Posts the warning for one failed barcode test and appends that test's
// message text to the comma-separated summary being built.
//
// TestName is pasted onto "eErr_GENERIC_Barcode" to form the error code and
// onto "k" to form the message constant.  The expansion site must provide
// "sq" (the object the warning is posted on) and "msg" (the summary string).
//
// The expansion is a single brace-enclosed compound statement, so the macro
// stays one unit even under an unbraced "if", and it may be written with or
// without a trailing semicolon.
#define ADD_BARCODE_ERR(TestName) \
    { \
        PostErr(eDiag_Warning, eErr_GENERIC_Barcode##TestName, k##TestName, sq); \
        if (!msg.empty()) { \
            msg += ","; \
        } \
        msg += k##TestName; \
    }
3514 
3516 {
3518  for (auto r : results) {
3519  const CBioseq& sq = *(r.bsh.GetCompleteBioseq());
3520  if (BarcodeTestFails(r)){
3521  string msg;
3522  if (r.length) {
3523  ADD_BARCODE_ERR(TooShort)
3524  }
3525  if (r.primers) {
3526  ADD_BARCODE_ERR(MissingPrimers)
3527  }
3528  if (r.country) {
3529  ADD_BARCODE_ERR(MissingCountry)
3530  }
3531  if (r.voucher) {
3532  ADD_BARCODE_ERR(MissingVoucher)
3533  }
3534  if (!r.percent_n.empty()) {
3536  if (!msg.empty()) {
3537  msg += ",";
3538  }
3539  msg += kTooManyNs + ":" + r.percent_n;
3540  }
3541  if (r.collection_date) {
3542  ADD_BARCODE_ERR(BadCollectionDate)
3543  }
3544  if (r.order_assignment) {
3545  ADD_BARCODE_ERR(MissingOrderAssignment)
3546  }
3547  if (r.low_trace) {
3548  ADD_BARCODE_ERR(LowTrace)
3549  }
3550  if (r.frame_shift) {
3551  ADD_BARCODE_ERR(FrameShift)
3552  }
3553  if (!r.structured_voucher) {
3554  ADD_BARCODE_ERR(StructuredVoucher)
3555  }
3556  PostErr(eDiag_Info, eErr_GENERIC_BarcodeTestFails, "FAIL (" + msg + ")", sq);
3557  } else {
3559  }
3560  }
3561 }
3562 
3563 
3567 bool CValidError_imp::IsGPS() const { return GetEntryInfo().IsGPS(); }
3568 bool CValidError_imp::IsGED() const { return GetEntryInfo().IsGED(); }
3569 bool CValidError_imp::IsPDB() const { return GetEntryInfo().IsPDB(); }
3572 bool CValidError_imp::IsEmbl() const { return GetEntryInfo().IsEmbl(); }
3573 bool CValidError_imp::IsDdbj() const { return GetEntryInfo().IsDdbj(); }
3574 bool CValidError_imp::IsTPE() const { return GetEntryInfo().IsTPE(); }
3575 bool CValidError_imp::IsNC() const { return m_IsNC; }
3576 bool CValidError_imp::IsNG() const { return m_IsNG; }
3577 bool CValidError_imp::IsNM() const { return m_IsNM; }
3578 bool CValidError_imp::IsNP() const { return m_IsNP; }
3579 bool CValidError_imp::IsNR() const { return m_IsNR; }
3580 bool CValidError_imp::IsNS() const { return m_IsNS; }
3581 bool CValidError_imp::IsNT() const { return m_IsNT; }
3582 bool CValidError_imp::IsNW() const { return m_IsNW; }
3583 bool CValidError_imp::IsNZ() const { return m_IsNZ; }
3584 bool CValidError_imp::IsWP() const { return m_IsWP; }
3585 bool CValidError_imp::IsXR() const { return m_IsXR; }
3586 bool CValidError_imp::IsGI() const { return GetEntryInfo().IsGI(); }
3588 bool CValidError_imp::IsGpipe() const { return GetEntryInfo().IsGpipe(); }
3601 
3602 
3603 
3604 // =============================================================================
3605 // CValidError_base Implementation
3606 // =============================================================================
3607 
3608 
3610  m_Imp(imp), m_Scope(imp.GetScope())
3611 {
3612 }
3613 
3614 
3616 {
3617 }
3618 
3619 
3621 (EDiagSev sv,
3622  EErrType et,
3623  const string& msg,
3624  const CSerialObject& obj)
3625 {
3626  m_Imp.PostErr(sv, et, msg, obj);
3627 }
3628 
3629 
3630 //void CValidError_base::PostErr
3631 //(EDiagSev sv,
3632 // EErrType et,
3633 // const string& msg,
3634 // TDesc ds)
3635 //{
3636 // m_Imp.PostErr(sv, et, msg, ds);
3637 //}
3638 
3639 
3641 (EDiagSev sv,
3642  EErrType et,
3643  const string& msg,
3644  const CSeq_feat& ft)
3645 {
3646  m_Imp.PostErr(sv, et, msg, ft);
3647 }
3648 
3649 
3651 (EDiagSev sv,
3652  EErrType et,
3653  const string& msg,
3654  const CBioseq& sq)
3655 {
3656  m_Imp.PostErr(sv, et, msg, sq);
3657 }
3658 
3659 
3661 (EDiagSev sv,
3662  EErrType et,
3663  const string& msg,
3664  const CSeq_entry& ctx,
3665  const CSeqdesc& ds)
3666 {
3667  m_Imp.PostErr(sv, et, msg, ctx, ds);
3668 }
3669 
3670 
3672 (EDiagSev sv,
3673  EErrType et,
3674  const string& msg,
3675  const CBioseq_set& set)
3676 {
3677  m_Imp.PostErr(sv, et, msg, set);
3678 }
3679 
3680 
3682 (EDiagSev sv,
3683  EErrType et,
3684  const string& msg,
3685  const CSeq_annot& annot)
3686 {
3687  m_Imp.PostErr(sv, et, msg, annot);
3688 }
3689 
3691 (EDiagSev sv,
3692  EErrType et,
3693  const string& msg,
3694  const CSeq_graph& graph)
3695 {
3696  m_Imp.PostErr(sv, et, msg, graph);
3697 }
3698 
3699 
3701 (EDiagSev sv,
3702  EErrType et,
3703  const string& msg,
3704  const CBioseq& sq,
3705  const CSeq_graph& graph)
3706 {
3707  m_Imp.PostErr(sv, et, msg, sq, graph);
3708 }
3709 
3710 
3712 (EDiagSev sv,
3713  EErrType et,
3714  const string& msg,
3715  const CSeq_align& align)
3716 {
3717  m_Imp.PostErr(sv, et, msg, align);
3718 }
3719 
3720 
3722 (EDiagSev sv,
3723  EErrType et,
3724  const string& msg,
3725  const CSeq_entry& entry)
3726 {
3727  m_Imp.PostErr(sv, et, msg, entry);
3728 }
3729 
3730 CCacheImpl&
3732 {
3733  return m_Imp.GetCache();
3734 }
3735 
3736 
3738 {
3739  CSeq_entry_Handle parent = seh.GetParentEntry();
3740  if (!parent || !parent.IsSet()) {
3741  return false;
3742  }
3744  if (!pset) {
3745  return false;
3746  }
3747  if (pset->IsSetSeq_set() && pset->GetSeq_set().size() > 10) {
3748  return true;
3749  } else {
3750  return s_HasTopSetSiblings(parent);
3751  }
3752 }
3753 
3754 
3756 {
3757  CSeq_entry_Handle appropriate_parent;
3758 
3759  CSeq_entry_Handle np;
3760  CSeq_entry_Handle gps;
3761  if (seh.IsSet() && seh.GetSet().IsSetClass()) {
3762  if (seh.GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
3763  np = seh;
3764  } else if (s_IsGoodTopSetClass(seh.GetSet().GetClass())) {
3765  gps = seh;
3766  }
3767  } else if (seh.IsSeq()) {
3769  if (p && p.IsSet() && p.GetSet().IsSetClass()) {
3771  np = p;
3772  } else if (s_IsGoodTopSetClass(p.GetSet().GetClass())) {
3773  gps = p;
3774  }
3775  }
3776  }
3777  if (gps) {
3778  appropriate_parent = gps;
3779  } else if (np) {
3781  if (gp && gp.IsSet() && gp.GetSet().IsSetClass() &&
3783  appropriate_parent = gp;
3784  } else {
3785  appropriate_parent = np;
3786  }
3787  } else {
3788  appropriate_parent = seh;
3789  }
3790  return appropriate_parent;
3791 }
3792 
3793 
3796  CConstRef<CPubdesc> pub)
3797 {
3798  // first, try to receive from cache
3800  m_pubdescCache.find(pub);
3801  if( find_iter != m_pubdescCache.end() ) {
3802  return *find_iter->second;
3803  }
3804 
3805  CRef<CPubdescInfo> pInfo(new CPubdescInfo);
3807  *pub, pInfo->m_pmids, pInfo->m_muids,
3808  pInfo->m_serials, pInfo->m_published_labels,
3809  pInfo->m_unpublished_labels);
3810  m_pubdescCache[pub] = pInfo;
3811  return *pInfo;
3812 }
3813 
3814 bool
3816  const SFeatKey & rhs) const
3817 {
3818  if( feat_type != rhs.feat_type ) {
3819  return feat_type < rhs.feat_type;
3820  } else if( feat_subtype != rhs.feat_subtype ) {
3821  return feat_subtype < rhs.feat_subtype;
3822  } else {
3823  return bioseq_h < rhs.bioseq_h;
3824  }
3825 }
3826 
3827 bool
3829  const SFeatKey & rhs) const
3830 {
3831  return (feat_type == rhs.feat_type) &&
3832  (feat_subtype == rhs.feat_subtype) && (bioseq_h == rhs.bioseq_h);
3833 }
3834 
3835 const CCacheImpl::TFeatValue &
3837  const CCacheImpl::SFeatKey & featKey)
3838 {
3839  // check common case where already in the cache
3840  TFeatCache::iterator find_iter = m_featCache.find(featKey);
3841  if( find_iter != m_featCache.end() ) {
3842  return find_iter->second;
3843  }
3844 
3845  // check if bioseq already processed, but had no entry requested above
3846  SFeatKey bioseq_check_key(
3848  TFeatCache::const_iterator bioseq_find_iter =
3849  m_featCache.find(bioseq_check_key);
3850  if( bioseq_find_iter != m_featCache.end() ) {
3851  // bioseq was already processed,
3852  // it just happened to not have an entry here
3853  return kEmptyFeatValue;
3854  }
3855 
3856  // bioseq never added to cache, so calculate that now
3857 
3858  // to avoid expensive constructions of CFeat_CI's,
3859  // we iterate through all the seqs on
3860  // the bioseq and load them into the cache.
3861  CFeat_CI feat_ci(featKey.bioseq_h);
3862  for( ; feat_ci; ++feat_ci ) {
3863  SFeatKey inner_feat_key(
3864  feat_ci->GetFeatType(), feat_ci->GetFeatSubtype(), featKey.bioseq_h);
3865 
3866  m_featCache[inner_feat_key].push_back(*feat_ci);
3867 
3868  // also add "don't care" entries for partial searches
3869  // (e.g. if caller just wants to search on type but not on
3870  // subtype they can set subtype to kAnyFeatSubtype)
3871  SFeatKey any_type_key = inner_feat_key;
3872  any_type_key.feat_type = kAnyFeatType;
3873  m_featCache[any_type_key].push_back(*feat_ci);
3874 
3875  SFeatKey any_subtype_key = inner_feat_key;
3876  any_subtype_key.feat_subtype = kAnyFeatSubtype;
3877  m_featCache[any_subtype_key].push_back(*feat_ci);
3878 
3879  // for when the caller wants all feats on a bioseq
3880  SFeatKey any_type_or_subtype_key = inner_feat_key;
3881  any_type_or_subtype_key.feat_type = kAnyFeatType;
3882  any_type_or_subtype_key.feat_subtype = kAnyFeatSubtype;
3883  m_featCache[any_type_or_subtype_key].push_back(*feat_ci);
3884  }
3885 
3886  // in case a bioseq has no features, we add a dummy key just to
3887  // remember that so we don't use CFeat_CI again on the same bioseq
3888  m_featCache[bioseq_check_key]; // gets default val
3889 
3890  return m_featCache[featKey];
3891 }
3892 
3895  const vector<SFeatKey> &featKeys)
3896 {
3897  if( featKeys.empty() ) {
3898  return new TFeatValue;
3899  }
3900 
3901  // all featKeys must have the same bioseq
3902  const CBioseq_Handle & bioseq_h = featKeys[0].bioseq_h;
3903  ITERATE(vector<SFeatKey>, feat_it, featKeys) {
3904  if( feat_it->bioseq_h != bioseq_h ) {
3905  throw runtime_error("GetFeatFromCacheMulti must be called with only 1 bioseq in its args");
3906  }
3907  }
3908 
3909  // set prevents dups
3910  set<TFeatValue::value_type> set_of_feats;
3911 
3912  // combine the answers from every key into the set
3913  ITERATE(vector<SFeatKey>, key_it, featKeys ) {
3914  const TFeatValue & feat_value = GetFeatFromCache(*key_it);
3915  copy(BEGIN_COMMA_END(feat_value), inserter(
3916  set_of_feats, set_of_feats.begin()));
3917  }
3918 
3919  // go through every feature on the bioseq and remember any that match what's in the set
3920  // (The purpose of this step is to return the feats in the same
3921  // order they were on the original bioseq. In the future, we may
3922  // consider adding a flag to avoid sorting for time purposes).
3923  AutoPtr<TFeatValue> answer(new TFeatValue);
3924  SFeatKey all_feats_key(
3925  kAnyFeatType, kAnyFeatSubtype, bioseq_h);
3926  const TFeatValue & all_feats_vec = GetFeatFromCache(all_feats_key);
3927  ITERATE(TFeatValue, feat_it, all_feats_vec) {
3928  if( set_of_feats.find(*feat_it) != set_of_feats.end() ) {
3929  answer->push_back(*feat_it);
3930  }
3931  }
3932 
3933  return answer;
3934 }
3935 
3936 
3937 //LCOV_EXCL_START
3938 //not used
3939 bool
3941 {
3942  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3943  return m_eFeatKeyStr < rhs.m_eFeatKeyStr;
3944  }
3945  if( m_bioseq != rhs.m_bioseq ) {
3946  return m_bioseq < rhs.m_bioseq;
3947  }
3948  return s_QuickStringLess(m_feat_str, rhs.m_feat_str);
3949 }
3950 
3951 
3952 bool
3954 {
3955  if( m_eFeatKeyStr != rhs.m_eFeatKeyStr ) {
3956  return false;
3957  }
3958  if( m_bioseq != rhs.m_bioseq ) {
3959  return false;
3960  }
3961  return (m_feat_str == rhs.m_feat_str);
3962 }
3963 
3964 
3965 const CCacheImpl::TFeatValue &
3967  const SFeatStrKey & feat_str_key, const CTSE_Handle & tse_arg)
3968 {
3969  const CBioseq_Handle & search_bsh = feat_str_key.m_bioseq;
3970 
3971  // caller must give us something to work with
3972  _ASSERT(search_bsh || tse_arg);
3973 
3974  const CTSE_Handle & tse = (tse_arg ? tse_arg : search_bsh.GetTSE_Handle());
3975 
3976  // load cache if empty
3978  // (for now just indexes genes, but more may be added in the future)
3980  AutoPtr<CFeat_CI> p_gene_ci;
3981  // if we have TSE, get all features on it; otherwise, just get
3982  // the features from the bioseq
3983  if( tse ) {
3984  p_gene_ci.reset(new CFeat_CI(tse, sel));
3985  } else {
3986  p_gene_ci.reset(new CFeat_CI(search_bsh, sel));
3987  }
3988  CFeat_CI & gene_ci = *p_gene_ci; // for convenience
3989 
3990  for( ; gene_ci; ++gene_ci ) {
3991  CBioseq_Handle bsh = tse.GetScope().GetBioseqHandle(gene_ci->GetLocation());
3992  string label;
3993  const CGene_ref & gene_ref = gene_ci->GetData().GetGene();
3994 
3995  // for each one, add an entry for using given Bioseq and the
3996  // kAnyBioseq (so users can search on any bioseq)
3997  gene_ref.GetLabel(&label);
3998  SFeatStrKey label_key(eFeatKeyStr_Label, bsh, label);
3999  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
4000  if( bsh ) {
4001  label_key.m_bioseq = kAnyBioseq;
4002  m_featStrKeyToFeatsCache[label_key].push_back(*gene_ci);
4003  }
4004 
4005  const string & locus_tag = (
4006  gene_ref.IsSetLocus_tag() ? gene_ref.GetLocus_tag() :
4007  kEmptyStr);
4008  SFeatStrKey locus_tag_key(eFeatKeyStr_LocusTag, bsh, locus_tag);
4009  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
4010  if( bsh ) {
4011  locus_tag_key.m_bioseq = kAnyBioseq;
4012  m_featStrKeyToFeatsCache[locus_tag_key].push_back(*gene_ci);
4013  }
4014  }
4015  }
4016 
4017  // get from cache, if possible
4019  m_featStrKeyToFeatsCache.find(feat_str_key);
4020  if( find_iter != m_featStrKeyToFeatsCache.end() ) {
4021  return find_iter->second;
4022  } else {
4023  // nothing found
4024  return kEmptyFeatValue;
4025  }
4026 }
4027 
4028 
4031  const CCacheImpl::TFeatToBioseqKey & feat_to_bioseq_key,
4032  const CTSE_Handle & tse)
4033 {
4034  // load cache if empty
4035  if( m_featToBioseqCache.empty() ) {
4036  CBioseq_CI bioseq_ci(tse);
4037  for( ; bioseq_ci; ++bioseq_ci ) {
4038  CFeat_CI feat_ci(*bioseq_ci);
4039  for( ; feat_ci; ++feat_ci ) {
4040  m_featToBioseqCache[*feat_ci].insert(*bioseq_ci);
4041  }
4042  }
4043  }
4044 
4045  // we're being given the map to a feature, so we should've loaded
4046  // at least one feature when we loaded the cache
4048 
4049  // load from the cache
4051  m_featToBioseqCache.find(feat_to_bioseq_key);
4052  if( find_iter != m_featToBioseqCache.end() ) {
4053  return find_iter->second;
4054  } else {
4055  const static TFeatToBioseqValue kEmptyFeatToBioseqCache;
4056  return kEmptyFeatToBioseqCache;
4057  }
4058 }
4059 //LCOV_EXCL_STOP
4060 
4064  const CTSE_Handle & tse)
4065 {
4066  _ASSERT(tse);
4067 
4068  // load cache if empty
4069  if( m_IdToBioseqCache.empty() ) {
4070  CBioseq_CI bioseq_ci(tse);
4071  for( ; bioseq_ci; ++bioseq_ci ) {
4072  const CBioseq_Handle::TId & ids = bioseq_ci->GetId();
4073  ITERATE(CBioseq_Handle::TId, id_it, ids) {
4074  m_IdToBioseqCache[id_it->GetSeqId()] = *bioseq_ci;
4075  }
4076  }
4077  }
4078 
4079  // there should be at least one Bioseq otherwise there wouldn't
4080  // be anything to validate.
4082 
4084  if( find_iter != m_IdToBioseqCache.end() ) {
4085  return find_iter->second;
4086  } else {
4087  static const TIdToBioseqValue s_EmptyResult;
4088  return s_EmptyResult;
4089  }
4090 }
4091 
4094  CScope *scope, const CSeq_loc& loc, const CTSE_Handle & tse)
4095 {
4096  _ASSERT(scope || tse);
4097  if( ! tse || (!tse.GetTopLevelEntry().IsSet() && !tse.GetTopLevelEntry().IsSeq())) {
4098  // fall back on old style
4099  return BioseqHandleFromLocation(scope, loc);
4100  }
4101 
4102 
4103  for ( CSeq_loc_CI citer (loc); citer; ++citer) {
4104  CConstRef<CSeq_id> id(&citer.GetSeq_id());
4105  const TIdToBioseqValue & bioseq = GetIdToBioseq(id, tse);
4106  if( bioseq ) {
4107  return bioseq;
4108  }
4109  }
4110 
4111  // nothing found, so fall back on old style if possible
4112  if( scope ) {
4113  return BioseqHandleFromLocation(scope, loc);
4114  } else {
4115  return kEmptyBioseqHandle;
4116  }
4117 }
4118 
4119 
4121 {
4123  m_featCache.clear();
4127 }
4128 
4129 
4130 
4131 
4132 
4133 END_SCOPE(validator)
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
EErrType
@ eErr_SEQ_FEAT_WrongQualOnImpFeat
@ eErr_SEQ_DESCR_ObsoleteSourceQual
@ eErr_SEQ_DESCR_ObsoleteSourceLocation
@ eErr_SEQ_INST_FarFetchFailure
@ eErr_SEQ_FEAT_WholeLocation
@ eErr_SEQ_INST_ShortSeq
@ eErr_GENERIC_MissingPubRequirement
@ eErr_SEQ_FEAT_EcNumberProblem
@ eErr_SEQ_FEAT_DuplicateAnticodonInterval
@ eErr_SEQ_INST_CompleteGenomeHasGaps
@ eErr_SEQ_FEAT_CDShasTooManyXs
@ eErr_SEQ_FEAT_TranslExceptPhase
@ eErr_SEQ_FEAT_MinusStrandProtein
@ eErr_SEQ_INST_CompleteTitleProblem
@ eErr_SEQ_PKG_EmptySet
@ eErr_SEQ_DESCR_UnwantedCompleteFlag
@ eErr_SEQ_FEAT_GeneXrefWithoutLocus
@ eErr_SEQ_FEAT_BadLocation
@ eErr_SEQ_FEAT_GenesInconsistent
@ eErr_SEQ_INST_HighNContentStretch
@ eErr_SEQ_PKG_NoBioseqFound
@ eErr_SEQ_FEAT_PseudoRnaHasProduct
@ eErr_SEQ_DESCR_InconsistentBioSources
@ eErr_GENERIC_PastReleaseDate
@ eErr_SEQ_DESCR_BioSourceDbTagConflict
@ eErr_SEQ_FEAT_UnknownImpFeatQual
@ eErr_SEQ_FEAT_DuplicateExonInterval
@ eErr_GENERIC_UnnecessaryPubEquiv
@ eErr_SEQ_DESCR_BioSourceOnProtein
@ eErr_SEQ_DESCR_LatLonRange
@ eErr_SEQ_FEAT_UnnecessaryTranslExcept
@ eErr_SEQ_GRAPH_GraphBioseqId
@ eErr_SEQ_FEAT_MixedStrand
@ eErr_SEQ_FEAT_BadRRNAcomponentOrder
@ eErr_SEQ_DESCR_DuplicatePCRPrimerSequence
@ eErr_SEQ_FEAT_BadGeneOntologyFormat
@ eErr_SEQ_DESCR_LatLonCountry
@ eErr_SEQ_PKG_NucProtSetHasTitle
@ eErr_SEQ_FEAT_IllegalDbXref
@ eErr_GENERIC_SgmlPresentInText
@ eErr_SEQ_FEAT_BadAnticodonAA
@ eErr_SEQ_FEAT_MissingCDSproduct
@ eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap
@ eErr_SEQ_FEAT_TranslExceptAndRnaEditing
@ eErr_GENERIC_BarcodeTooManyNs
@ eErr_SEQ_PKG_BioseqSetClassNotSet
@ eErr_SEQ_DESCR_NoOrgFound
@ eErr_SEQ_FEAT_MissingProteinName
@ eErr_SEQ_DESCR_BadPCRPrimerSequence
@ eErr_SEQ_FEAT_GeneXrefWithoutGene
@ eErr_SEQ_DESCR_TransgenicProblem
@ eErr_SEQ_PKG_MissingSetTitle
@ eErr_SEQ_FEAT_InvalidQualifierValue
@ eErr_SEQ_FEAT_GeneOntologyTermMissingGOID
@ eErr_SEQ_FEAT_ProtRefHasNoData
@ eErr_SEQ_GRAPH_GraphSeqLocLen
@ eErr_SEQ_DESCR_InvalidForType
@ eErr_SEQ_DESCR_LatLonValue
@ eErr_SEQ_FEAT_TransLen
@ eErr_SEQ_FEAT_FeatureCitationProblem
@ eErr_SEQ_DESCR_IdenticalInstitutionCode
@ eErr_SEQ_PKG_ImproperlyNestedSets
@ eErr_SEQ_INST_UnknownLengthGapNot100
@ eErr_SEQ_FEAT_WrongQualOnFeature
@ eErr_SEQ_FEAT_MultipleProtRefs
@ eErr_SEQ_FEAT_MultipleEquivPublications
@ eErr_SEQ_PKG_SeqSubmitWithWgsSet
@ eErr_SEQ_PKG_InconsistentMoltypeSet
@ eErr_SEQ_INST_ConflictingBiomolTech
@ eErr_SEQ_FEAT_MissingQualOnImpFeat
@ eErr_SEQ_PKG_INSDRefSeqPackaging
@ eErr_SEQ_FEAT_LocusCollidesWithLocusTag
@ eErr_SEQ_PKG_GPSnonGPSPackaging
@ eErr_SEQ_DESCR_BadCollectionDate
@ eErr_SEQ_FEAT_MultipleEquivBioSources
@ eErr_SEQ_FEAT_CDSwithNoMRNAOverlap
@ eErr_SEQ_DESCR_BadInstitutionCode
@ eErr_SEQ_FEAT_PeptideFeatOutOfFrame
@ eErr_SEQ_FEAT_ProteinNameHasPMID
@ eErr_SEQ_FEAT_RepeatRegionNeedsNote
@ eErr_SEQ_DESCR_BadAltitude
@ eErr_SEQ_FEAT_GeneXrefStrandProblem
@ eErr_SEQ_FEAT_MissingTrnaAA
@ eErr_GENERIC_NonAsciiAsn
@ eErr_SEQ_FEAT_CDSwithMultipleMRNAs
@ eErr_SEQ_FEAT_CollidingFeatureIDs
@ eErr_SEQ_DESCR_IncorrectlyFormattedVoucherID
@ eErr_SEQ_FEAT_OrfCdsHasProduct
@ eErr_SEQ_FEAT_ImproperBondLocation
@ eErr_SEQ_PKG_GraphPackagingProblem
@ eErr_SEQ_INST_OverlappingDeltaRange
@ eErr_SEQ_FEAT_BadTranssplicedInterval
@ eErr_SEQ_INST_SeqLocLength
@ eErr_SEQ_DESCR_MultipleTaxonIDs
@ eErr_SEQ_DESCR_BadKeyword
@ eErr_SEQ_FEAT_UnknownImpFeatKey
@ eErr_SEQ_DESCR_Inconsistent
@ eErr_SEQ_PKG_ArchaicFeatureLocation
@ eErr_GENERIC_BadDate
@ eErr_GENERIC_BarcodeTestFails
@ eErr_SEQ_FEAT_NestedSeqLocMix
@ eErr_SEQ_FEAT_ShortIntron
@ eErr_SEQ_FEAT_UnknownFeatureQual
@ eErr_SEQ_DESCR_MultipleChromosomes
@ eErr_SEQ_FEAT_Range
@ eErr_SEQ_FEAT_InconsistentGeneOntologyTermAndId
@ eErr_SEQ_PKG_MisplacedMolInfo
@ eErr_GENERIC_EmbeddedScript
@ eErr_GENERIC_BarcodeTestPasses
@ eErr_SEQ_GRAPH_GraphAbove
@ eErr_SEQ_FEAT_FeatureInsideGap
@ eErr_SEQ_FEAT_DifferntIdTypesInSeqLoc
@ eErr_SEQ_FEAT_BadFullLengthFeature
@ eErr_SEQ_FEAT_RNAtype0
@ eErr_SEQ_FEAT_BadCharInAuthorName
@ eErr_SEQ_FEAT_FarLocation
@ eErr_SEQ_INST_BadHTGSeq
@ eErr_SEQ_FEAT_InvalidFuzz
@ eErr_SEQ_FEAT_InvalidInferenceValue
@ eErr_SEQ_FEAT_GeneXrefNeeded
@ eErr_SEQ_INST_UnexpectedIdentifierChange
@ eErr_SEQ_FEAT_InconsistentRRNAstrands
@ eErr_SEQ_PKG_ArchaicFeatureProduct
@ eErr_SEQ_DESCR_MultipleSourceQualifiers
@ eErr_SEQ_FEAT_BadRRNAcomponentOverlap
@ eErr_SEQ_FEAT_BadTrailingCharacter
@ eErr_SEQ_DESCR_WrongVoucherType
@ eErr_SEQ_INST_ProteinsHaveGeneralID
@ eErr_SEQ_GRAPH_GraphOutOfOrder
@ eErr_SEQ_FEAT_BadInternalCharacter
@ eErr_SEQ_DESCR_NoSourceDescriptor
@ eErr_SEQ_DESCR_BadCollectionCode
@ eErr_SEQ_FEAT_BadProteinName
@ eErr_SEQ_FEAT_FeatureProductInconsistency
@ eErr_GENERIC_PublicationInconsistency
@ eErr_GENERIC_BadSubmissionAuthorName
@ eErr_GENERIC_CollidingSerialNumbers
@ eErr_SEQ_PKG_ComponentMissingTitle
@ eErr_SEQ_DESCR_DBLinkMissingUserObject
@ eErr_SEQ_PKG_InternalGenBankSet
@ eErr_SEQ_DESCR_BioSourceMissing
@ eErr_SEQ_FEAT_BadAnticodonCodon
@ eErr_SEQ_FEAT_BadTrailingHyphen
@ eErr_SEQ_FEAT_OldLocusTagMismtach
@ eErr_SEQ_DESCR_MolInfoConflictsWithBioSource
@ eErr_SEQ_FEAT_UTRdoesNotAbutCDS
@ eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct
@ eErr_SEQ_FEAT_ConflictFlagSet
@ eErr_SEQ_FEAT_StrandOther
@ eErr_SEQ_PKG_FeaturePackagingProblem
@ eErr_SEQ_DESCR_MultipleNames
@ eErr_SEQ_INST_BadSeqIdFormat
@ eErr_SEQ_PKG_GenomicProductPackagingProblem
@ eErr_INTERNAL_Exception
@ eErr_SEQ_FEAT_BadEcNumberFormat
@ eErr_SEQ_FEAT_CDSproductPackagingProblem
@ eErr_SEQ_FEAT_RedundantFields
@ eErr_SEQ_INST_InternalNsInSeqRaw
@ eErr_SEQ_DESCR_BadOrgMod
@ eErr_SEQ_INST_TerminalNs
@ eErr_SEQ_DESCR_BadOrganelleLocation
@ eErr_SEQ_FEAT_NoNameForProtein
@ eErr_SEQ_FEAT_RptUnitRangeProblem
@ eErr_SEQ_FEAT_SeqLocOrder
@ eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem
@ eErr_SEQ_FEAT_CDSmRNAXrefLocationProblem
@ eErr_SEQ_PKG_SingleItemSet
@ eErr_SEQ_DESCR_BioSourceNeedsChromosome
@ eErr_SEQ_FEAT_VectorContamination
@ eErr_SEQ_FEAT_AbuttingIntervals
@ eErr_SEQ_FEAT_CDSrange
@ eErr_SEQ_FEAT_LocusTagProblem
@ eErr_SEQ_DESCR_BioSourceInconsistency
@ eErr_SEQ_FEAT_OnlyGeneXrefs
@ eErr_SEQ_FEAT_TranslExcept
@ eErr_SEQ_INST_InternalGapsInSeqRaw
@ eErr_SEQ_FEAT_GeneRefHasNoData
@ eErr_SEQ_INST_DuplicateSegmentReferences
@ eErr_SEQ_FEAT_TooManyInferenceAccessions
@ eErr_SEQ_FEAT_TerminalXDiscrepancy
@ eErr_SEQ_FEAT_MiscFeatureNeedsNote
@ eErr_SEQ_DESCR_CollidingPublications
@ eErr_SEQ_FEAT_GenomeSetMixedStrand
@ eErr_SEQ_FEAT_BadCharInAuthorLastName
@ eErr_SEQ_FEAT_HypotheticalProteinMismatch
@ eErr_SEQ_INST_TpaAssemblyProblem
@ eErr_SEQ_FEAT_MissingGeneXref
AutoPtr –.
Definition: ncbimisc.hpp:401
CAlign_CI –.
Definition: align_ci.hpp:63
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
CAuthor –.
Definition: Author.hpp:59
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
static void GetPubdescLabels(const CPubdesc &pd, vector< TEntrezId > &pmids, vector< TEntrezId > &muids, vector< int > &serials, vector< string > &published_labels, vector< string > &unpublished_labels)
For Publication Citations Get labels for a pubdesc.
Definition: cleanup.cpp:3140
Definition: Dbtag.hpp:53
bool GetDBFlags(bool &is_refseq, bool &is_src, string &correct_caps) const
Definition: Dbtag.cpp:327
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
CFeat_CI –.
Definition: feat_ci.hpp:64
void Clear()
Definition: gene_cache.hpp:89
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
Definition: gene_cache.cpp:106
void GetLabel(string *label) const
Definition: Gene_ref.cpp:57
CGraph_CI –.
Definition: graph_ci.hpp:234
CMappedFeat –.
Definition: mapped_feat.hpp:59
@Name_std.hpp User-defined methods of the data storage class.
Definition: Name_std.hpp:56
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CObjectManager –.
const string & GetDivision(void) const
Definition: Org_ref.cpp:164
bool IsSetDivision(void) const
Definition: Org_ref.cpp:159
@ eContent
Definition: Pub.hpp:66
@Pubdesc.hpp User-defined methods of the data storage class.
Definition: Pubdesc.hpp:54
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
ESubtype GetSubtype(void) const
static bool RequireLocationIntervalsInBiologicalOrder(ESubtype subtype)
static bool AllowAdjacentIntervals(ESubtype subtype)
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
@ eContent
Definition: Seq_entry.hpp:93
void GetLabel(string *label, ELabelType type) const
Definition: Seq_entry.cpp:274
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
CSubmit_block –.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
Definition: tse_handle.cpp:205
TSeq_feat_Handles GetFeaturesWithId(CSeqFeatData::E_Choice type, TFeatureIdInt id) const
Definition: tse_handle.cpp:604
CScope & GetScope(void) const
Returns scope.
Definition: tse_handle.hpp:325
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Thrown on an attempt to write unassigned data member.
Definition: exception.hpp:84
static string GetFeatureBioseqLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorContent(const CSeqdesc &ds)
static string GetFeatureLocationLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetFeatureProductLocLabel(const CSeq_feat &ft, CRef< CScope > scope, bool suppress_context)
static string GetDescriptorLabel(const CSeqdesc &ds, const CSeq_entry &ctx, CRef< CScope > scope, bool suppress_context)
static string GetFeatureContentLabel(const CSeq_feat &feat, CRef< CScope > scope)
static string GetFeatureIdLabel(const CSeq_feat &ft)
static string GetBioseqSetLabel(const CBioseq_set &st, CRef< CScope > scope, bool suppress_context)
void ValidateSeqAlign(const CSeq_align &align, int order=-1)
void ValidateSeqAnnot(const CSeq_annot_Handle &annot)
CCacheImpl & GetCache()
virtual ~CValidError_base()
static CSeq_entry_Handle GetAppropriateXrefParent(CSeq_entry_Handle seh)
CValidError_imp & m_Imp
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
CValidError_base(CValidError_imp &imp)
void ValidateBioseq(const CBioseq &seq)
bool GetTSAConflictingBiomolTechErrors(const CBioseq &seq)
bool GetTSANStretchErrors(const CBioseq &seq)
void ValidateBioseqSet(const CBioseq_set &seqset)
void ValidateSeqDesc(const CSeqdesc &desc, const CSeq_entry &ctx)
Validate descriptors as stand alone objects (no context)
void SetScope(CScope &scope)
void SetTSE(CSeq_entry_Handle seh)
bool GetTSACDSOnMinusStrandErrors(const CSeq_feat &feat, const CBioseq &seq)
static bool GetPrefixAndAccessionFromInferenceAccession(string inf_accession, string &prefix, string &accession)
void ValidateSeqFeat(const CSeq_feat &feat)
static vector< string > GetAccessionsFromInferenceString(string inference, string &prefix, string &remainder, bool &same_species)
void ValidateSeqGraph(const CSeq_graph &graph)
void x_ReportInvalidFuzz(const CPacked_seqint &packed_int, const CSerialObject &obj)
CRef< CObjectManager > m_ObjMgr
bool IsGED() const
void SetScope(const CSeq_entry &se)
void FindCollidingSerialNumbers(const CSerialObject &obj)
Definition: valid_pub.cpp:1323
const CSeq_entry_Handle & GetTSEH()
static bool BadMultipleSequenceLocation(const CSeq_loc &loc, CScope &scope)
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:375
static bool IsTSAIntermediate(const CBioseq &seq)
void x_CheckPackedInt(const CPacked_seqint &packed_int, SLocCheck &lc, const CSerialObject &obj)
static bool IsInOrganelleSmallGenomeSet(const CSeq_id &id, CScope &scope)
bool IsNC() const
const CBioSourceKind & BioSourceKind() const
bool IsNS() const
CRef< CScope > m_Scope
bool HasGiOrAccnVer() const
void SetTSE(const CSeq_entry_Handle &seh)
const SValidatorContext & GetContext() const
Definition: validatorp.cpp:210
CValidator::TProgressCallback m_PrgCallback
IValidError * m_ErrRepository
bool IsPDB() const
CConstRef< CSeq_feat > GetmRNAGivenProduct(const CBioseq &seq)
bool IsValidateAlignments() const
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id)
Definition: validatorp.cpp:254
void ValidateCitations(const CSeq_entry_Handle &seh)
bool DoesAnyFeatLocHaveGI() const
void FindNonAsciiText(const CSerialObject &obj)
void AddBioseqWithNoBiosource(const CBioseq &seq)
void ValidateSeqLocIds(const CSeq_loc &loc, const CSerialObject &obj)
bool GenerateGoldenFile() const
bool IsStandaloneAnnot() const
void x_DoBarcodeTests(CSeq_entry_Handle seh)
CConstRef< CSeq_annot > m_SeqAnnot
TSuppressed & SetSuppressed()
Definition: validatorp.cpp:195
bool IsNM() const
bool DoesAnyProductLocHaveGI() const
bool GetTSAConflictingBiomolTechErrors(const CSeq_entry_Handle &se)
bool x_IsSuppressed(CValidErrItem::TErrIndex errType) const
Definition: validatorp.cpp:368
void x_AddValidErrItem(EDiagSev sev, EErrType type, const string &msg, const string &desc, const CSerialObject &obj, const string &accession, const int version)
unique_ptr< CValidatorEntryInfo > m_pEntryInfo
void x_Init(Uint4 options, size_t initialInferenceCount, bool notJustLocalOrGeneral, bool hasRefSeq)
Definition: validatorp.cpp:181
bool IsNT() const
bool IsGenbank() const
void PostObjErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
TSuppressed m_SuppressedErrors
bool IsNZ() const
void Setup(const CSeq_entry_Handle &seh)
bool Validate(const CSeq_entry &se, const CCit_sub *cs=nullptr, CScope *scope=nullptr)
static bool IsWGSIntermediate(const CBioseq &seq)
CValidator::CProgressInfo m_PrgInfo
void ValidateDbxref(const CDbtag &xref, const CSerialObject &obj, bool biosource=false, const CSeq_entry *ctx=nullptr)
bool IsSerialNumberInComment(const string &comment)
bool IsGenomic() const
void ValidateTaxonomy(const CSeq_entry &se)
bool IsFarSequence(const CSeq_id &id)
Definition: validatorp.cpp:240
const CTSE_Handle & GetTSE_Handle()
size_t m_NumMisplacedFeatures