NCBI C++ ToolKit
validerror_desc.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_desc.cpp 101299 2023-11-28 18:18:38Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  * validation of seq_desc
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <corelib/ncbiapp.hpp>
42 
52 
53 #include <objects/seq/Seqdesc.hpp>
54 #include <objects/seq/MolInfo.hpp>
56 
58 
59 #include <util/utf8.hpp>
60 
63 BEGIN_SCOPE(validator)
64 
65 
67  CValidError_base(imp)
68 {
69 }
70 
71 
73 {
74 }
75 
76 
// Produce a printable rendering of src for use in diagnostics.
// Every byte whose unsigned value lies in [32, 127] is kept as-is; every
// other byte (control characters and bytes >= 128) is shown as '#'.
// The result always has the same length as src.
static string s_AsciiString(const string& src)
{
    string printable(src);

    for (char& slot : printable) {
        const unsigned char code = static_cast<unsigned char>(slot);
        const bool keep = (code >= 32 && code <= 127);
        if (!keep) {
            slot = '#';
        }
    }

    return printable;
}
92 
93 
94 /**
95  * Validate descriptors as stand alone objects (no context)
96  **/
98  const CSeqdesc& desc,
99  const CSeq_entry& ctx)
100 {
101  m_Ctx.Reset(&ctx);
102 
103  // check for non-ascii characters
105 
106  for (; it; ++it) {
107  const string& str = *it;
109  const char& ch = *c_it;
110  unsigned char chu = ch;
111  if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
112  string txt = s_AsciiString(str);
114  "Non-ASCII character '" + NStr::NumericToString(chu) + "' found (" + txt + ")", ctx, desc);
115  break;
116  }
117  }
118  }
119 
120 
121  // switch on type, e.g., call ValidateBioSource, ValidatePubdesc, ...
122  switch ( desc.Which() ) {
123  case CSeqdesc::e_Modif: {
125  "Modif descriptor is obsolete", *m_Ctx, desc);
126  CSeqdesc::TModif::const_iterator it2 = desc.GetModif().begin();
127  while (it2 != desc.GetModif().end()) {
128  if (*it2 == eGIBB_mod_other) {
129  PostErr (eDiag_Error, eErr_SEQ_DESCR_Unknown, "GIBB-mod = other used", ctx, desc);
130  }
131  ++it2;
132  }
133  } break;
134 
137  "MolType descriptor is obsolete", *m_Ctx, desc);
138  break;
139 
140  case CSeqdesc::e_Method:
142  "Method descriptor is obsolete", *m_Ctx, desc);
143  break;
144 
145  case CSeqdesc::e_Comment:
146  ValidateComment(desc.GetComment(), desc);
147  break;
148 
149  case CSeqdesc::e_Pub:
150  m_Imp.ValidatePubdesc(desc.GetPub(), desc, &ctx);
151  break;
152 
153  case CSeqdesc::e_User:
154  ValidateUser(desc.GetUser(), desc);
155  break;
156 
157  case CSeqdesc::e_Source:
158  m_Imp.ValidateBioSource (desc.GetSource(), desc, &ctx);
159  break;
160 
161  case CSeqdesc::e_Molinfo:
162  ValidateMolInfo(desc.GetMolinfo(), desc);
163  break;
164 
165  case CSeqdesc::e_not_set:
166  break;
167  case CSeqdesc::e_Name:
168  if (NStr::IsBlank (desc.GetName())) {
170  "Name descriptor needs text", ctx, desc);
171  }
172  break;
173  case CSeqdesc::e_Title:
174  ValidateTitle(desc.GetTitle(), desc, ctx);
175  break;
176  case CSeqdesc::e_Org:
178  "OrgRef descriptor is obsolete", *m_Ctx, desc);
179  break;
180  case CSeqdesc::e_Num:
181  break;
182  case CSeqdesc::e_Maploc:
183  break;
184  case CSeqdesc::e_Pir:
185  break;
186  case CSeqdesc::e_Genbank:
187  break;
188  case CSeqdesc::e_Region:
189  if (NStr::IsBlank (desc.GetRegion())) {
191  "Region descriptor needs text", ctx, desc);
192  }
193  break;
194  case CSeqdesc::e_Sp:
195  break;
196  case CSeqdesc::e_Dbxref:
197  break;
198  case CSeqdesc::e_Embl:
199  break;
201  {
202  int rval = CheckDate (desc.GetCreate_date(), true);
203  if (rval != eDateValid_valid) {
204  m_Imp.PostBadDateError (eDiag_Error, "Create date has error", rval, desc, &ctx);
205  }
206  }
207  break;
209  {
210  int rval = CheckDate (desc.GetUpdate_date(), true);
211  if (rval != eDateValid_valid) {
212  m_Imp.PostBadDateError (eDiag_Error, "Update date has error", rval, desc, &ctx);
213  }
214  }
215  break;
216  case CSeqdesc::e_Prf:
217  break;
218  case CSeqdesc::e_Pdb:
219  break;
220  case CSeqdesc::e_Het:
221  break;
222  default:
223  break;
224  }
225 
226  m_Ctx.Reset();
227 }
228 
229 
231  const string& comment,
232  const CSeqdesc& desc)
233 {
234  if ( m_Imp.IsSerialNumberInComment(comment) ) {
236  "Comment may refer to reference by serial number - "
237  "attach reference specific comments to the reference "
238  "REMARK instead.", *m_Ctx, desc);
239  }
240  if (NStr::IsBlank (comment)) {
242  "Comment descriptor needs text", *m_Ctx, desc);
243  } else {
244  if (NStr::Find (comment, "::") != string::npos) {
246  "Comment may be formatted to look like a structured comment.", *m_Ctx, desc);
247  }
248  }
249 }
250 
251 
252 void CValidError_desc::ValidateTitle(const string& title, const CSeqdesc& desc, const CSeq_entry& ctx)
253 {
254  if (NStr::IsBlank(title)) {
256  "Title descriptor needs text", ctx, desc);
257  } else {
258  if (s_StringHasPMID(title)) {
260  "Title descriptor has internal PMID", ctx, desc);
261  }
262  string cpy = title;
264  char end = cpy.c_str()[cpy.length() - 1];
265 
266  if (end == '.' && cpy.length() > 4) {
267  end = cpy.c_str()[cpy.length() - 2];
268  }
269  if (end == ','
270  || end == '.'
271  || end == ';'
272  || end == ':') {
274  "Title descriptor ends in bad punctuation", ctx, desc);
275  }
276  if (!m_Imp.IsRefSeq() && NStr::FindNoCase(title, "RefSeq") != string::npos) {
277  PostErr(eDiag_Error, eErr_SEQ_FEAT_RefSeqInText, "Definition line contains 'RefSeq'", ctx, desc);
278  }
279  }
280 }
281 
282 
284 {
285  EDiagSev sev = eDiag_Error;
286  switch (severity) {
288  sev = eDiag_Info;
289  break;
291  sev = eDiag_Info;
292  break;
294  sev = eDiag_Warning;
295  break;
297  sev = eDiag_Error;
298  break;
300  sev = eDiag_Critical;
301  break;
303  sev = eDiag_Fatal;
304  break;
305  }
306  return sev;
307 }
308 
309 
311 {
312  if (!f1->IsSetLabel()) return true;
313  if (!f2->IsSetLabel()) return false;
314  return f1->GetLabel().Compare(f2->GetLabel()) < 0;
315 }
316 
317 
319 {
321 
322  if (NStr::Find(msg, "is not a valid value") != string::npos) {
324  } else if (NStr::Find(msg, "field is out of order") != string::npos) {
326  } else if (NStr::StartsWith(msg, "Required field")) {
328  } else if (NStr::Find(msg, "is not a valid field name") != string::npos
329  || NStr::Find(msg, "field without label") != string::npos) {
331  } else if (NStr::StartsWith(msg, "Multiple values")) {
333  } else if (NStr::StartsWith(msg, "Structured comment field")) {
335  }
336 
337  return et;
338 }
339 
340 
342  const CUser_object& usr,
343  const CSeqdesc& desc,
344  const CComment_rule& rule,
345  bool report)
346 {
347  bool is_valid = true;
348 
349  CComment_rule::TErrorList errors = rule.IsValid(usr);
350  if (errors.size() > 0) {
351  is_valid = false;
352  if (report) {
353  x_ReportStructuredCommentErrors(desc, errors);
354  }
355  }
356  return is_valid;
357 }
358 
359 
361 {
362  ITERATE(CComment_rule::TErrorList, it, errors) {
363  EErrType et = s_GetErrTypeFromString(it->second);
364  EDiagSev sev = s_ErrorLevelFromFieldRuleSev(it->first);
367  sev = eDiag_Error;
368  }
369  PostErr(sev, et, it->second, *m_Ctx, desc);
370  }
371 }
372 
373 
375 {
376  bool is_valid = true;
377 
379  if (errors.size() > 0) {
380  is_valid = false;
381  if (report) {
382  x_ReportStructuredCommentErrors(desc, errors);
383  }
384  }
385  return is_valid;
386 }
387 
388 
// Structured-comment prefixes accepted by s_IsAllowedPrefix().
// The table is scanned linearly, so entry order does not matter.
// It is read-only data, hence const.
static const string s_OfficialPrefixList[] = {
    "Assembly-Data",
    "BWP:1.0",
    "EpifluData",
    "Evidence-Data",
    "Evidence-For-Name-Assignment",
    "FluData",
    "Genome-Annotation-Data",
    "Genome-Assembly-Data",
    "GISAID_EpiFlu(TM)Data",
    "HCVDataBaseData",
    "HIVDataBaseData",
    "HumanSTR",
    "International Barcode of Life (iBOL)Data",
    "MIENS-Data",
    "MIGS-Data",
    "MIGS:3.0-Data",
    "MIGS:4.0-Data",
    "MIMARKS:3.0-Data",
    "MIMARKS:4.0-Data",
    "MIMS-Data",
    "MIMS:3.0-Data",
    "MIMS:4.0-Data",
    "MIGS:5.0-Data",
    "MIMAG:5.0-Data",
    "MIMARKS:5.0-Data",
    "MIMS:5.0-Data",
    "MISAG:5.0-Data",
    "MIUVIG:5.0-Data",
    "RefSeq-Attributes",
    "SIVDataBaseData",
    "SymbiotaSpecimenReference",
    "Taxonomic-Update-Statistics",
};
423 
424 static bool s_IsAllowedPrefix(const string& val)
425 {
426  for (size_t i = 0; i < ArraySize(s_OfficialPrefixList); ++i) {
428  return true;
429  }
430  }
431  return false;
432 }
433 
434 
436 {
437  if (!usr.IsSetData()) {
438  return false;
439  }
440  ITERATE(CUser_object::TData, it, usr.GetData()) {
441  if ((*it)->IsSetLabel() && (*it)->GetLabel().IsStr() &&
442  NStr::EqualNocase((*it)->GetLabel().GetStr(), "Assembly Name") &&
443  (*it)->IsSetData() && (*it)->GetData().IsStr()) {
444  const string& val = (*it)->GetData().GetStr();
445  if (NStr::StartsWith(val, "NCBI", NStr::eNocase) ||
446  NStr::StartsWith(val, "GenBank", NStr::eNocase)) {
447  return true;
448  }
449  }
450  }
451  return false;
452 }
453 
454 
456 {
457  if (!desc.IsUser()) {
458  return false;
459  }
460  const bool report = false;
461  return x_ValidateStructuredComment(desc.GetUser(), desc, report);
462 }
463 
464 
466  const CSeqdesc& desc,
467  bool report)
468 {
469  return x_ValidateStructuredComment(desc.GetUser(), desc, report);
470 }
471 
473  const string& prefix,
474  const CSeqdesc& desc,
475  bool report)
476 {
477  if (!s_IsAllowedPrefix(prefix)) {
478  if (report) {
479  string report_prefix = CComment_rule::GetStructuredCommentPrefix(desc.GetUser(), false);
481  report_prefix + " is not a valid value for StructuredCommentPrefix", *m_Ctx, desc);
482  }
483  return false;
484  }
485 
486  return true;
487 }
488 
490  const string& prefix,
491  const CUser_field& suffix,
492  const CSeqdesc& desc,
493  bool report)
494 { // The suffix may be empty. However, If it isn't empty, it must match the prefix.
495  if (!suffix.IsSetData() || !suffix.GetData().IsStr()) {
496  return true;
497  }
498 
499  string report_sfx = suffix.GetData().GetStr();
500  string sfx = report_sfx;
502 
503  if (NStr::IsBlank(sfx) || NStr::Equal(sfx, prefix)) {
504  return true;
505  }
506 
507  if (report) {
509  "StructuredCommentSuffix '" + report_sfx + "' does not match prefix", *m_Ctx, desc);
510  }
511 
512  return false;
513 }
514 
515 
517  const CComment_rule& rule,
518  const CSeqdesc& desc,
519  bool report)
520 {
521  if (rule.GetRequire_order()) {
522  return ValidateStructuredComment(desc.GetUser(), desc, rule, report);
523  }
524 
526  tmp.Assign(desc.GetUser());
527  auto& fields = tmp.SetData();
528  stable_sort (fields.begin(), fields.end(), s_UserFieldCompare);
529  return ValidateStructuredComment(tmp, desc, rule, report);
530 }
531 
532 
534  const CUser_object& usr,
535  const CSeqdesc& desc,
536  bool report)
537 {
538  if (!usr.IsSetType() || !usr.GetType().IsStr()
539  || !NStr::EqualCase(usr.GetType().GetStr(), "StructuredComment")) {
540  return false;
541  }
542 
543  bool is_valid = true;
544  if (!usr.IsSetData() || usr.GetData().size() == 0) {
545  if (report) {
547  "Structured Comment user object descriptor is empty", *m_Ctx, desc);
548  is_valid = false;
549  } else {
550  return false;
551  }
552  }
553 
555  if (NStr::IsBlank(prefix)) {
556  if (report) {
558  "Structured Comment lacks prefix and/or suffix", *m_Ctx, desc);
559  }
560  is_valid &= ValidateStructuredCommentGeneric(usr, desc, report);
561  return is_valid;
562  }
563 
564  // Has a prefix
566  if (!report && !is_valid) {
567  return false;
568  }
569 
570  try {
572  if (comment_rules) {
573  const bool isV2Prefix =
574  (prefix == "HumanSTR" && usr.HasField("Bracketed record seq.", ""));
575  const string queryPrefix = isV2Prefix ? "HumanSTRv2" : prefix;
576  CConstRef<CComment_rule> pRule = comment_rules->FindCommentRuleEx(queryPrefix);
577  if (pRule) {
578  is_valid &= x_ValidateStructuredCommentUsingRule(*pRule, desc, report);
579  } else {
580  // no rule for this prefix
581  is_valid &= ValidateStructuredCommentGeneric(usr, desc, report);
582  }
583  if (!report && !is_valid) {
584  return false;
585  }
586  }
587 
588  if (auto pSuffix = usr.GetFieldRef("StructuredCommentSuffix"); pSuffix) {
589  is_valid &= x_ValidateStructuredCommentSuffix(prefix, *pSuffix, desc, report);
590  if (!report && !is_valid) {
591  return false;
592  }
593  }
594  } catch (CException&) {
595  // no prefix, in which case no rules
596  // but it is still an error - should have prefix
597  is_valid = false;
598  if (report) {
600  "Structured Comment lacks prefix and/or suffix", *m_Ctx, desc);
601  ValidateStructuredCommentGeneric(usr, desc, true);
602  } else {
603  return false;
604  }
605  }
606  if (NStr::Equal(prefix, "Genome-Assembly-Data") && HasBadGenomeAssemblyName(usr)) {
607  is_valid = false;
608  if (report) {
610  "Assembly Name should not start with 'NCBI' or 'GenBank' in structured comment", *m_Ctx, desc);
611  } else {
612  return false;
613  }
614  }
615  if (report && !is_valid && !NStr::IsBlank(prefix)) {
617  "Structured Comment invalid; the field value and/or name are incorrect", *m_Ctx, desc);
618  }
619  return is_valid;
620 }
621 
// Return true when str is NOT a well-formed BioSample accession.
//
// Accepted shape: "SAM", then one of 'E', 'N' or 'D', then digits only.
// When the fourth character is 'E', a single letter may precede the digits.
// Strings shorter than five characters are always rejected.
//
// Note: "SAME" plus exactly one letter (e.g. "SAMEA") is accepted, because
// no digit is required after the optional letter; this matches the
// historical behaviour and is preserved here.
static bool x_IsBadBioSampleFormat(const string& str)
{
    if (str.length() < 5) return true;

    if (str[0] != 'S' || str[1] != 'A' || str[2] != 'M') return true;
    if (str[3] != 'E' && str[3] != 'N' && str[3] != 'D') return true;

    // Index of the first character that must be a digit.
    size_t first_digit = 4;

    // <cctype> classifiers require a value representable as unsigned char;
    // passing a negative plain char (non-ASCII byte) is undefined behaviour.
    if (str[3] == 'E' && isalpha(static_cast<unsigned char>(str[4]))) {
        ++first_digit;
    }

    for (size_t i = first_digit; i < str.length(); ++i) {
        if (!isdigit(static_cast<unsigned char>(str[i]))) return true;
    }

    return false;
}
649 
// Return true when str does NOT match the alternative BioSample form:
// the literal prefix "SRS" followed by digits only, with a total length
// of at least nine characters.
static bool x_IsNotAltBioSampleFormat(const string& str)
{
    if (str.length() < 9) return true;

    if (str[0] != 'S' || str[1] != 'R' || str[2] != 'S') return true;

    for (size_t i = 3; i < str.length(); ++i) {
        // Cast first: isdigit() on a negative plain char is undefined.
        if (!isdigit(static_cast<unsigned char>(str[i]))) return true;
    }

    return false;
}
668 
// Return true when str is NOT a well-formed Sequence Read Archive accession.
//
// Accepted shape: first character 'S', 'D' or 'E', then two uppercase
// letters, then digits only, with a total length of at least nine.
static bool x_IsBadSRAFormat(const string& str)
{
    if (str.length() < 9) return true;

    const char lead = str[0];
    if (lead != 'S' && lead != 'D' && lead != 'E') return true;

    // <cctype> classifiers need an unsigned-char value; a negative plain
    // char (non-ASCII byte) would be undefined behaviour.
    if (!isupper(static_cast<unsigned char>(str[1]))) return true;
    if (!isupper(static_cast<unsigned char>(str[2]))) return true;

    for (size_t i = 3; i < str.length(); ++i) {
        if (!isdigit(static_cast<unsigned char>(str[i]))) return true;
    }

    return false;
}
690 
// Return true when str is NOT a well-formed BioProject accession.
//
// Accepted shape: "PRJ", then one of 'E', 'N' or 'D', then 'A' or 'B',
// then digits only, with a total length of at least six (so at least one
// digit is present).
static bool x_IsBadBioProjectFormat(const string& str)
{
    if (str.length() < 6) return true;

    if (str[0] != 'P' || str[1] != 'R' || str[2] != 'J') return true;
    if (str[3] != 'E' && str[3] != 'N' && str[3] != 'D') return true;
    if (str[4] != 'A' && str[4] != 'B') return true;

    for (size_t i = 5; i < str.length(); ++i) {
        // Cast first: isdigit() on a negative plain char is undefined.
        if (!isdigit(static_cast<unsigned char>(str[i]))) return true;
    }

    return false;
}
712 
// Canonical spellings of the DBLink field labels whose capitalization is
// checked in ValidateDblink(). Read-only data, hence const.
static const string s_legalDblinkNames[] = {
    "Trace Assembly Archive",
    "ProbeDB",
    "Assembly",
    "BioSample",
    "Sequence Read Archive",
    "BioProject"
};
721 
723  const CUser_object& usr,
724  const CSeqdesc& desc,
725  bool report)
726 {
727  bool is_valid = true;
728  if (!usr.IsSetType() || !usr.GetType().IsStr()
729  || !NStr::EqualCase(usr.GetType().GetStr(), "DBLink")) {
730  return false;
731  }
732  if (!usr.IsSetData() || usr.GetData().size() == 0) {
733  if (report) {
735  "DBLink user object descriptor is empty", *m_Ctx, desc);
736  }
737  return false;
738  }
739 
740  FOR_EACH_USERFIELD_ON_USEROBJECT(ufd_it, usr) {
741  const CUser_field& fld = **ufd_it;
742  if (FIELD_IS_SET_AND_IS(fld, Label, Str)) {
743  const string &label_str = GET_FIELD(fld.GetLabel(), Str);
744  if (NStr::EqualNocase(label_str, "BioSample")) {
745  if (fld.IsSetData()) {
746  const auto& fdata = fld.GetData();
747  if (fdata.IsStrs()) {
748  const CUser_field::C_Data::TStrs& strs = fdata.GetStrs();
749  ITERATE(CUser_field::C_Data::TStrs, st_itr, strs) {
750  const string& str = *st_itr;
754  "Bad BioSample format - " + str, *m_Ctx, desc);
755  } else {
757  "Old BioSample format - " + str, *m_Ctx, desc);
758  }
759  }
760  }
761  } else if (fdata.IsStr()) {
762  const string& str = fdata.GetStr();
766  "Bad BioSample format - " + fdata.GetStr(), *m_Ctx, desc);
767  } else {
769  "Old BioSample format - " + fdata.GetStr(), *m_Ctx, desc);
770  }
771  }
772  }
773  }
774  } else if (NStr::EqualNocase(label_str, "Sequence Read Archive")) {
775  if (fld.IsSetData() && fld.GetData().IsStrs()) {
776  const CUser_field::C_Data::TStrs& strs = fld.GetData().GetStrs();
777  ITERATE(CUser_field::C_Data::TStrs, st_itr, strs) {
778  const string& str = *st_itr;
779  if (x_IsBadSRAFormat (str)) {
781  "Bad Sequence Read Archive format - " + str, *m_Ctx, desc);
782  }
783  }
784  }
785  } else if (NStr::EqualNocase(label_str, "BioProject")) {
786  if (fld.IsSetData() && fld.GetData().IsStrs()) {
787  const CUser_field::C_Data::TStrs& strs = fld.GetData().GetStrs();
788  ITERATE(CUser_field::C_Data::TStrs, st_itr, strs) {
789  const string& str = *st_itr;
792  "Bad BioProject format - " + str, *m_Ctx, desc);
793  }
794  }
795  }
796  } else if (NStr::EqualNocase(label_str, "Trace Assembly Archive")) {
797  if (fld.IsSetData() && fld.GetData().IsStrs()) {
798  const CUser_field::C_Data::TStrs& strs = fld.GetData().GetStrs();
799  ITERATE(CUser_field::C_Data::TStrs, st_itr, strs) {
800  const string& str = *st_itr;
801  if ( ! NStr::StartsWith (str, "TI", NStr::eNocase) ) {
803  "Trace Asssembly Archive accession " + str + " does not begin with TI prefix", *m_Ctx, desc);
804  }
805  }
806  }
807  }
808 
809  for ( size_t i = 0; i < sizeof(s_legalDblinkNames) / sizeof(string); ++i) {
810  if (NStr::EqualNocase (label_str, s_legalDblinkNames[i]) && ! NStr::EqualCase (label_str, s_legalDblinkNames[i])) {
812  "Bad DBLink capitalization - " + label_str, *m_Ctx, desc);
813  }
814  }
815  }
816  }
817 
818  return is_valid;
819 }
820 
821 
823  const CUser_object& usr,
824  const CSeqdesc& desc)
825 {
826  if ( !usr.CanGetType() ) {
828  "User object with no type", *m_Ctx, desc);
829  return;
830  }
831  const CObject_id& oi = usr.GetType();
832  if ( !oi.IsStr() && !oi.IsId() ) {
834  "User object with no type", *m_Ctx, desc);
835  return;
836  }
837  if ( !usr.IsSetData() || usr.GetData().size() == 0) {
838  if (! NStr::EqualNocase(oi.GetStr(), "NcbiAutofix") && ! NStr::EqualNocase(oi.GetStr(), "Unverified")) {
840  "User object with no data", *m_Ctx, desc);
841  }
842  }
843  if ( usr.IsRefGeneTracking()) {
844  bool has_ref_track_status = false;
845  ITERATE(CUser_object::TData, field, usr.GetData()) {
846  if ( (*field)->CanGetLabel() ) {
847  const CObject_id& obj_id = (*field)->GetLabel();
848  if ( !obj_id.IsStr() ) {
849  continue;
850  }
851  if ( NStr::CompareNocase(obj_id.GetStr(), "Status") == 0 ) {
852  has_ref_track_status = true;
853  if ((*field)->IsSetData() && (*field)->GetData().IsStr()) {
856  "RefGeneTracking object has illegal Status '"
857  + (*field)->GetData().GetStr() + "'",
858  *m_Ctx, desc);
859  }
860  }
861  }
862  }
863  }
864  if ( !has_ref_track_status ) {
866  "RefGeneTracking object needs to have Status set", *m_Ctx, desc);
867  }
868  } else if ( usr.IsStructuredComment()) {
869  x_ValidateStructuredComment(usr, desc);
870  } else if ( usr.IsDBLink()) {
871  ValidateDblink(usr, desc);
872  }
873 }
874 
875 
876 // for MolInfo validation that does not rely on contents of sequence
878  const CMolInfo& minfo,
879  const CSeqdesc& desc)
880 {
881  if ( !minfo.IsSetBiomol() || minfo.GetBiomol() == CMolInfo::eBiomol_unknown) {
883  "Molinfo-biomol unknown used", *m_Ctx, desc);
884  }
885 
886  if(minfo.IsSetTech() && minfo.GetTech() == CMolInfo::eTech_tsa)
887  {
888  string p;
889  int bm;
890 
891  if(!minfo.IsSetBiomol())
893  else
894  bm = minfo.GetBiomol();
895 
897  p = "unknown";
898  else if(bm == CMolInfo::eBiomol_genomic)
899  p = "genomic";
900  else if(bm == CMolInfo::eBiomol_pre_RNA)
901  p = "pre-RNA";
902  else if(bm == CMolInfo::eBiomol_tRNA)
903  p = "tRNA";
904  else if(bm == CMolInfo::eBiomol_snRNA)
905  p = "snRNA";
906  else if(bm == CMolInfo::eBiomol_scRNA)
907  p = "scRNA";
908  else if(bm == CMolInfo::eBiomol_peptide)
909  p = "peptide";
911  p = "other-genetic";
913  p = "genomic-mRNA";
914  else if(bm == CMolInfo::eBiomol_cRNA)
915  p = "cRNA";
916  else if(bm == CMolInfo::eBiomol_snoRNA)
917  p = "snoRNA";
918  else if(bm == CMolInfo::eBiomol_tmRNA)
919  p = "tmRNA";
920  else if(bm == CMolInfo::eBiomol_other)
921  p = "other";
922  else
923  p.clear();
924 
925  if(!p.empty())
927  "Biomol \"" + p + "\" is not appropriate for sequences that use the TSA technique.",
928  *m_Ctx, desc);
929  }
930 }
931 
932 
933 END_SCOPE(validator)
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
EErrType
@ eErr_SEQ_DESCR_DBLinkBadBioSample
@ eErr_SEQ_DESCR_DBLinkBadCapitalization
@ eErr_SEQ_DESCR_BadStrucCommInvalidSuffix
@ eErr_SEQ_DESCR_BadAssemblyName
@ eErr_SEQ_DESCR_StrucCommMissingUserObject
@ eErr_SEQ_DESCR_BadStrucCommInvalidFieldName
@ eErr_SEQ_DESCR_BadStrucCommInvalidFieldValue
@ eErr_SEQ_DESCR_Unknown
@ eErr_SEQ_DESCR_SerialInComment
@ eErr_SEQ_DESCR_BadPunctuation
@ eErr_SEQ_DESCR_RefGeneTrackingIllegalStatus
@ eErr_SEQ_DESCR_TitleMissingText
@ eErr_SEQ_DESCR_InvalidForType
@ eErr_SEQ_DESCR_MoltypeUnknown
@ eErr_GENERIC_NonAsciiAsn
@ eErr_SEQ_DESCR_TitleHasPMID
@ eErr_SEQ_FEAT_RefSeqInText
@ eErr_SEQ_DESCR_BadStrucCommMultipleFields
@ eErr_SEQ_DESCR_StrucCommMissingPrefixOrSuffix
@ eErr_SEQ_DESCR_WrongBiomolForTSA
@ eErr_SEQ_DESCR_BadStrucCommMissingField
@ eErr_SEQ_DESCR_DBLinkBadFormat
@ eErr_SEQ_DESCR_UserObjectNoType
@ eErr_SEQ_DESCR_DBLinkMissingUserObject
@ eErr_SEQ_DESCR_BadStrucCommFieldOutOfOrder
@ eErr_SEQ_DESCR_BadStrucCommInvalidPrefix
@ eErr_SEQ_DESCR_DBLinkBadBioProject
@ eErr_SEQ_DESCR_DBLinkBadSRAaccession
@ eErr_SEQ_DESCR_MissingText
@ eErr_SEQ_DESCR_FakeStructuredComment
@ eErr_SEQ_DESCR_UserObjectNoData
@ eErr_SEQ_DESCR_RegionMissingText
@ eErr_SEQ_DESCR_RefGeneTrackingWithoutStatus
@ eErr_SEQ_DESCR_CommentMissingText
static TRefTrackStatus GetRefTrackStatus(const CUser_object &uo, string *st=0)
static TErrorList CheckGeneralStructuredComment(const CUser_object &user)
vector< TError > TErrorList
TErrorList IsValid(const CUser_object &user) const
static void NormalizePrefix(string &prefix)
static string GetStructuredCommentPrefix(const CUser_object &user, bool normalize=true)
static CConstRef< CComment_set > GetCommentRules()
CConstRef –.
Definition: ncbiobj.hpp:1266
int Compare(const CObject_id &oid2) const
Definition: Object_id.cpp:145
Definition: Seq_entry.hpp:56
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
CConstRef< CUser_field > GetFieldRef(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Definition: User_object.cpp:84
bool IsRefGeneTracking() const
bool IsDBLink() const
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
bool IsStructuredComment() const
CValidError_imp & m_Imp
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
static bool IsWGSMaster(const CBioseq &seq, CScope &scope)
bool x_ValidateStructuredComment(const CUser_object &usr, const CSeqdesc &desc, bool report=true)
CConstRef< CSeq_entry > m_Ctx
void ValidateSeqDesc(const CSeqdesc &desc, const CSeq_entry &ctx)
Validate descriptors as stand alone objects (no context)
void ValidateTitle(const string &title, const CSeqdesc &desc, const CSeq_entry &ctx)
void x_ReportStructuredCommentErrors(const CSeqdesc &desc, const CComment_rule::TErrorList &errors)
void ValidateMolInfo(const CMolInfo &minfo, const CSeqdesc &desc)
~CValidError_desc() override
bool x_ValidateStructuredCommentPrefix(const string &prefix, const CSeqdesc &desc, bool report)
void ValidateUser(const CUser_object &usr, const CSeqdesc &desc)
bool ValidateStructuredCommentGeneric(const CUser_object &usr, const CSeqdesc &desc, bool report)
bool IsValidStructuredComment(const CSeqdesc &desc)
void ValidateComment(const string &comment, const CSeqdesc &desc)
bool ValidateStructuredCommentInternal(const CSeqdesc &desc, bool report=true)
bool ValidateDblink(const CUser_object &usr, const CSeqdesc &desc, bool report=true)
bool x_ValidateStructuredCommentUsingRule(const CComment_rule &rule, const CSeqdesc &desc, bool report)
bool x_ValidateStructuredCommentSuffix(const string &prefix, const CUser_field &suffix, const CSeqdesc &desc, bool report)
bool ValidateStructuredComment(const CUser_object &usr, const CSeqdesc &desc, const CComment_rule &rule, bool report)
bool IsSerialNumberInComment(const string &comment)
bool IsRefSeq() const
void PostBadDateError(EDiagSev sv, const string &msg, int flags, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void ValidateBioSource(const CBioSource &bsrc, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void ValidatePubdesc(const CPubdesc &pub, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
Definition: valid_pub.cpp:77
static bool is_valid(const char *num, int type, CONV_RESULT *cr)
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
CS_CONTEXT * ctx
Definition: t0006.c:12
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
#define FOR_EACH_USERFIELD_ON_USEROBJECT(Itr, Var)
FOR_EACH_USERFIELD_ON_USEROBJECT EDIT_EACH_USERFIELD_ON_USEROBJECT.
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Fatal
Fatal error – guarantees exit(or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool CanGetType(void) const
Check if it is safe to call GetType method.
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
bool IsStrs(void) const
Check if variant Strs is selected.
const TStrs & GetStrs(void) const
Get the variant data.
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TData & GetData(void) const
Get the Data member data.
bool IsSetLabel(void) const
field label Check if a value has been assigned to Label data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TData & GetData(void) const
Get the Data member data.
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
vector< CStringUTF8 > TStrs
vector< CRef< CUser_field > > TData
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
const TUpdate_date & GetUpdate_date(void) const
Get the variant data.
Definition: Seqdesc_.cpp:494
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
const TPub & GetPub(void) const
Get the variant data.
Definition: Seqdesc_.cpp:356
bool IsSetBiomol(void) const
Check if a value has been assigned to Biomol data member.
Definition: MolInfo_.hpp:422
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
const TModif & GetModif(void) const
Get the variant data.
Definition: Seqdesc_.hpp:965
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seqdesc_.hpp:903
const TCreate_date & GetCreate_date(void) const
Get the variant data.
Definition: Seqdesc_.cpp:472
const TComment & GetComment(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1058
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
const TName & GetName(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1012
const TRegion & GetRegion(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1108
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_snoRNA
small nucleolar RNA
Definition: MolInfo_.hpp:112
@ eBiomol_genomic_mRNA
reported as a mix of genomic and cDNA sequence
Definition: MolInfo_.hpp:110
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
@ eGIBB_mod_other
Definition: GIBB_mod_.hpp:92
@ e_Embl
EMBL specific information.
Definition: Seqdesc_.hpp:127
@ e_Het
cofactor, etc associated but not bound
Definition: Seqdesc_.hpp:132
@ e_Org
if all from one organism
Definition: Seqdesc_.hpp:116
@ e_Num
a numbering system
Definition: Seqdesc_.hpp:118
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Update_date
date of last update
Definition: Seqdesc_.hpp:129
@ e_Pub
a reference to the publication
Definition: Seqdesc_.hpp:122
@ e_Pir
PIR specific info.
Definition: Seqdesc_.hpp:120
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Prf
PRF specific information.
Definition: Seqdesc_.hpp:130
@ e_Mol_type
type of molecule
Definition: Seqdesc_.hpp:111
@ e_Sp
SWISSPROT specific info.
Definition: Seqdesc_.hpp:125
@ e_Dbxref
xref to other databases
Definition: Seqdesc_.hpp:126
@ e_Comment
a more extensive comment
Definition: Seqdesc_.hpp:117
@ e_Method
sequencing method
Definition: Seqdesc_.hpp:113
@ e_Region
overall region (globin locus)
Definition: Seqdesc_.hpp:123
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Modif
modifiers
Definition: Seqdesc_.hpp:112
@ e_Maploc
map location of this sequence
Definition: Seqdesc_.hpp:119
@ e_Create_date
date entry first created/released
Definition: Seqdesc_.hpp:128
@ e_Title
a title for this sequence
Definition: Seqdesc_.hpp:115
@ e_Pdb
PDB specific information.
Definition: Seqdesc_.hpp:131
@ e_not_set
No variant selected.
Definition: Seqdesc_.hpp:110
@ e_Name
a name for this sequence
Definition: Seqdesc_.hpp:114
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
TRequire_order GetRequire_order(void) const
Get the Require_order member data.
ESeverity_level
Access to ESeverity_level's attributes (values, names) as defined in spec.
@ eSeverity_level_error
@ eSeverity_level_none
@ eSeverity_level_fatal
@ eSeverity_level_warning
@ eSeverity_level_info
@ eSeverity_level_reject
int i
#include<zmmintrin.h>
Definition: bm.h:78
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int isupper(Uchar c)
Definition: ncbictype.hpp:70
bool s_StringHasPMID(const string &str)
Definition: utilities.cpp:727
@ eDateValid_valid
Definition: utilities.hpp:123
int CheckDate(const CDate &date, bool require_full_date=false)
Definition: utilities.cpp:780
static const char * suffix[]
Definition: pcregrep.c:408
static const char * prefix[]
Definition: pcregrep.c:405
#define FIELD_IS_SET_AND_IS(Var, Fld, Chs)
FIELD_IS_SET_AND_IS base macro.
#define GET_FIELD(Var, Fld)
GET_FIELD base macro.
#define FOR_EACH_CHAR_IN_STRING(Itr, Var)
FOR_EACH_CHAR_IN_STRING / EDIT_EACH_CHAR_IN_STRING.
static bool x_IsBadBioSampleFormat(const string &str)
static string s_AsciiString(const string &src)
EErrType s_GetErrTypeFromString(const string &msg)
static bool x_IsBadSRAFormat(const string &str)
static bool s_IsAllowedPrefix(const string &val)
static string s_legalDblinkNames[]
bool HasBadGenomeAssemblyName(const CUser_object &usr)
static string s_OfficialPrefixList[]
bool s_UserFieldCompare(const CRef< CUser_field > &f1, const CRef< CUser_field > &f2)
static EDiagSev s_ErrorLevelFromFieldRuleSev(CField_rule::TSeverity severity)
static bool x_IsBadBioProjectFormat(const string &str)
static bool x_IsNotAltBioSampleFormat(const string &str)
Modified on Thu Apr 25 08:17:30 2024 by modify_doxy.py rev. 669887