NCBI C++ ToolKit
single_feat_validator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: single_feat_validator.cpp 101704 2024-01-29 18:17:37Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  * validation of Seq_feat
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
42 
43 #include <serial/serialbase.hpp>
44 
45 #include <objmgr/seqdesc_ci.hpp>
48 #include <util/sgml_entity.hpp>
50 #include <atomic>
51 
54 BEGIN_SCOPE(validator)
55 using namespace sequence;
56 
58  : m_Feat(feat), m_Scope(scope), m_Imp(imp), m_ProductIsFar(false)
59 {
60 
61 }
62 
63 
65 {
66  if (!m_Feat.IsSetLocation()) {
68  "The feature is missing a location");
69  return;
70  }
71 
73  bool lowerSev = false;
75  if ( m_Feat.IsSetDbxref() ) {
77  const CDbtag& dbtag = **it;
78  if ( dbtag.GetDb() == "dbSNP" ) {
79  lowerSev = true;
80  }
81  }
82  }
83  }
85  (m_Feat.GetData().IsGene() || !m_Imp.IsGpipe()),
86  "Location", m_Feat, lowerSev);
87 
90 
91  if (m_Feat.IsSetProduct()) {
94  PostErr(eDiag_Error, eErr_SEQ_FEAT_SelfReferentialProduct, "Self-referential feature product");
95  }
97  }
98 
100 
102 
106  }
107 
109 
111 
112  if (m_Feat.IsSetQual()) {
113  for (auto it = m_Feat.GetQual().begin(); it != m_Feat.GetQual().end(); it++) {
114  x_ValidateGbQual(**it);
115  }
116  }
117 
119 
120  if (m_Feat.IsSetExp_ev() && m_Feat.GetExp_ev() > 0 &&
121  !x_HasNamedQual("inference") &&
122  !x_HasNamedQual("experiment") &&
125  "Inference or experiment qualifier missing but obsolete experimental evidence qualifier set");
126  }
127 
129 
132 
134 
136 
138 
140 }
141 
142 void CSingleFeatValidator::PostErr(EDiagSev sv, EErrType et, const string& msg)
143 {
144  m_Imp.PostErr(sv, et, msg, m_Feat);
145 }
146 
147 
150 {
151  if (loc.IsInt() || loc.IsWhole()) {
152  return m_Scope.GetBioseqHandle(loc);
153  }
154  CBioseq_Handle rval;
156  for (CSeq_loc_CI citer(loc); citer; ++citer) {
157  const CSeq_id& this_id = citer.GetSeq_id();
158  if (!prev || !prev->Equals(this_id)) {
159  rval = m_Scope.GetBioseqHandle(this_id);
160  if (rval) {
161  break;
162  }
163  prev.Reset(&this_id);
164  }
165  }
166  return rval;
167 }
168 
169 
171 {
172  if (!m_Feat.IsSetProduct()) {
173  return;
174  }
175  const CSeq_id& sid = GetId(m_Feat.GetProduct(), &m_Scope);
176 
177  switch (sid.Which()) {
178  case CSeq_id::e_Genbank:
179  case CSeq_id::e_Embl:
180  case CSeq_id::e_Ddbj:
181  case CSeq_id::e_Tpg:
182  case CSeq_id::e_Tpe:
183  case CSeq_id::e_Tpd:
184  {
185  const CTextseq_id* tsid = sid.GetTextseq_Id();
186  if (tsid) {
187  if (!tsid->CanGetAccession() && tsid->CanGetName()) {
188  if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
190  "Feature product should not put an accession in the Textseq-id 'name' slot");
191  } else {
193  "Feature product should not use "
194  "Textseq-id 'name' slot");
195  }
196  }
197  }
198  }
199  break;
200 
201  default:
202  break;
203  }
204 
205  if (m_ProductBioseq) {
207 
208  for (auto id : m_ProductBioseq.GetCompleteBioseq()->GetId()) {
209  if (id->Which() == sid.Which()) {
210  // check to make sure capitalization is the same
211  string from_seq = id->AsFastaString();
212  string from_loc = sid.AsFastaString();
213  if (!NStr::EqualCase(from_seq, from_loc) &&
214  NStr::EqualNocase(from_seq, from_loc)) {
216  "Capitalization change from product location on feature to product sequence");
217  }
218  }
219  switch (id->Which()) {
220  case CSeq_id::e_Genbank:
221  case CSeq_id::e_Embl:
222  case CSeq_id::e_Ddbj:
223  case CSeq_id::e_Tpg:
224  case CSeq_id::e_Tpe:
225  case CSeq_id::e_Tpd:
226  {
227  const CTextseq_id* tsid = id->GetTextseq_Id();
228  if (tsid) {
229  if (!tsid->IsSetAccession() && tsid->IsSetName()) {
230  if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
232  "Protein bioseq has Textseq-id 'name' that "
233  "looks like it is derived from a nucleotide "
234  "accession");
235  } else {
237  "Protein bioseq has Textseq-id 'name' and no accession");
238  }
239  }
240  }
241  }
242  break;
243  default:
244  break;
245  }
246  }
247  }
248 }
249 
250 
252 {
253  // check for bond locations - only allowable in bond feature and under special circumstances for het
254  bool is_seqloc_bond = false;
255  if (feat.IsSetData()) {
256  if (feat.GetData().IsHet()) {
257  // heterogen can have mix of bonds with just "a" point specified */
258  for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
259  if (it.GetEmbeddingSeq_loc().IsBond()
260  && (!it.GetEmbeddingSeq_loc().GetBond().IsSetA()
261  || it.GetEmbeddingSeq_loc().GetBond().IsSetB())) {
262  is_seqloc_bond = true;
263  break;
264  }
265  }
266  } else if (!feat.GetData().IsBond()) {
267  for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
268  if (it.GetEmbeddingSeq_loc().IsBond()) {
269  is_seqloc_bond = true;
270  break;
271  }
272  }
273  }
274  } else {
275  for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
276  if (it.GetEmbeddingSeq_loc().IsBond()) {
277  is_seqloc_bond = true;
278  break;
279  }
280  }
281  }
282  return is_seqloc_bond;
283 }
284 
285 
287 {
289  return;
290  }
291  bool both, both_rev;
292  x_LocHasStrandBoth(m_Feat.GetLocation(), both, both_rev);
293  if (both || both_rev) {
294  string suffix;
295  if (both && both_rev) {
296  suffix = "(forward and reverse)";
297  } else if (both) {
298  suffix = "(forward)";
299  } else if (both_rev) {
300  suffix = "(reverse)";
301  }
302 
304 
306  label + " may not be on both " + suffix + " strands");
307  }
308 }
309 
310 
311 void CSingleFeatValidator::x_LocHasStrandBoth(const CSeq_loc& loc, bool& both, bool& both_rev)
312 {
313  both = false;
314  both_rev = false;
315  for (CSeq_loc_CI it(loc); it; ++it) {
316  if (it.IsSetStrand()) {
317  ENa_strand s = it.GetStrand();
318  if (s == eNa_strand_both && !both) {
319  both = true;
320  } else if (s == eNa_strand_both_rev && !both_rev) {
321  both_rev = true;
322  }
323  }
324  if (both && both_rev) {
325  break;
326  }
327  }
328 }
329 
330 
331 bool HasGeneIdXref(const CMappedFeat& sf, const CObject_id& tag, bool& has_parent_gene_id)
332 {
333  has_parent_gene_id = false;
334  if (!sf.IsSetDbxref()) {
335  return false;
336  }
338  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID")) {
339  has_parent_gene_id = true;
340  if ((*it)->IsSetTag() && (*it)->GetTag().Equals(tag)) {
341  return true;
342  }
343  }
344  }
345  return false;
346 }
347 
348 
350 {
351  if (!m_Feat.IsSetDbxref()) {
352  return;
353  }
354 
355  // no tse, no feat-handle
356  auto tse = m_Imp.GetTSE_Handle();
357  if (!tse) {
358  return;
359  }
360 
361  CRef<feature::CFeatTree> feat_tree;
364  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID") &&
365  (*it)->IsSetTag()) {
366  if (!feat_tree) {
368  }
369  if (feat_tree) {
370  CMappedFeat parent = feat_tree->GetParent(mf);
371  while (parent) {
372  bool has_parent_gene_id = false;
373  if (!HasGeneIdXref(parent, (*it)->GetTag(), has_parent_gene_id)) {
374  if (has_parent_gene_id ||
377  "GeneID mismatch");
378  }
379  }
380  parent = feat_tree->GetParent(parent);
381  }
382  }
383  }
384  }
385 
386 }
387 
388 
390 {
391  if (!m_Feat.IsSetCit()) {
392  return;
393  }
394 
395  if (m_Feat.GetCit().IsPub()) {
397  if ((*pi)->IsEquiv()) {
399  "Citation on feature has unexpected internal Pub-equiv");
400  return;
401  }
402  }
403  }
404 }
405 
406 
/// Message fragments describing inference-qualifier problems.
///
/// The table is indexed by the integer value of an inference validation
/// result code, so the order of the entries must not be changed.
const string kInferenceMessage[] = {
    /*  0 */ "unknown error",
    /*  1 */ "empty inference string",
    /*  2 */ "bad inference prefix",
    /*  3 */ "bad inference body",
    /*  4 */ "single inference field",
    /*  5 */ "spaces in inference",
    /*  6 */ "possible comment in inference",
    /*  7 */ "same species misused",
    /*  8 */ "the value in the accession field is not legal. The only allowed value is accession.version, eg AF123456.1. Problem =",
    /*  9 */ "bad inference accession version",
    /* 10 */ "accession.version not public",
    /* 11 */ "bad accession type",
    /* 12 */ "unrecognized database",
};
422 
423 
425 {
426  if (!qual.IsSetQual()) {
427  return;
428  }
429  /* first check for anything other than replace */
430  if (!qual.IsSetVal() || NStr::IsBlank(qual.GetVal())) {
431  if (NStr::EqualNocase(qual.GetQual(), "replace")) {
432  /* ok for replace */
433  } else {
435  "Qualifier other than replace has just quotation marks");
436  if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
437  PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
438  }
439  }
440  if (NStr::EqualNocase(qual.GetQual(), "inference")) {
442  "Inference qualifier problem - empty inference string ()");
443  } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
444  PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPseudoQualifier, "/pseudogene value should not be empty");
445  }
446  } else if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
449  qual.GetVal() + " is not in proper EC_number format");
450  } else {
451  string ec_number = qual.GetVal();
454  switch (status) {
457  "EC_number " + ec_number + " was deleted");
458  break;
462  "EC_number " + ec_number + " was replaced");
463  break;
465  {
466  size_t pos = NStr::Find(ec_number, "n");
467  if (pos == string::npos || !isdigit(ec_number.c_str()[pos + 1])) {
469  ec_number + " is not a legal value for qualifier EC_number");
470  } else {
472  ec_number + " is not a legal preliminary value for qualifier EC_number");
473  }
474  }
475  break;
476  default:
477  break;
478  }
479  }
480  } else if (NStr::EqualNocase(qual.GetQual(), "inference")) {
481  /* TODO: Validate inference */
482  string val;
483  if (qual.IsSetVal()) {
484  val = qual.GetVal();
485  }
488  if (NStr::IsBlank(val)) {
489  val = "?";
490  }
492  "Inference qualifier problem - " + kInferenceMessage[(int)rsult] + " ("
493  + val + ")");
494  }
495  } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
499  "/pseudogene value should not be '" + qual.GetVal() + "'", m_Feat);
500  }
501  } else if (NStr::EqualNocase(qual.GetQual(), "number")) {
502  bool has_space = false;
503  bool has_char_after_space = false;
504  ITERATE(string, it, qual.GetVal()) {
505  if (isspace((unsigned char)(*it))) {
506  has_space = true;
507  } else if (has_space) {
508  // non-space after space
509  has_char_after_space = true;
510  break;
511  }
512  }
513  if (has_char_after_space) {
515  "Number qualifiers should not contain spaces");
516  }
517  }
518  if (qual.IsSetVal() && ContainsSgml(qual.GetVal())) {
520  "feature qualifier " + qual.GetVal() + " has SGML");
521  }
522 
523 }
524 
525 
527 {
528 
529  bool expected{true};
530  if (m_Imp.SetContext().CheckECNumFileStatus.compare_exchange_strong(expected,false)) {
533  "Unable to find EC number file 'ecnum_ambiguous.txt' in data directory");
534  }
537  "Unable to find EC number file 'ecnum_deleted.txt' in data directory");
538  }
541  "Unable to find EC number file 'ecnum_replaced.txt' in data directory");
542  }
545  "Unable to find EC number file 'ecnum_specific.txt' in data directory");
546  }
547  }
548 }
549 
550 
552 {
553  vector<TGoTermError> errors = GetGoTermErrors(m_Feat);
554  for (auto it : errors) {
556  it.first, it.second);
557  }
558 }
559 
560 
561 bool CSingleFeatValidator::x_HasNamedQual(const string& qual_name)
562 {
563  if (!m_Feat.IsSetQual()) {
564  return false;
565  }
567  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), qual_name)) {
568  return true;
569  }
570  }
571  return false;
572 }
573 
574 
576 {
577  if (!m_Feat.IsSetComment()) {
578  return;
579  }
580  const string& comment = m_Feat.GetComment();
581  if (m_Imp.IsSerialNumberInComment(comment)) {
583  "Feature comment may refer to reference by serial number - "
584  "attach reference specific comments to the reference "
585  "REMARK instead.", m_Feat);
586  }
587  if (ContainsSgml(comment)) {
589  "feature comment " + comment + " has SGML",
590  m_Feat);
591  }
592 }
593 
594 
596 {
597  unsigned int partial_prod = eSeqlocPartial_Complete,
598  partial_loc = eSeqlocPartial_Complete;
599 
600  bool is_partial = m_Feat.IsSetPartial() && m_Feat.GetPartial();
601  partial_loc = SeqLocPartialCheck(m_Feat.GetLocation(), &m_Scope);
602 
603  if (m_ProductBioseq) {
605  }
606 
607  if ((partial_loc != eSeqlocPartial_Complete) ||
608  (partial_prod != eSeqlocPartial_Complete) ||
609  is_partial) {
610 
611  // a feature on a partial sequence should be partial -- it often isn't
612  if (!is_partial &&
613  partial_loc != eSeqlocPartial_Complete &&
614  m_Feat.IsSetLocation() &&
615  m_Feat.GetLocation().IsWhole()) {
617  "On partial Bioseq, SeqFeat.partial should be TRUE");
618  }
619  // a partial feature, with complete location, but partial product
620  else if (is_partial &&
621  partial_loc == eSeqlocPartial_Complete &&
622  m_Feat.IsSetProduct() &&
623  m_Feat.GetProduct().IsWhole() &&
624  partial_prod != eSeqlocPartial_Complete) {
625  if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
626  // suppress in gpipe genomic
627  } else {
629  "When SeqFeat.product is a partial Bioseq, SeqFeat.location "
630  "should also be partial");
631  }
632  }
633  // gene on segmented set is now 'order', should also be partial
634  else if (m_Feat.GetData().IsGene() &&
635  !is_partial &&
636  partial_loc == eSeqlocPartial_Internal) {
638  "Gene of 'order' with otherwise complete location should "
639  "have partial flag set");
640  }
641  // inconsistent combination of partial/complete product,location,partial flag - part 1
642  else if (partial_prod == eSeqlocPartial_Complete && m_Feat.IsSetProduct()) {
643  // if not local bioseq product, lower severity
644  EDiagSev sev = eDiag_Warning;
645  bool is_far_fail = false;
647  sev = eDiag_Info;
649  is_far_fail = true;
650  }
651  }
652 
653  string str("Inconsistent: Product= complete, Location= ");
654  str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
655  str += "Feature.partial= ";
656  str += is_partial ? "TRUE" : "FALSE";
657  if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
658  // suppress for genomic gpipe
659  } else if (is_far_fail) {
661  } else {
663  }
664  }
665  // inconsistent combination of partial/complete product,location,partial flag - part 2
666  else if (partial_loc == eSeqlocPartial_Complete || !is_partial) {
667  string str("Inconsistent: ");
668  if (m_Feat.IsSetProduct()) {
669  str += "Product= ";
670  str += (partial_prod != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
671  }
672  str += "Location= ";
673  str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
674  str += "Feature.partial= ";
675  str += is_partial ? "TRUE" : "FALSE";
676  if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
677  // suppress for genomic gpipe
678  } else {
680  }
681  }
682  // 5' or 3' partial location giving unclassified partial product
683  else if ((((partial_loc & eSeqlocPartial_Start) != 0) ||
684  ((partial_loc & eSeqlocPartial_Stop) != 0)) &&
685  ((partial_prod & eSeqlocPartial_Other) != 0) &&
686  is_partial) {
688  "5' or 3' partial location should not have unclassified"
689  " partial in product molinfo descriptor");
690  }
691 
692  // note - in analogous C Toolkit function there is additional code for ensuring
693  // that partial intervals are partial at splice sites, gaps, or the ends of the
694  // sequence. This has been moved to CValidError_bioseq::ValidateFeatPartialInContext.
695  }
696 
697 }
698 
699 
701 {
702  if (x_HasSeqLocBond(m_Feat)) {
704  "Bond location should only be on bond features");
705  }
706 
707  // feature location should not be whole
708  if (m_Feat.GetLocation().IsWhole()) {
709  string prefix = "Feature";
710  if (m_Feat.IsSetData()) {
711  if (m_Feat.GetData().IsCdregion()) {
712  prefix = "CDS";
714  prefix = "mRNA";
715  }
716  }
717  PostErr(eDiag_Warning, eErr_SEQ_FEAT_WholeLocation, prefix + " may not have whole location");
718  }
719 
720  if (m_LocationBioseq) {
721  // look for mismatch in capitalization for IDs
722  CNcbiOstrstream os;
723  const CSeq_id *id = m_Feat.GetLocation().GetId();
724  if (id) {
725  id->WriteAsFasta(os);
726  string loc_id = CNcbiOstrstreamToString(os);
728  if ((*it)->IsGi() || (*it)->IsGibbsq() || (*it)->IsGibbmt()) {
729  continue;
730  }
731  CNcbiOstrstream os2;
732  (*it)->WriteAsFasta(os2);
733  string bs_id = CNcbiOstrstreamToString(os2);
734  if (NStr::EqualNocase(loc_id, bs_id) && !NStr::EqualCase(loc_id, bs_id)) {
736  "Sequence identifier in feature location differs in capitalization with identifier on Bioseq");
737  }
738  }
739  }
740  // look for protein features on the minus strand
744  "Feature on protein indicates negative strand");
745  }
746 
747  if (!m_Feat.GetData().IsImp()
748  || !m_Feat.GetData().GetImp().IsSetKey()
749  || !NStr::EqualNocase(m_Feat.GetData().GetImp().GetKey(), "gap")) {
750  try {
751  vector<TSeqPos> gap_starts;
752  size_t rval = x_CalculateLocationGaps(m_LocationBioseq, m_Feat.GetLocation(), gap_starts);
753  bool mostly_raw_ns = x_IsMostlyNs(m_Feat.GetLocation(), m_LocationBioseq);
754 
755  if ((rval & eLocationGapMostlyNs) || mostly_raw_ns) {
757  "Feature contains more than 50% Ns");
758  }
759  for (auto gap_start : gap_starts) {
761  "Feature begins or ends in gap starting at " + NStr::NumericToString(gap_start + 1));
762  }
763  if (rval & eLocationGapContainedInGap &&
766  "Feature inside sequence gap");
767  }
768  if (m_Feat.GetData().IsCdregion() || m_Feat.GetData().IsRna()) {
771  "Internal interval begins or ends in gap");
772  }
773  if (rval & eLocationGapCrossesUnknownGap) {
775  "Feature crosses gap of unknown length");
776  }
777  }
778  } catch (const CException &e) {
780  string("Exception while checking for intervals in gaps. EXCEPTION: ") +
781  e.what());
782  } catch (const std::exception&) {
783  }
784  }
785  }
786 
787 }
788 
789 
791 {
794  return true;
795  } else {
796  return false;
797  }
798 }
799 
800 
801 class CGapCache {
802 public:
803  CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh);
805  bool IsUnknownGap(size_t offset);
806  bool IsKnownGap(size_t offset);
807  bool IsGap(size_t offset);
808 
809 private:
810  typedef enum {
816  size_t m_NumUnknown;
817  size_t m_NumKnown;
818 };
819 
820 CGapCache::CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh)
821 {
822  TSeqPos start = loc.GetStart(eExtreme_Positional);
823  TSeqPos stop = loc.GetStop(eExtreme_Positional);
824  CRange<TSeqPos> range(start, stop);
826  TSeqPos pos = start;
827  while (map_iter && pos <= stop) {
828  TSeqPos map_end = map_iter.GetPosition() + map_iter.GetLength();
829  if (map_iter.GetType() == CSeqMap::eSeqGap) {
830  for (; pos < map_end && pos <= stop; pos++) {
831  if (map_iter.IsUnknownLength()) {
832  m_Map[pos - start] = eGapType_unknown;
833  m_NumUnknown++;
834  } else {
835  m_Map[pos - start] = eGapType_known;
836  m_NumKnown++;
837  }
838  }
839  } else {
840  pos = map_end;
841  }
842  ++map_iter;
843  }
844 }
845 
846 bool CGapCache::IsGap(size_t pos)
847 {
848  if (m_Map.find(pos) != m_Map.end()) {
849  return true;
850  } else {
851  return false;
852  }
853 }
854 
855 
856 bool CGapCache::IsKnownGap(size_t pos)
857 {
858  TGapTypeMap::iterator it = m_Map.find(pos);
859  if (it == m_Map.end()) {
860  return false;
861  } else if (it->second == eGapType_known) {
862  return true;
863  } else {
864  return false;
865  }
866 }
867 
868 
869 bool CGapCache::IsUnknownGap(size_t pos)
870 {
871  TGapTypeMap::iterator it = m_Map.find(pos);
872  if (it == m_Map.end()) {
873  return false;
874  } else if (it->second == eGapType_unknown) {
875  return true;
876  } else {
877  return false;
878  }
879 }
880 
881 
883 
884 {
885  if ( bsh.IsSetInst_Ext() ) {
886  const CBioseq_Handle::TInst_Ext& ext = bsh.GetInst_Ext();
887  if ( ext.IsDelta() ) {
888  ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
889  if ( (*it)->IsLoc() ) {
890  return false;
891  }
892  }
893  }
894  }
895  return true;
896 }
897 
898 
899 size_t CSingleFeatValidator::x_CalculateLocationGaps(CBioseq_Handle bsh, const CSeq_loc& loc, vector<TSeqPos>& gap_starts)
900 {
901  size_t rval = eLocationGapNoProblems;
902  if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst().GetRepr() != CSeq_inst::eRepr_delta) {
903  return rval;
904  }
905  // look for features inside gaps, crossing unknown gaps, or starting or ending in gaps
906  // ignore gap features for this
907  int num_n = 0;
908  int num_real = 0;
909  int num_gap = 0;
910  int num_unknown_gap = 0;
911  bool first_in_gap = false, last_in_gap = false;
912  bool local_first_gap = false, local_last_gap = false;
913  bool startsOrEndsInGap = false;
914  bool first = true;
915 
916  for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
917  CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
918  CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
919  if (!vec.empty()) {
920  CBioseq_Handle ph;
921  bool match = false;
922  for (auto id_it : bsh.GetBioseqCore()->GetId()) {
923  if (id_it->Equals(loc_it.GetSeq_id())) {
924  match = true;
925  break;
926  }
927  }
928  if (match) {
929  ph = bsh;
930  } else {
931  ph = bsh.GetScope().GetBioseqHandle(*this_loc);
932  }
933  try {
934  CGapCache gap_cache(*this_loc, ph);
935  string vec_data;
936  vec.GetSeqData(0, vec.size(), vec_data);
937 
938  local_first_gap = false;
939  local_last_gap = false;
940  TSeqLength len = loc_it.GetRange().GetLength();
941  ENa_strand strand = loc_it.GetStrand();
942 
943  size_t pos = 0;
944  string::iterator it = vec_data.begin();
945  while (it != vec_data.end() && pos < len) {
946  bool is_gap = false;
947  bool unknown_length = false;
948  if (strand == eNa_strand_minus) {
949  if (gap_cache.IsKnownGap(len - pos - 1)) {
950  is_gap = true;
951  } else if (gap_cache.IsUnknownGap(len - pos - 1)) {
952  is_gap = true;
953  unknown_length = true;
954  }
955  } else {
956  if (gap_cache.IsKnownGap(pos)) {
957  is_gap = true;
958  } else if (gap_cache.IsUnknownGap(pos)) {
959  is_gap = true;
960  unknown_length = true;
961  }
962 
963  }
964  if (is_gap) {
965  if (pos == 0) {
966  local_first_gap = true;
967  } else if (pos == len - 1) {
968  local_last_gap = true;
969  }
970  if (unknown_length) {
971  num_unknown_gap++;
972  } else {
973  num_gap++;
974  }
975  } else if (*it == 'N') {
976  num_n++;
977  } else {
978  num_real++;
979  }
980  ++it;
981  ++pos;
982  }
983  } catch (CException&/* ex*/) {
984  /*
985  PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
986  string("Exception while checking for intervals in gaps. EXCEPTION: ") +
987  ex.what(), feat);
988  */
989  }
990  }
991  if (first) {
992  first_in_gap = local_first_gap;
993  first = false;
994  }
995  last_in_gap = local_last_gap;
996  if (local_first_gap || local_last_gap) {
997  startsOrEndsInGap = true;
998  }
999  }
1000 
1001  if (num_real == 0 && num_n == 0) {
1002  TSeqPos start = loc.GetStart(eExtreme_Positional);
1003  TSeqPos stop = loc.GetStop(eExtreme_Positional);
1004  if ((start == 0 || CSeqMap_CI(bsh, SSeqMapSelector(), start - 1).GetType() != CSeqMap::eSeqGap)
1005  && (stop == bsh.GetBioseqLength() - 1 || CSeqMap_CI(bsh, SSeqMapSelector(), stop + 1).GetType() != CSeqMap::eSeqGap)) {
1007  }
1008  }
1009 
1010 
1011  if (num_gap == 0 && num_unknown_gap == 0 && num_n == 0) {
1012  // ignore features that do not cover any gap characters
1013  } else if (first_in_gap || last_in_gap) {
1014  if (num_real > 0) {
1015  TSeqPos gap_start = x_FindStartOfGap(bsh,
1016  first_in_gap ? loc.GetStart(eExtreme_Biological)
1017  : loc.GetStop(eExtreme_Biological), &(bsh.GetScope()));
1018  gap_starts.push_back(gap_start);
1019  } else {
1021  }
1022  } else if (num_real == 0 && num_gap == 0 && num_unknown_gap == 0 && num_n >= 50) {
1024  } else if (startsOrEndsInGap) {
1026  } else if (num_unknown_gap > 0) {
1028  }
1029 
1030  if (num_n > num_real && xf_IsDeltaLitOnly(bsh)) {
1031  rval |= eLocationGapMostlyNs;
1032  }
1033 
1034  return rval;
1035 }
1036 
1037 
1039 {
1040  if (!bsh || !bsh.IsNa() || !bsh.IsSetInst_Repr()
1042  || !bsh.GetInst().IsSetExt()
1043  || !bsh.GetInst().GetExt().IsDelta()) {
1044  return bsh.GetInst_Length();
1045  }
1046  TSeqPos offset = 0;
1047 
1049  TSeqPos len = 0;
1050  if ((*it)->IsLiteral()) {
1051  len = (*it)->GetLiteral().GetLength();
1052  } else if ((*it)->IsLoc()) {
1053  len = sequence::GetLength((*it)->GetLoc(), scope);
1054  }
1055  if (pos >= offset && pos < offset + len) {
1056  return offset;
1057  } else {
1058  offset += len;
1059  }
1060  }
1061  return offset;
1062 }
1063 
1064 
1066 {
1067  if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst_Repr() != CSeq_inst::eRepr_raw) {
1068  return false;
1069  }
1070  int num_n = 0;
1071  int real_bases = 0;
1072 
1073  for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
1074  CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
1075  CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
1076  if (!vec.empty()) {
1077  CBioseq_Handle ph;
1078  bool match = false;
1079  for (auto id_it : bsh.GetBioseqCore()->GetId()) {
1080  if (id_it->Equals(loc_it.GetSeq_id())) {
1081  match = true;
1082  break;
1083  }
1084  }
1085  if (match) {
1086  ph = bsh;
1087  } else {
1088  ph = bsh.GetScope().GetBioseqHandle(*this_loc);
1089  }
1091  string vec_data;
1092  try {
1093  vec.GetSeqData(0, vec.size(), vec_data);
1094 
1095  int pos = 0;
1096  string::iterator it = vec_data.begin();
1097  while (it != vec_data.end()) {
1098  if (*it == 'N') {
1099  CSeqMap_CI map_iter(ph, SSeqMapSelector(), offset + pos);
1100  if (map_iter.GetType() == CSeqMap::eSeqGap) {
1101  } else {
1102  num_n++;
1103  }
1104  } else {
1105  if ((unsigned)(*it + 1) <= 256 && isalpha(*it)) {
1106  real_bases++;
1107  }
1108  }
1109  ++it;
1110  ++pos;
1111  }
1112  } catch (const CException& ) {
1113  } catch (const std::exception& ) {
1114  }
1115  }
1116  }
1117 
1118  return (num_n > real_bases);
1119 }
1120 
1121 
1123 {
1124  CBioseq_Handle prot_handle;
1125  is_far = false;
1126  if (!m_Feat.IsSetProduct()) {
1127  return prot_handle;
1128  }
1129  const CSeq_id* protid = nullptr;
1130  try {
1131  protid = &sequence::GetId(m_Feat.GetProduct(), &m_Scope);
1132  } catch (CException&) {}
1133 
1134  if (!protid) {
1135  return prot_handle;
1136  }
1137 
1138  if (m_Imp.IsHugeFileMode()) {
1139  if (look_far && m_Imp.IsFarSequence(*protid)) {
1140  prot_handle = m_Scope.GetBioseqHandle(*protid);
1141  if (prot_handle) {
1142  is_far = true;
1143  }
1144  return prot_handle;
1145  }
1146  return m_Imp.GetLocalBioseqHandle(*protid);
1147  }
1148 
1149  // try "local" scope
1151  if (!prot_handle) {
1152  prot_handle = m_Scope.GetBioseqHandleFromTSE(*protid, m_Imp.GetTSE_Handle());
1153  }
1154  if (!prot_handle && look_far) {
1155  prot_handle = m_Scope.GetBioseqHandle(*protid);
1156  if (prot_handle) {
1157  is_far = true;
1158  }
1159  }
1160 
1161  return prot_handle;
1162 }
1163 
1164 
1166 {
1167  bool look_far = false;
1168 
1169  if (m_Feat.IsSetData()) {
1170  if (m_Feat.GetData().IsCdregion()) {
1171  look_far = m_Imp.IsFarFetchCDSproducts();
1172  } else if (m_Feat.GetData().IsRna()) {
1173  look_far = m_Imp.IsFarFetchMRNAproducts();
1174  } else {
1175  look_far = m_Imp.IsRemoteFetch();
1176  }
1177  }
1178 
1179  return x_GetFeatureProduct(look_far, is_far);
1180 }
1181 
1182 
1184 {
1186  (!m_Feat.IsSetExcept() || !m_Feat.GetExcept())) {
1188  "Exception text is present, but exception flag is not set");
1189  } else if (m_Feat.IsSetExcept() && m_Feat.GetExcept() &&
1192  "Exception flag is set, but exception text is empty");
1193  }
1194  if (m_Feat.IsSetExcept_text() && !m_Feat.GetExcept_text().empty()) {
1196  }
1197 }
1198 
1199 
1201 {
1202  if (text.empty()) return;
1203 
1204  EDiagSev sev = eDiag_Error;
1205  bool found = false;
1206 
1207  string str;
1208 
1209  bool reasons_in_cit = false;
1210  bool annotated_by_transcript_or_proteomic = false;
1211  bool redundant_with_comment = false;
1212  bool refseq_except = false;
1213  vector<string> exceptions;
1214  NStr::Split(text, ",", exceptions, 0);
1215  ITERATE(vector<string>, it, exceptions) {
1216  found = false;
1217  str = NStr::TruncateSpaces(*it);
1218  if (NStr::IsBlank(*it)) {
1219  continue;
1220  }
1222 
1223  if (found) {
1224  if (NStr::EqualNocase(str, "reasons given in citation")) {
1225  reasons_in_cit = true;
1226  } else if (NStr::EqualNocase(str, "annotated by transcript or proteomic data")) {
1227  annotated_by_transcript_or_proteomic = true;
1228  }
1229  }
1230  if (!found) {
1231  if (m_LocationBioseq) {
1232  bool check_refseq = false;
1233  if (m_Imp.IsRefSeqConventions()) {
1234  check_refseq = true;
1235  } else if (GetGenProdSetParent(m_LocationBioseq)) {
1236  check_refseq = true;
1237  } else {
1239  if ((*id_it)->IsOther()) {
1240  check_refseq = true;
1241  break;
1242  }
1243  }
1244  }
1245 
1246  if (check_refseq) {
1248  found = true;
1249  refseq_except = true;
1250  }
1251  }
1252  }
1253  }
1254  if (!found) {
1255  // lower to warning for genomic refseq
1256  const CSeq_id *id = m_Feat.GetLocation().GetId();
1257  if ((id && IsNTNCNWACAccession(*id)) ||
1259  sev = eDiag_Warning;
1260  }
1262  str + " is not a legal exception explanation");
1263  }
1264  if (m_Feat.IsSetComment() && NStr::Find(m_Feat.GetComment(), str) != string::npos) {
1265  if (!NStr::EqualNocase(str, "ribosomal slippage") &&
1266  !NStr::EqualNocase(str, "trans-splicing") &&
1267  !NStr::EqualNocase(str, "RNA editing") &&
1268  !NStr::EqualNocase(str, "artificial location")) {
1269  redundant_with_comment = true;
1270  } else if (NStr::EqualNocase(m_Feat.GetComment(), str)) {
1271  redundant_with_comment = true;
1272  }
1273  }
1274  }
1275  if (redundant_with_comment) {
1277  "Exception explanation text is also found in feature comment");
1278  }
1279  if (refseq_except) {
1280  bool found_just_the_exception = CSeq_feat::IsExceptionTextRefSeqOnly(str);
1281 
1282  if (!found_just_the_exception) {
1284  "Genome processing exception should not be combined with other explanations");
1285  }
1286  }
1287 
1288  if (reasons_in_cit && !m_Feat.IsSetCit()) {
1290  "Reasons given in citation exception does not have the required citation");
1291  }
1292  if (annotated_by_transcript_or_proteomic) {
1293  bool has_inference = false;
1295  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "inference")) {
1296  has_inference = true;
1297  break;
1298  }
1299  }
1300  if (!has_inference) {
1302  "Annotated by transcript or proteomic data exception does not have the required inference qualifier");
1303  }
1304  }
1305 }
1306 
1307 
// Name of the "orig_protein_id" qualifier. The GenBank-qualifier validation
// further down compares otherwise-unrecognized qualifier names against it.
1308 const string kOrigProteinId = "orig_protein_id";
1309 
1311 {
1312  if (!m_Feat.GetData().IsRna()) {
1313  return true;
1314  } else {
1315  return false;
1316  }
1317 }
1318 
1319 
1321 {
1322  if (!m_Feat.IsSetQual()) {
1323  return;
1324  }
1325  string key;
1326  bool is_imp = false;
1328 
1329  if (m_Feat.IsSetData() && m_Feat.GetData().IsImp()) {
1330  is_imp = true;
1331  key = m_Feat.GetData().GetImp().GetKey();
1332  if (ftype == CSeqFeatData::eSubtype_imp && NStr::EqualNocase (key, "gene")) {
1334  } else if (ftype == CSeqFeatData::eSubtype_imp) {
1336  } else if (ftype == CSeqFeatData::eSubtype_Imp_CDS
1338  || ftype == CSeqFeatData::eSubtype_org) {
1340  }
1341  }
1342  else {
1343  key = m_Feat.GetData().GetKey();
1344  if (NStr::Equal (key, "Gene")) {
1345  key = "gene";
1346  }
1347  }
1348 
1349  for (auto gbq : m_Feat.GetQual()) {
1350  const string& qual_str = gbq->GetQual();
1351 
1352  if ( NStr::Equal (qual_str, "gsdb_id")) {
1353  continue;
1354  }
1355  auto gbqual_and_value = CSeqFeatData::GetQualifierTypeAndValue(qual_str);
1356  auto gbqual = gbqual_and_value.first;
1357  bool same_case = (gbqual == CSeqFeatData::eQual_bad) || NStr::EqualCase(gbqual_and_value.second, qual_str);
1358 
1359  if ( !same_case ) {
1361  qual_str + " is improperly capitalized");
1362  }
1363 
1364  if ( gbqual == CSeqFeatData::eQual_bad ) {
1365  if (is_imp) {
1366  if (!gbq->IsSetQual() || NStr::IsBlank(gbq->GetQual())) {
1368  "NULL qualifier");
1369  }
1370  else {
1372  "Unknown qualifier " + qual_str);
1373  }
1374  } else if (NStr::Equal(qual_str, kOrigProteinId)) {
1375  if (x_ReportOrigProteinId()) {
1377  }
1378  } else {
1380  if (chs == CSeqFeatData::e_Gene) {
1381  if (NStr::Equal(qual_str, "gen_map")
1382  || NStr::Equal(qual_str, "cyt_map")
1383  || NStr::Equal(qual_str, "rad_map")) {
1384  continue;
1385  }
1386  } else if (chs == CSeqFeatData::e_Cdregion) {
1387  if (NStr::Equal(qual_str, "orig_transcript_id")) {
1388  continue;
1389  }
1390  } else if (chs == CSeqFeatData::e_Rna) {
1391  if (NStr::Equal(qual_str, "orig_transcript_id")) {
1392  continue;
1393  }
1394  }
1395  PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownFeatureQual, "Unknown qualifier " + qual_str);
1396  }
1397  } else {
1398  if ( ftype != CSeqFeatData::eSubtype_bad && !CSeqFeatData::IsLegalQualifier(ftype, gbqual) ) {
1401  "Wrong qualifier " + qual_str + " for feature " +
1402  key);
1403  }
1404  else if (ftype == CSeqFeatData::eSubtype_misc_feature &&
1405  gbqual == CSeqFeatData::eQual_feat_class && !m_Imp.IsRefSeq()) {
1407  "feat_class qualifier is only legal for RefSeq");
1408  }
1409 
1410  if (gbq->IsSetVal() && !NStr::IsBlank(gbq->GetVal())) {
1411  // validate value of gbqual
1412  const string& val = gbq->GetVal();
1413  switch (gbqual) {
1414 
1416  if (NStr::Find(val, ",") != NPOS) {
1418  "Compound '" + val + "' must be split into separate instances of qualifier " + qual_str);
1419  }
1422  val + " is not a legal value for qualifier " + qual_str);
1423  }
1424  break;
1425 
1428  break;
1429 
1432  break;
1433 
1436  break;
1437 
1440  break;
1441 
1443  if (is_imp) {
1444  x_ValidateReplaceQual(key, qual_str, val);
1445  }
1446  break;
1447 
1450  if (is_imp && !CGb_qual::IsLegalMobileElementValue(val)) {
1452  val + " is not a legal value for qualifier " + qual_str);
1453  }
1454  break;
1455 
1458  break;
1459 
1461  if (is_imp && ftype == CSeqFeatData::eSubtype_misc_feature
1462  && NStr::EqualCase (val, "Vector Contamination")) {
1464  "Vector Contamination region should be trimmed from sequence");
1465  }
1466  break;
1467 
1469  if (!is_imp) {
1471  if (chs == CSeqFeatData::e_Gene) {
1473  "A product qualifier is not used on a gene feature");
1474  }
1475  }
1476  break;
1477 
1478  // for VR-825
1481  "locus-tag values should be on genes");
1482  break;
1483  default:
1484  break;
1485  } // end of switch statement
1486  }
1487  }
1488  }
1489 }
1490 
1491 
// Validates a repeat-unit qualifier value against the feature location.
//   val - the qualifier value text
//   key - the feature key; the sequence checks below run only when it equals
//         "repeat_region" (compared case-insensitively)
// NOTE(review): this listing omits original lines 1523, 1528 and 1535 (the
// embedded numbering skips them). The declaration of `vec` and the opening
// of each error-reporting call presumably live there - confirm against the
// repository copy before changing this function.
1492  void CSingleFeatValidator::x_ValidateRptUnitVal (const string& val, const string& key)
1493 {
1494  bool /* found = false, */ multiple_rpt_unit = false;
// Flag the value as a compound/multiple unit when it contains any of
// '(' ')' ',' '.' or a digit. Characters <= ' ' are skipped here.
1495  ITERATE(string, it, val) {
1496  if ( *it <= ' ' ) {
1497  /* found = true; */
1498  } else if ( *it == '(' || *it == ')' ||
1499  *it == ',' || *it == '.' ||
1500  isdigit((unsigned char)(*it)) ) {
1501  multiple_rpt_unit = true;
1502  }
1503  }
1504  /*
1505  if ( found ||
1506  (!multiple_rpt_unit && val.length() > 48) ) {
1507  error = true;
1508  }
1509  */
// Only a single (non-compound) unit on a repeat_region is compared with the
// underlying sequence.
1510  if ( NStr::CompareNocase(key, "repeat_region") == 0 &&
1511  !multiple_rpt_unit ) {
1512  if (val.length() <= GetLength(m_Feat.GetLocation(), &m_Scope) ) {
// The sequence comparison is attempted only when every character of the
// value is one of ACGTN in either case.
1513  bool just_nuc_letters = true;
1514  static const string nuc_letters = "ACGTNacgtn";
1515  ITERATE(string, it, val) {
1516  if ( nuc_letters.find(*it) == NPOS ) {
1517  just_nuc_letters = false;
1518  break;
1519  }
1520  }
1521 
1522  if ( just_nuc_letters ) {
// `vec` is declared on the omitted line 1523. Its full data is fetched and
// searched case-insensitively for the value.
1524  if ( !vec.empty() ) {
1525  string vec_data;
1526  vec.GetSeqData(0, vec.size(), vec_data);
1527  if (NStr::FindNoCase (vec_data, val) == string::npos) {
1529  "repeat_region /rpt_unit and underlying "
1530  "sequence do not match");
1531  }
1532  }
1533  }
1534  } else {
// Reached when the value is longer than the feature location.
1536  "Length of rpt_unit_seq is greater than feature length");
1537  }
1538  }
1539 }
1540 
1541 
// Validates an /rpt_unit_seq qualifier value.
//   val - the qualifier value text
//   key - the feature key
// NOTE(review): this listing omits original lines 1545 and 1561 (the embedded
// numbering skips them). Judging by the neighbouring comment and message,
// 1545 presumably performs the validation shared with rpt_unit and 1561
// opens the error-reporting call - confirm against the repository copy.
1542  void CSingleFeatValidator::x_ValidateRptUnitSeqVal (const string& val, const string& key)
1543 {
1544  // do validation common to rpt_unit
1546 
1547  // do the validation specific to rpt_unit_seq
// Scan until the terminating NUL or the first illegal character. A character
// is illegal when it is below ' ' or is not one of: '(' ')' ',' ';', a digit,
// or a letter.
1548  const char *cp = val.c_str();
1549  bool badchars = false;
1550  while (*cp != 0 && !badchars) {
1551  if (*cp < ' ') {
1552  badchars = true;
1553  } else if (*cp != '(' && *cp != ')'
1554  && !isdigit (*cp) && !isalpha (*cp)
1555  && *cp != ',' && *cp != ';') {
1556  badchars = true;
1557  }
1558  cp++;
1559  }
1560  if (badchars) {
1562  "/rpt_unit_seq has illegal characters");
1563  }
1564 }
1565 
1566 
1567 static bool s_RptUnitIsBaseRange (string str, TSeqPos& from, TSeqPos& to)
1568 
1569 {
1570  if (str.length() > 25) {
1571  return false;
1572  }
1573  SIZE_TYPE pos = NStr::Find (str, "..");
1574  if (pos == string::npos) {
1575  return false;
1576  }
1577 
1578  int tmp_from, tmp_to;
1579  try {
1580  tmp_from = NStr::StringToInt (str.substr(0, pos));
1581  from = tmp_from;
1582  tmp_to = NStr::StringToInt (str.substr (pos + 2));
1583  to = tmp_to;
1584  } catch (const CException& ) {
1585  return false;
1586  } catch (const std::exception& ) {
1587  return false;
1588  }
1589  if (tmp_from < 0 || tmp_to < 0) {
1590  return false;
1591  }
1592  return true;
1593 }
1594 
1595 
1597 {
1598  TSeqPos from = kInvalidSeqPos, to = kInvalidSeqPos;
1599  if (!s_RptUnitIsBaseRange(val, from, to)) {
1601  "/rpt_unit_range is not a base range");
1602  } else {
1604  if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1606  "/rpt_unit_range is not within sequence length");
1607  } else {
1608  bool nulls_between = false;
1609  for ( CTypeConstIterator<CSeq_loc> lit = ConstBegin(m_Feat.GetLocation()); lit; ++lit ) {
1610  if ( lit->Which() == CSeq_loc::e_Null ) {
1611  nulls_between = true;
1612  }
1613  }
1614  if (nulls_between) {
1615  bool in_range = false;
1616  for ( CSeq_loc_CI it(m_Feat.GetLocation()); it; ++it ) {
1617  range = it.GetEmbeddingSeq_loc().GetTotalRange();
1618  if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1619  } else {
1620  in_range = true;
1621  }
1622  }
1623  if (! in_range) {
1625  "/rpt_unit_range is not within ordered intervals");
1626  }
1627  }
1628  }
1629  }
1630 }
1631 
1632 
1634 {
1635  bool only_digits = true,
1636  has_spaces = false;
1637 
1638  ITERATE(string, it, val) {
1639  if ( isspace((unsigned char)(*it)) ) {
1640  has_spaces = true;
1641  }
1642  if ( !isdigit((unsigned char)(*it)) ) {
1643  only_digits = false;
1644  }
1645  }
1646  if (only_digits || has_spaces) {
1647  PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier label");
1648  }
1649 }
1650 
1651 
1653 {
1654  if (!NStr::StartsWith (val, "(")) {
1655  EAccessionFormatError valid_accession = ValidateAccessionString (val, true);
1656  if (valid_accession == eAccessionFormat_missing_version) {
1658  val + " accession missing version for qualifier compare");
1659  } else if (valid_accession == eAccessionFormat_bad_version) {
1661  val + " accession has bad version for qualifier compare");
1662  } else if (valid_accession != eAccessionFormat_valid) {
1664  val + " is not a legal accession for qualifier compare");
1665  } else if (m_Imp.IsINSDInSep() && NStr::Find (val, "_") != string::npos) {
1667  "RefSeq accession " + val + " cannot be used for qualifier compare");
1668  }
1669  }
1670 }
1671 
1672 
// Reports whether every character of `str` is a member of `consist`.
//
//   str     - text to check; an empty string trivially passes
//   consist - the set of permitted characters
//
// Returns true when no character outside `consist` occurs in `str`.
// The whole std::string contents are examined, so an embedded NUL does not
// cut the scan short and let trailing illegal characters slip through.
static bool s_StringConsistsOf (string str, string consist)
{
    return str.find_first_not_of(consist) == string::npos;
}
1687 
1688 
// Validates the value of a replace qualifier.
//   key      - the feature key; "variation" selects the stricter alphabet
//   qual_str - the qualifier name, used in the message text
//   val      - the qualifier value
// Nothing is checked unless m_LocationBioseq is set.
// NOTE(review): this listing omits original lines 1695, 1700, 1706, 1721 and
// 1725 (the embedded numbering skips them). The opening of each
// error-reporting call and the declaration of `nuc_vec` presumably live
// there - confirm against the repository copy.
1689  void CSingleFeatValidator::x_ValidateReplaceQual(const string& key, const string& qual_str, const string& val)
1690 {
1691  if (m_LocationBioseq) {
// Alphabet check. On a nucleotide bioseq a "variation" feature must use only
// acgtACGT, and any other key only lower-case acgtmrwsykvhdbn. On a protein
// bioseq the value must use only lower-case acdefghiklmnpqrstuvwy*.
1692  if (m_LocationBioseq.IsNa()) {
1693  if (NStr::Equal(key, "variation")) {
1694  if (!s_StringConsistsOf (val, "acgtACGT")) {
1696  val + " is not a legal value for qualifier " + qual_str
1697  + " - should only be composed of acgt unambiguous nucleotide bases");
1698  }
1699  } else if (!s_StringConsistsOf (val, "acgtmrwsykvhdbn")) {
1701  val + " is not a legal value for qualifier " + qual_str
1702  + " - should only be composed of acgtmrwsykvhdbn nucleotide bases");
1703  }
1704  } else if (m_LocationBioseq.IsAa()) {
1705  if (!s_StringConsistsOf (val, "acdefghiklmnpqrstuvwy*")) {
1707  val + " is not a legal value for qualifier " + qual_str
1708  + " - should only be composed of acdefghiklmnpqrstuvwy* amino acids");
1709  }
1710  }
1711 
1712  // if no point in location with fuzz, info if text matches sequence
1713  bool has_fuzz = false;
1714  for( objects::CSeq_loc_CI it(m_Feat.GetLocation()); it && !has_fuzz; ++it) {
1715  if (it.IsPoint() && (it.GetFuzzFrom() || it.GetFuzzTo())) {
1716  has_fuzz = true;
1717  }
1718  }
// The comparison with the sequence is made only when the value is exactly as
// long as the feature location.
1719  if (!has_fuzz && val.length() == GetLength (m_Feat.GetLocation(), &m_Scope)) {
1720  try {
// `nuc_vec` is declared on the omitted line 1721.
1722  string bases;
1723  nuc_vec.GetSeqData(0, nuc_vec.size(), bases);
1724  if (NStr::EqualNocase(val, bases)) {
1726  "/replace already matches underlying sequence (" + val + ")");
1727  }
// Failures while reading the sequence are swallowed; the check is skipped.
1728  } catch (const CException& ) {
1729  } catch (const std::exception& ) {
1730  }
1731  }
1732  }
1733 }
1734 
1735 
1737 {
1738  if (HasBadCharacter (value)) {
1740  field_name + " contains undesired character");
1741  }
1742  if (EndsWithBadCharacter (value)) {
1744  field_name + " ends with undesired character");
1745  }
1746  if (NStr::EndsWith (value, "-")) {
1749  field_name + " ends with hyphen");
1750  }
1751 }
1752 
1753 
// Runs the splice-site checks for this feature.
//   gene_pseudo - forwarded to CalculateSpliceProblems
//   check_all   - forwarded to CalculateSpliceProblems
// Returns immediately when m_LocationBioseq is not set.
// NOTE(review): this listing omits original lines 1764 and 1769 (the embedded
// numbering skips them). The declaration of `label` and the opening of the
// error-reporting call presumably live there - confirm against the
// repository copy.
1754  void CSingleFeatValidator::ValidateSplice(bool gene_pseudo, bool check_all)
1755 {
1756  if (!m_LocationBioseq) {
1757  return;
1758  }
1759 
1760  CSpliceProblems splice_problems;
1761  splice_problems.CalculateSpliceProblems(m_Feat, check_all, gene_pseudo, m_LocationBioseq);
1762 
// Individual donor/acceptor problems are reported only when the calculator
// says they are unexpected.
1763  if (splice_problems.AreErrorsUnexpected()) {
1765  x_ReportSpliceProblems(splice_problems, label);
1766  }
1767 
1768  if (splice_problems.IsExceptionUnnecessary()) {
1770  "feature has exception but passes splice site test");
1771  }
1772 }
1773 
1774 
1776 {
1777  EDiagSev sev = eDiag_Warning;
1778  if (m_Imp.IsGpipe() && m_Imp.IsGenomic()) {
1779  sev = eDiag_Info;
1780  } else if ((m_Imp.IsGPS() || m_Imp.IsRefSeq()) && !m_Imp.ReportSpliceAsError()) {
1781  sev = eDiag_Warning;
1782  }
1783  return sev;
1784 }
1785 
1786 
1788 {
1789  if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1791  "Bad sequence at splice donor after exon ending at position "
1792  + NStr::IntToString(problem.second + 1) + " of " + label);
1793  } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1795  "Splice donor consensus (GT) not found after exon ending at position "
1796  + NStr::IntToString(problem.second + 1) + " of " + label);
1797  }
1798 
1799 }
1800 
1801 
1803 {
1804  if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1806  "Bad sequence at splice acceptor before exon starting at position "
1807  + NStr::IntToString(problem.second + 1) + " of " + label);
1808  } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1810  "Splice acceptor consensus (AG) not found before exon starting at position "
1811  + NStr::IntToString(problem.second + 1) + " of " + label);
1812  }
1813 
1814 }
1815 
1816 
1818 (const CSpliceProblems& problems, const string& label)
1819 {
1820  const CSpliceProblems::TSpliceProblemList& donor_problems = problems.GetDonorProblems();
1821  for (auto it = donor_problems.begin(); it != donor_problems.end(); it++) {
1823  }
1824  const CSpliceProblems::TSpliceProblemList& acceptor_problems = problems.GetAcceptorProblems();
1825  for (auto it = acceptor_problems.begin(); it != acceptor_problems.end(); it++) {
1827  }
1828 }
1829 
1830 
1832 {
1833  if (bsh) {
1834  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
1835  if ((*it)->IsOther() && (*it)->GetOther().IsSetAccession()
1836  && NStr::StartsWith ((*it)->GetOther().GetAccession(), "NM_")) {
1837  return true;
1838  }
1839  }
1840  }
1841  return false;
1842 }
1843 
1844 
1846 {
1847  if (m_Feat.GetData().IsImp()) {
1848  return;
1849  }
1850  string key = m_Feat.GetData().GetKey();
1851 
1853 
1854  // look for mandatory qualifiers
1855  EDiagSev sev = eDiag_Warning;
1856 
1857  for (auto required : CSeqFeatData::GetMandatoryQualifiers(subtype))
1858  {
1859  bool found = false;
1860  if (m_Feat.IsSetQual()) {
1861  for (auto qual : m_Feat.GetQual()) {
1862  if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
1863  found = true;
1864  break;
1865  }
1866  }
1867  }
1868 
1869  if (!found) {
1870  if (required == CSeqFeatData::eQual_citation) {
1871  if (m_Feat.IsSetCit()) {
1872  found = true;
1873  } else if (m_Feat.IsSetComment() && NStr::EqualNocase (key, "conflict")) {
1874  // RefSeq allows conflict with accession in comment instead of sfp->cit
1876  if ((*it)->IsOther()) {
1877  found = true;
1878  break;
1879  }
1880  }
1881  }
1882  }
1883  }
1884  if (!found && (NStr::EqualNocase (key, "conflict") || NStr::EqualNocase (key, "old_sequence"))) {
1885  if (m_Feat.IsSetQual()) {
1886  for (auto qual : m_Feat.GetQual()) {
1887  if (qual->IsSetQual() && NStr::EqualNocase(qual->GetQual(), "compare")
1888  && qual->IsSetVal() && !NStr::IsBlank(qual->GetVal())) {
1889  found = true;
1890  break;
1891  }
1892  }
1893  }
1894  }
1895  if (!found && required == CSeqFeatData::eQual_ncRNA_class) {
1896  sev = eDiag_Error;
1897  if (m_Feat.GetData().IsRna() && m_Feat.GetData().GetRna().IsSetExt()
1898  && m_Feat.GetData().GetRna().GetExt().IsGen()
1901  found = true;
1902  }
1903  }
1904 
1905  if (!found) {
1907  "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
1908  " for feature " + key);
1909  }
1910  }
1911 }
1912 
1913 
1914 static bool s_LocationStrandsIncompatible (const CSeq_loc& loc1, const CSeq_loc& loc2, CScope * scope)
1915 {
1916  ENa_strand strand1 = loc1.GetStrand();
1917  ENa_strand strand2 = loc2.GetStrand();
1918 
1919  if (strand1 == strand2) {
1920  return false;
1921  }
1922  if ((strand1 == eNa_strand_unknown || strand1 == eNa_strand_plus) &&
1923  (strand2 == eNa_strand_unknown || strand2 == eNa_strand_plus)) {
1924  return false;
1925  }
1926  if (strand1 == eNa_strand_other) {
1927  ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1928  if (comp == eContains) {
1929  return false;
1930  }
1931  } else if (strand2 == eNa_strand_other) {
1932  ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1933  if (comp == eContained) {
1934  return false;
1935  }
1936  }
1937 
1938  return true;
1939 }
1940 
1941 
1943 {
1944  bool bad_strand = s_LocationStrandsIncompatible(gene.GetLocation(), m_Feat.GetLocation(), &m_Scope);
1945  if (bad_strand) {
1947  "Gene cross-reference is not on expected strand");
1948  }
1949 
1950 }
1951 
1952 
1954 {
1955  bool equivalent = false;
1956  if (g1.IsSetLocus_tag()
1957  && g2.IsSetLocus_tag()) {
1959  g2.GetLocus_tag())) {
1960  label = g1.GetLocus_tag();
1961  equivalent = true;
1962  }
1963  } else if (g1.IsSetLocus()
1964  && g2.IsSetLocus()) {
1965  if (NStr::EqualNocase(g1.GetLocus(),
1966  g2.GetLocus())) {
1967  label = g1.GetLocus();
1968  equivalent = true;
1969  }
1970  } else if (g1.IsSetSyn()
1971  && g2.IsSetSyn()) {
1972  if (NStr::EqualNocase (g1.GetSyn().front(),
1973  g2.GetSyn().front())) {
1974  label = g1.GetSyn().front();
1975  equivalent = true;
1976  }
1977  }
1978  return equivalent;
1979 }
1980 
1981 
1982 // Check for redundant gene Xref
1983 // Do not call if feat is gene
1985 {
1986  if (m_Feat.IsSetData() && m_Feat.GetData().IsGene()) {
1987  return;
1988  }
1989  auto tse = m_Imp.GetTSE_Handle();
1990  if (!tse) {
1991  return;
1992  }
1993 
1994  // first, look for gene by feature id xref
1995  bool has_gene_id_xref = false;
1996  if (m_Feat.IsSetXref()) {
1998  if ((*xref)->IsSetId() && (*xref)->GetId().IsLocal()) {
1999  CTSE_Handle::TSeq_feat_Handles gene_feats =
2000  tse.GetFeaturesWithId(CSeqFeatData::eSubtype_gene, (*xref)->GetId().GetLocal());
2001  if (gene_feats.size() > 0) {
2002  has_gene_id_xref = true;
2003  ITERATE(CTSE_Handle::TSeq_feat_Handles, gene, gene_feats) {
2004  x_ValidateGeneFeaturePair(*(gene->GetSeq_feat()));
2005  }
2006  }
2007  }
2008  }
2009  }
2010  if (has_gene_id_xref) {
2011  return;
2012  }
2013 
2014  // if we can't get the bioseq on which the gene is located, we can't check for
2015  // overlapping/ambiguous/redundant conditions
2016  if (!m_LocationBioseq) {
2017  return;
2018  }
2019 
2020  const CGene_ref* gene_xref = m_Feat.GetGeneXref();
2021 
2022  size_t num_genes = 0;
2023  size_t max = 0;
2024  size_t num_trans_spliced = 0;
2025  bool equivalent = false;
2026  /*
2027  CFeat_CI gene_it(bsh, CSeqFeatData::e_Gene);
2028  */
2029 
2030  //CFeat_CI gene_it(*m_Scope, feat.GetLocation(), SAnnotSelector (CSeqFeatData::e_Gene));
2031  CFeat_CI gene_it(m_LocationBioseq,
2035  CFeat_CI prev_gene;
2036  string label = "?";
2037  size_t num_match_by_locus = 0;
2038  size_t num_match_by_locus_tag = 0;
2039 
2040  for ( ; gene_it; ++gene_it) {
2041  if (gene_xref && gene_xref->IsSetLocus() &&
2042  gene_it->GetData().GetGene().IsSetLocus() &&
2043  NStr::Equal(gene_xref->GetLocus(), gene_it->GetData().GetGene().GetLocus())) {
2044  num_match_by_locus++;
2045  x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2046  }
2047  if (gene_xref && gene_xref->IsSetLocus_tag() &&
2048  gene_it->GetData().GetGene().IsSetLocus_tag() &&
2049  NStr::Equal(gene_xref->GetLocus_tag(), gene_it->GetData().GetGene().GetLocus_tag())) {
2050  num_match_by_locus_tag++;
2051  x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2052  if ((!gene_xref->IsSetLocus() || NStr::IsBlank(gene_xref->GetLocus())) &&
2053  gene_it->GetData().GetGene().IsSetLocus() &&
2054  !NStr::IsBlank(gene_it->GetData().GetGene().GetLocus())) {
2056  "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists");
2057  }
2058  }
2059 
2060  if (TestForOverlapEx (gene_it->GetLocation(), m_Feat.GetLocation(),
2061  gene_it->GetLocation().IsInt() ? eOverlap_Contained : eOverlap_Subset, &m_Scope) >= 0) {
2062  size_t len = GetLength(gene_it->GetLocation(), &m_Scope);
2063  if (len < max || num_genes == 0) {
2064  num_genes = 1;
2065  max = len;
2066  num_trans_spliced = 0;
2067  if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2068  NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2069  num_trans_spliced++;
2070  }
2071  equivalent = false;
2072  prev_gene = gene_it;
2073  } else if (len == max) {
2074  equivalent |= s_GeneRefsAreEquivalent(gene_it->GetData().GetGene(), prev_gene->GetData().GetGene(), label);
2075  num_genes++;
2076  if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2077  NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2078  num_trans_spliced++;
2079  }
2080  }
2081  }
2082  }
2083 
2084  if (!gene_xref) {
2085  // if there is no gene xref, then there should be 0 or 1 overlapping genes
2086  // so that mapping by overlap is unambiguous
2087  if (num_genes > 1 &&
2090  if (m_Imp.IsSmallGenomeSet() && num_genes == num_trans_spliced) {
2091  /* suppress for trans-spliced genes on small genome set */
2092  } else if (equivalent) {
2094  "Feature overlapped by "
2095  + NStr::SizetToString(num_genes)
2096  + " identical-length equivalent genes but has no cross-reference");
2097  } else {
2099  "Feature overlapped by "
2100  + NStr::SizetToString(num_genes)
2101  + " identical-length genes but has no cross-reference");
2102  }
2103  } else if (num_genes == 1
2104  && prev_gene->GetData().GetGene().IsSetAllele()
2105  && !NStr::IsBlank(prev_gene->GetData().GetGene().GetAllele())) {
2106  const string& allele = prev_gene->GetData().GetGene().GetAllele();
2107  // overlapping gene should not conflict with allele qualifier
2108  FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2109  const CGb_qual& qual = **qual_iter;
2110  if ( qual.IsSetQual() &&
2111  NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2112  if ( qual.CanGetVal() &&
2113  NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2115  "Redundant allele qualifier (" + allele +
2116  ") on gene and feature");
2119  "Mismatched allele qualifier on gene (" + allele +
2120  ") and feature (" + qual.GetVal() +")");
2121  }
2122  }
2123  }
2124  }
2125  } else if ( !gene_xref->IsSuppressed() ) {
2126  // we are counting features with gene xrefs
2128 
2129  // make sure overlapping gene and gene xref do not conflict
2130  if (gene_xref->IsSetAllele() && !NStr::IsBlank(gene_xref->GetAllele())) {
2131  const string& allele = gene_xref->GetAllele();
2132 
2133  FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2134  const CGb_qual& qual = **qual_iter;
2135  if ( qual.CanGetQual() &&
2136  NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2137  if ( qual.CanGetVal() &&
2138  NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2140  "Redundant allele qualifier (" + allele +
2141  ") on gene and feature");
2144  "Mismatched allele qualifier on gene (" + allele +
2145  ") and feature (" + qual.GetVal() +")");
2146  }
2147  }
2148  }
2149  }
2150 
2151  if (num_match_by_locus == 0 && num_match_by_locus_tag == 0) {
2152  // find gene on bioseq to match genexref
2153  if ((gene_xref->IsSetLocus_tag() &&
2154  !NStr::IsBlank(gene_xref->GetLocus_tag())) ||
2155  (gene_xref->IsSetLocus() &&
2156  !NStr::IsBlank(gene_xref->GetLocus()))) {
2158  if (!gene && m_LocationBioseq && m_LocationBioseq.IsAa()) {
2160  if (cds) {
2161  if (cds->IsSetLocation()) {
2162  const CSeq_loc& loc = cds->GetLocation();
2163  const CSeq_id* id = loc.GetId();
2164  if (id) {
2166  if (nbsh) {
2167  gene = m_Imp.GetGeneCache().GetGeneFromCache(cds, m_Scope);
2168  }
2169  }
2170  }
2171  }
2172  }
2173  string label2;
2174  if (gene && !CSingleFeatValidator::s_GeneRefsAreEquivalent(*gene_xref, gene->GetData().GetGene(), label2)) {
2175  gene.Reset();
2176  }
2177  if (gene_xref->IsSetLocus_tag() &&
2178  !NStr::IsBlank(gene_xref->GetLocus_tag()) &&
2179  !gene) {
2181  "Feature has gene locus_tag cross-reference but no equivalent gene feature exists");
2182  } else if (gene_xref->IsSetLocus() &&
2183  !NStr::IsBlank(gene_xref->GetLocus()) &&
2184  !gene) {
2186  "Feature has gene locus cross-reference but no equivalent gene feature exists");
2187  }
2188  }
2189  }
2190  }
2191 
2192 }
2193 
2194 
2196 {
2197  if (m_Feat.GetData().IsGene()) {
2198  return;
2199  }
2201 
2202  if (m_Feat.IsSetQual()) {
2203  // check old locus tag on feature and overlapping gene
2204  for (auto it : m_Feat.GetQual()) {
2205  if (it->IsSetQual() && NStr::Equal(it->GetQual(), "old_locus_tag")
2206  && it->IsSetVal() && !NStr::IsBlank(it->GetVal())) {
2207  x_ValidateOldLocusTag(it->GetVal());
2208  }
2209  }
2210  }
2211 }
2212 
2213 
2215 {
2216  if (ref.IsSetPseudo() && ref.GetPseudo()) {
2217  return true;
2218  } else {
2219  return false;
2220  }
2221 }
2222 
2223 
2224 bool s_HasNamedQual(const CSeq_feat& feat, const string& qual)
2225 {
2226  bool rval = false;
2227  if (feat.IsSetQual()) {
2228  for (auto it : feat.GetQual()) {
2229  if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), qual)) {
2230  rval = true;
2231  break;
2232  }
2233  }
2234  }
2235  return rval;
2236 }
2237 
2238 
2240 {
2241  if (feat.IsSetPseudo() && feat.GetPseudo()) {
2242  return true;
2243  } else if (s_HasNamedQual(feat, "pseudogene")) {
2244  return true;
2245  } else if (feat.IsSetData() && feat.GetData().IsGene() &&
2246  s_IsPseudo(feat.GetData().GetGene())) {
2247  return true;
2248  } else {
2249  return false;
2250  }
2251 }
2252 
2253 
// Validates an old_locus_tag qualifier value found on this feature.
//   old_locus_tag - the qualifier value; a blank value is ignored
// NOTE(review): this listing omits original lines 2263, 2279 and 2291 (the
// embedded numbering skips them). The declaration of `overlap` and the
// opening of each error-reporting call presumably live there - confirm
// against the repository copy.
2254  void CSingleFeatValidator::x_ValidateOldLocusTag(const string& old_locus_tag)
2255 {
2256  if (NStr::IsBlank(old_locus_tag)) {
2257  return;
2258  }
2259  bool pseudo = s_IsPseudo(m_Feat);
2260  const CGene_ref* grp = m_Feat.GetGeneXref();
// With no gene xref on the feature, fall back to `overlap` (declared on the
// omitted line 2263) for the pseudo flag, the gene's own old_locus_tag, and
// the gene reference.
2261  if ( !grp) {
2262  // check overlapping gene
2264  if ( overlap ) {
2265  if (s_IsPseudo(*overlap)) {
2266  pseudo = true;
2267  }
2268  string gene_old_locus_tag;
2269 
// Take the first non-blank old_locus_tag qualifier on the gene.
2270  FOR_EACH_GBQUAL_ON_SEQFEAT (it, *overlap) {
2271  if ((*it)->IsSetQual() && NStr::Equal ((*it)->GetQual(), "old_locus_tag")
2272  && (*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
2273  gene_old_locus_tag = (*it)->GetVal();
2274  break;
2275  }
2276  }
// The two values are compared case-insensitively, and only when the gene
// actually has one.
2277  if (!NStr::IsBlank (gene_old_locus_tag)
2278  && !NStr::EqualNocase (gene_old_locus_tag, old_locus_tag)) {
2280  "Old locus tag on feature (" + old_locus_tag
2281  + ") does not match that on gene (" + gene_old_locus_tag + ")");
2282  }
2283  grp = &(overlap->GetData().GetGene());
2284  }
2285  }
2286  if (grp && s_IsPseudo(*grp)) {
2287  pseudo = true;
2288  }
// A missing or blank locus_tag on the gene reference is reported only when
// neither the feature nor the gene was found to be pseudo.
2289  if (!grp || !grp->IsSetLocus_tag() || NStr::IsBlank (grp->GetLocus_tag())) {
2290  if (! pseudo) {
2292  "old_locus_tag without inherited locus_tag");
2293  }
2294  }
2295 }
2296 
2297 
2299 {
2300  if (!m_Feat.GetData().IsImp()) {
2301  return;
2302  }
2303  const string& key = m_Feat.GetData().GetImp().GetKey();
2304  // validate the feature's location
2305  if ( m_Feat.GetData().GetImp().IsSetLoc() ) {
2306  const string& imp_loc = m_Feat.GetData().GetImp().GetLoc();
2307  if ( imp_loc.find("one-of") != string::npos ) {
2309  "ImpFeat loc " + imp_loc +
2310  " has obsolete 'one-of' text for feature " + key);
2311  } else if ( m_Feat.GetLocation().IsInt() ) {
2312  const CSeq_interval& seq_int = m_Feat.GetLocation().GetInt();
2313  string temp_loc = NStr::IntToString(seq_int.GetFrom() + 1) +
2314  ".." + NStr::IntToString(seq_int.GetTo() + 1);
2315  if ( imp_loc != temp_loc ) {
2317  "ImpFeat loc " + imp_loc + " does not equal feature location " +
2318  temp_loc + " for feature " + key);
2319  }
2320  }
2321  }
2322 
2323 }
2324 
2325 
2327 {
2328  if (!m_Feat.GetData().IsImp()) {
2329  return;
2330  }
2331  const string& key = m_Feat.GetData().GetImp().GetKey();
2332 
2333  // Make sure a feature has its mandatory qualifiers
2334  for (auto required : CSeqFeatData::GetMandatoryQualifiers(m_Feat.GetData().GetSubtype())) {
2335  bool found = false;
2336  if (m_Feat.IsSetQual()) {
2337  for (auto qual : m_Feat.GetQual()) {
2338  if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
2339  found = true;
2340  break;
2341  }
2342  }
2343  if (!found && required == CSeqFeatData::eQual_citation) {
2344  if (m_Feat.IsSetCit()) {
2345  found = true;
2346  }
2347  else if (m_Feat.IsSetComment() && !NStr::IsBlank(m_Feat.GetComment())) {
2348  // RefSeq allows conflict with accession in comment instead of sfp->cit
2349  if (m_LocationBioseq) {
2351  if ((*it)->IsOther()) {
2352  found = true;
2353  break;
2354  }
2355  }
2356  }
2357  }
2358  if (!found
2359  && (NStr::EqualNocase(key, "conflict")
2360  || NStr::EqualNocase(key, "old_sequence"))) {
2361  // compare qualifier can now substitute for citation qualifier for conflict and old_sequence
2363  if ((*qual)->IsSetQual() && CSeqFeatData::GetQualifierType((*qual)->GetQual()) == CSeqFeatData::eQual_compare) {
2364  found = true;
2365  break;
2366  }
2367  }
2368  }
2369  }
2370  }
2371  if (!found) {
2373  "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
2374  " for feature " + key);
2375  }
2376  }
2377 }
2378 
2379 
2381 {
2382  switch ( m_Feat.GetData().Which () ) {
2383  case CSeqFeatData::e_Gene:
2385  case CSeqFeatData::e_Prot:
2386  case CSeqFeatData::e_Rna:
2387  case CSeqFeatData::e_Pub:
2388  case CSeqFeatData::e_Imp:
2390  case CSeqFeatData::e_Org:
2392  case CSeqFeatData::e_Seq:
2394  case CSeqFeatData::e_Bond:
2395  case CSeqFeatData::e_Site:
2396  case CSeqFeatData::e_Rsite:
2397  case CSeqFeatData::e_User:
2399  case CSeqFeatData::e_Num:
2402  case CSeqFeatData::e_Het:
2403  case CSeqFeatData::e_Clone:
2405  break;
2406  default:
2408  "Invalid SeqFeat type [" + NStr::IntToString(m_Feat.GetData().Which()) + "]");
2409  break;
2410  }
2411 }
2412 
2413 
2415 {
2416  bool rval = false;
2417  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
2418  if ((*it)->IsOther() && (*it)->GetTextseq_Id()->IsSetAccession()
2419  && NStr::StartsWith ((*it)->GetTextseq_Id()->GetAccession(), prefix)) {
2420  rval = true;
2421  break;
2422  }
2423  }
2424  return rval;
2425 }
2426 
2427 
2429 {
2430  if (!m_Feat.IsSetData() ||
2433  return;
2434  }
2435  string sfp_pseudo;
2436  string gene_pseudo;
2437  bool has_sfp_pseudo = false;
2438  bool has_gene_pseudo = false;
2439  if (m_Feat.IsSetQual()) {
2440  for (auto it : m_Feat.GetQual()) {
2441  if (it->IsSetQual() &&
2442  NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2443  it->IsSetVal()) {
2444  sfp_pseudo = it->GetVal();
2445  has_sfp_pseudo = true;
2446  }
2447  }
2448  }
2449  if (gene && gene->IsSetQual()) {
2450  for (auto it : gene->GetQual()) {
2451  if (it->IsSetQual() &&
2452  NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2453  it->IsSetVal()) {
2454  gene_pseudo = it->GetVal();
2455  has_gene_pseudo = true;
2456  }
2457  }
2458  }
2459 
2460  if (!has_sfp_pseudo && !has_gene_pseudo) {
2461  return;
2462  } else if (!has_sfp_pseudo) {
2463  return;
2464  } else if (has_sfp_pseudo && !has_gene_pseudo) {
2465  string msg = m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2466  msg += " has pseudogene qualifier, gene does not";
2468  msg);
2469  } else if (!NStr::EqualNocase(sfp_pseudo, gene_pseudo)) {
2470  string msg = "Different pseudogene values on ";
2471  msg += m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2472  msg += " (" + sfp_pseudo + ") and gene (" + gene_pseudo + ")";
2474  msg);
2475  }
2476 }
2477 
2478 
// Compares the gene locus_tag with the string tag of each general Seq-id on the
// product Bioseq and flags a mismatch.
// NOTE(review): the listing's numbering skips 2480 (presumably the function
// signature, which would declare `gene`) and 2522 (presumably the start of the
// call that reports the message below) -- confirm against the full file.
2479 // grp is from gene xref or from overlapping gene
2481 {
      // The check is optional (switched on through m_Imp) and needs a product.
2482  if (!m_Imp.IsLocusTagGeneralMatch()) {
2483  return;
2484  }
2485  if (!m_Feat.IsSetProduct()) {
2486  return;
2487  }
2488 
2489  CTempString locus_tag = kEmptyStr;
2490  // obtain the gene-ref from the feature or the overlapping gene
      // Priority: a suppressed gene xref ends the check; otherwise a non-blank
      // locus_tag on the xref wins over the one on `gene`; with neither, return.
2491  const CGene_ref* grp = m_Feat.GetGeneXref();
2492  if (grp && grp->IsSuppressed()) {
2493  return;
2494  } else if (grp && grp->IsSetLocus_tag() &&
2495  !NStr::IsBlank(grp->GetLocus_tag())) {
2496  locus_tag = grp->GetLocus_tag();
2497  } else if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
2498  !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
2499  locus_tag = gene->GetData().GetGene().GetLocus_tag();
2500  } else {
2501  return;
2502  }
2503 
2504  if (!m_ProductBioseq) {
2505  return;
2506  }
2507 
      // Only general IDs with a db that is set and not "skippable" are examined.
2508  for (auto id : m_ProductBioseq.GetId()) {
2509  CConstRef<CSeq_id> seqid = id.GetSeqId();
2510  if (!seqid || !seqid->IsGeneral()) {
2511  continue;
2512  }
2513  const CDbtag& dbt = seqid->GetGeneral();
2514  if (!dbt.IsSetDb() || dbt.IsSkippable()) {
2515  continue;
2516  }
2517 
2518  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
      // Compare only the part of the tag before the first '-'; when there is no
      // '-', pos is npos and substr() yields the whole tag.
2519  SIZE_TYPE pos = dbt.GetTag().GetStr().find('-');
2520  string str = dbt.GetTag().GetStr().substr(0, pos);
2521  if (!NStr::EqualNocase(locus_tag, str)) {
2523  "Gene locus_tag does not match general ID of product");
2524  }
2525  }
2526  }
2527 }
2528 
2529 
// Returns a copy of src in which every byte outside the range 32..127
// (inclusive) is replaced by '#'; bytes inside that range are kept unchanged.
static string s_AsciiString(const string& src)
{
    string sanitized(src);

    for (char& c : sanitized) {
        // Classify on the unsigned byte value so high-bit bytes are not negative.
        const unsigned char code = static_cast<unsigned char>(c);
        if (code <= 31 || code >= 128) {
            c = '#';
        }
    }

    return sanitized;
}
2545 
2546 
// Walks the strings produced by iterator `it` and, for each string, reports the
// first character that fails the test below, then moves on to the next string.
// NOTE(review): the listing's numbering skips 2547 (presumably the signature),
// 2549 (presumably the declaration of `it`) and 2558 (presumably the start of the
// reporting call) -- confirm against the full file.
2548 {
2550 
2551  for (; it; ++it) {
2552  const string& str = *it;
2553  FOR_EACH_CHAR_IN_STRING(c_it, str) {
2554  const char& ch = *c_it;
      // chu is the unsigned byte value; it is what gets printed in the message.
2555  unsigned char chu = ch;
      // Tab, CR and LF are the only values below 32 that are accepted.
      // NOTE(review): where plain char is signed, `ch > 127` can never be true;
      // high-bit bytes are then negative and are caught by `ch < 32` instead.
2556  if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
      // The offending string is echoed with unprintable bytes masked as '#'.
2557  string txt = s_AsciiString(str);
2559  "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in feature (" + txt + ")");
      // One report per string: stop scanning this string after the first hit.
2560  break;
2561  }
2562  }
2563  }
2564 }
2565 
// Validation of a protein feature (m_Feat carrying a Prot-ref): emptiness, EC
// number on hypothetical/unknown names, SGML in the description, comment
// duplicating the description, first-name checks, db xrefs, and missing name.
// NOTE(review): many lines are absent from this listing (numbering skips 2566,
// 2568, 2579, 2586, 2592, 2596-2597, 2601, 2607, 2621, 2624, 2627, 2630, 2635,
// 2637). They presumably hold the signature, the calls that report the message
// strings below, one `if` condition and some further calls -- confirm against
// the full file before editing.
2567 {
2569 
2570  x_CheckForEmpty();
2571 
2572  const CProt_ref& prot = m_Feat.GetData().GetProt();
      // Case-sensitive match against four exact spellings; only applies when an
      // EC number is set and no processing state is set. `it` is a per-element copy.
2573  for (auto it : prot.GetName()) {
2574  if (prot.IsSetEc() && !prot.IsSetProcessed()
2575  && (NStr::EqualCase (it, "Hypothetical protein")
2576  || NStr::EqualCase (it, "hypothetical protein")
2577  || NStr::EqualCase (it, "Unknown protein")
2578  || NStr::EqualCase (it, "unknown protein"))) {
2580  "Unknown or hypothetical protein should not have EC number");
2581  }
2582 
2583  }
2584 
2585  if (prot.IsSetDesc() && ContainsSgml(prot.GetDesc())) {
2587  "protein description " + prot.GetDesc() + " has SGML");
2588  }
2589 
2590  if (prot.IsSetDesc() && m_Feat.IsSetComment()
2591  && NStr::EqualCase(prot.GetDesc(), m_Feat.GetComment())) {
2593  "Comment has same value as protein description");
2594  }
2595 
      // NOTE(review): the condition guarding the next message is not visible here.
2598  "Apparent EC number in protein comment");
2599  }
2600 
2602 
2603  // only look for EC numbers in first protein name
2604  // only look for brackets and hypothetical protein XP_ in first protein name
2605  if (prot.IsSetName() && prot.GetName().size() > 0) {
2606  if (HasECnumberPattern(prot.GetName().front())) {
2608  "Apparent EC number in protein title");
2609  }
2610  x_ValidateProteinName(prot.GetName().front());
2611  }
2612 
2613  if ( prot.CanGetDb () ) {
2614  m_Imp.ValidateDbxref(prot.GetDb(), m_Feat);
2615  }
      // No name at all: signal and transit peptides are exempt; for anything else
      // the message depends on which other field (desc, activity, EC) is filled.
2616  if ( (!prot.IsSetName() || prot.GetName().empty()) &&
2617  (!prot.IsSetProcessed()
2618  || (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide
2619  && prot.GetProcessed() != CProt_ref::eProcessed_transit_peptide))) {
2620  if (prot.IsSetDesc() && !NStr::IsBlank (prot.GetDesc())) {
2622  "Protein feature has description but no name");
2623  } else if (prot.IsSetActivity() && !prot.GetActivity().empty()) {
2625  "Protein feature has function but no name");
2626  } else if (prot.IsSetEc() && !prot.GetEc().empty()) {
2628  "Protein feature has EC number but no name");
2629  } else {
2631  "Protein feature has no name");
2632  }
2633  }
2634 
2636 
2638 }
2639 
2640 
// Flags a protein feature whose name, description, EC, activity and db fields
// are all empty. Signal and transit peptides are not checked.
// NOTE(review): the listing's numbering skips 2641 (presumably the signature),
// 2644 (presumably the declaration and default of `processed`) and 2672
// (presumably the start of the reporting call) -- confirm against the full file.
2642 {
2643  const CProt_ref& prot = m_Feat.GetData().GetProt();
2645 
2646  if ( prot.IsSetProcessed() ) {
2647  processed = prot.GetProcessed();
2648  }
2649 
      // Start from "empty" and clear the flag as soon as any field has content.
2650  bool empty = true;
2651  if ( processed != CProt_ref::eProcessed_signal_peptide &&
2652  processed != CProt_ref::eProcessed_transit_peptide ) {
      // For the name, only the first entry is inspected, and it must be non-empty.
2653  if ( prot.IsSetName() &&
2654  !prot.GetName().empty() &&
2655  !prot.GetName().front().empty() ) {
2656  empty = false;
2657  }
2658  if ( prot.CanGetDesc() && !prot.GetDesc().empty() ) {
2659  empty = false;
2660  }
2661  if ( prot.CanGetEc() && !prot.GetEc().empty() ) {
2662  empty = false;
2663  }
2664  if ( prot.CanGetActivity() && !prot.GetActivity().empty() ) {
2665  empty = false;
2666  }
2667  if ( prot.CanGetDb() && !prot.GetDb().empty() ) {
2668  empty = false;
2669  }
2670 
2671  if ( empty ) {
2673  "There is a protein feature where all fields are empty");
2674  }
2675  }
2676 }
2677 
2678 
// Table of protein names treated as uninformative. Callers lower-case the
// candidate name and look for an exact match (see the note below).
// NOTE(review): the listing's numbering skips 2847-2848 right after this table;
// the container named sc_BadProtName that the lookup code uses is presumably
// defined there from this array -- confirm against the full file.
2679 // note - list bad protein names in lower case, as search term is converted to lower case
2680 // before looking for exact match
2681 static const char* const sc_BadProtNameText [] = {
2682  "'hypothetical protein",
2683  "alpha",
2684  "alternative",
2685  "alternatively spliced",
2686  "bacteriophage hypothetical protein",
2687  "beta",
2688  "cellular",
2689  "cnserved hypothetical protein",
2690  "conesrved hypothetical protein",
2691  "conserevd hypothetical protein",
2692  "conserved archaeal protein",
2693  "conserved domain protein",
2694  "conserved hypohetical protein",
2695  "conserved hypotehtical protein",
2696  "conserved hypotheical protein",
2697  "conserved hypothertical protein",
2698  "conserved hypothetcial protein",
2699  "conserved hypothetical",
2700  "conserved hypothetical exported protein",
2701  "conserved hypothetical integral membrane protein",
2702  "conserved hypothetical membrane protein",
2703  "conserved hypothetical phage protein",
2704  "conserved hypothetical prophage protein",
2705  "conserved hypothetical protein",
2706  "conserved hypothetical protein - phage associated",
2707  "conserved hypothetical protein fragment 3",
2708  "conserved hypothetical protein, fragment",
2709  "conserved hypothetical protein, putative",
2710  "conserved hypothetical protein, truncated",
2711  "conserved hypothetical protein, truncation",
2712  "conserved hypothetical protein.",
2713  "conserved hypothetical protein; possible membrane protein",
2714  "conserved hypothetical protein; putative membrane protein",
2715  "conserved hypothetical proteins",
2716  "conserved hypothetical protien",
2717  "conserved hypothetical transmembrane protein",
2718  "conserved hypotheticcal protein",
2719  "conserved hypthetical protein",
2720  "conserved in bacteria",
2721  "conserved membrane protein",
2722  "conserved protein",
2723  "conserved protein of unknown function",
2724  "conserved protein of unknown function ; putative membrane protein",
2725  "conserved unknown protein",
2726  "conservedhypothetical protein",
2727  "conserverd hypothetical protein",
2728  "conservered hypothetical protein",
2729  "consrved hypothetical protein",
2730  "converved hypothetical protein",
2731  "cytokine",
2732  "delta",
2733  "drosophila",
2734  "duplicated hypothetical protein",
2735  "epsilon",
2736  "gamma",
2737  "hla",
2738  "homeodomain",
2739  "homeodomain protein",
2740  "homolog",
2741  "hyopthetical protein",
2742  "hypotethical",
2743  "hypotheical protein",
2744  "hypothertical protein",
2745  "hypothetcical protein",
2746  "hypothetical",
      // NOTE(review): the next entry is the only one that breaks the otherwise
      // ascending byte order of this table (it sorts after "hypothetical
      // conserved protein"). Possibly whitespace inside the literal was collapsed
      // when this listing was produced (e.g. a double space) -- verify the exact
      // literal against the repository, especially if the lookup container
      // requires sorted input.
2747  "hypothetical protein",
2748  "hypothetical conserved protein",
2749  "hypothetical exported protein",
2750  "hypothetical novel protein",
2751  "hypothetical orf",
2752  "hypothetical phage protein",
2753  "hypothetical prophage protein",
2754  "hypothetical protein (fragment)",
2755  "hypothetical protein (multi-domain)",
2756  "hypothetical protein (phage associated)",
2757  "hypothetical protein - phage associated",
2758  "hypothetical protein fragment",
2759  "hypothetical protein fragment 1",
2760  "hypothetical protein predicted by genemark",
2761  "hypothetical protein predicted by glimmer",
2762  "hypothetical protein predicted by glimmer/critica",
2763  "hypothetical protein, conserved",
2764  "hypothetical protein, phage associated",
2765  "hypothetical protein, truncated",
2766  "hypothetical protein-putative conserved hypothetical protein",
2767  "hypothetical protein.",
2768  "hypothetical proteins",
2769  "hypothetical protien",
2770  "hypothetical transmembrane protein",
2771  "hypothetoical protein",
2772  "hypothteical protein",
2773  "identified by sequence similarity; putative; orf located~using blastx/framed",
2774  "identified by sequence similarity; putative; orf located~using blastx/glimmer/genemark",
2775  "ion channel",
2776  "membrane protein, putative",
2777  "mouse",
2778  "narrowly conserved hypothetical protein",
2779  "novel protein",
2780  "orf",
2781  "orf, conserved hypothetical protein",
2782  "orf, hypothetical",
2783  "orf, hypothetical protein",
2784  "orf, hypothetical, fragment",
2785  "orf, partial conserved hypothetical protein",
2786  "orf; hypothetical protein",
2787  "orf; unknown function",
2788  "partial",
2789  "partial cds, hypothetical",
2790  "partially conserved hypothetical protein",
2791  "phage hypothetical protein",
2792  "phage-related conserved hypothetical protein",
2793  "phage-related protein",
2794  "plasma",
2795  "possible hypothetical protein",
2796  "precursor",
2797  "predicted coding region",
2798  "predicted protein",
2799  "predicted protein (pseudogene)",
2800  "predicted protein family",
2801  "product uncharacterised protein family",
2802  "protein family",
2803  "protein of unknown function",
2804  "pseudogene",
2805  "putative",
2806  "putative conserved protein",
2807  "putative exported protein",
2808  "putative hypothetical protein",
2809  "putative membrane protein",
2810  "putative orf; unknown function",
2811  "putative phage protein",
2812  "putative protein",
2813  "rearranged",
2814  "repeats containing protein",
2815  "reserved",
2816  "ribosomal protein",
2817  "similar to",
2818  "small",
2819  "small hypothetical protein",
2820  "transmembrane protein",
2821  "trna",
2822  "trp repeat",
2823  "trp-repeat protein",
2824  "truncated conserved hypothetical protein",
2825  "truncated hypothetical protein",
2826  "uncharacterized conserved membrane protein",
2827  "uncharacterized conserved protein",
2828  "uncharacterized conserved secreted protein",
2829  "uncharacterized protein",
2830  "uncharacterized protein conserved in archaea",
2831  "uncharacterized protein conserved in bacteria",
2832  "unique hypothetical",
2833  "unique hypothetical protein",
2834  "unknown",
2835  "unknown cds",
2836  "unknown function",
2837  "unknown gene",
2838  "unknown protein",
2839  "unknown, conserved protein",
2840  "unknown, hypothetical",
2841  "unknown-related protein",
2842  "unknown; predicted coding region",
2843  "unnamed",
2844  "unnamed protein product",
2845  "very hypothetical protein"
2846 };
2849 
2850 
// RefSeq-only check of protein names: reports a missing name, an empty name, or
// a name that is on the bad-name list or contains one of several marker strings.
// NOTE(review): the listing's numbering skips 2851 (presumably the signature)
// and 2861, 2870, 2879 (presumably the starts of the reporting calls) -- confirm
// against the full file.
2852 {
2853  if (!m_Imp.IsRefSeq()) {
2854  return;
2855  }
2856  const CProt_ref& prot = m_Feat.GetData().GetProt();
2857  if (!prot.IsSetName()) {
      // A missing name is tolerated for signal and transit peptides only.
2858  if (!prot.IsSetProcessed() ||
2859  (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide &&
2860  prot.GetProcessed() != CProt_ref::eProcessed_transit_peptide)) {
2862  "Protein name is not set");
2863  }
2864  return;
2865  }
2866  for (auto it : m_Feat.GetData().GetProt().GetName()) {
      // The lookup key is a lower-cased copy; the message quotes the original.
2867  string search = it;
2868  search = NStr::ToLower(search);
2869  if (search.empty()) {
2871  "Protein name is empty");
      // NOTE(review): any string containing "uniprotkb" also contains "uniprot",
      // so the "uniprotkb" test below cannot change the outcome.
2872  } else if (sc_BadProtName.find (search.c_str()) != sc_BadProtName.end()
2873  || NStr::Find(search, "=") != string::npos
2874  || NStr::Find(search, "~") != string::npos
2875  || NStr::FindNoCase(search, "uniprot") != string::npos
2876  || NStr::FindNoCase(search, "uniprotkb") != string::npos
2877  || NStr::FindNoCase(search, "pmid") != string::npos
2878  || NStr::FindNoCase(search, "dbxref") != string::npos) {
2880  "Uninformative protein name '" + it + "'");
2881  }
2882  }
2883 }
2884 
2885 
// Checks every EC number on the Prot-ref: blank, badly formatted, or (going by
// the messages below) deleted, transferred, or not a legal value.
// NOTE(review): this block has many gaps in the listing (numbering skips 2886,
// 2895, 2899-2900, 2902-2903, 2906-2908, 2911, 2915, 2918). The signature, the
// computation of `status`, the `case` labels of the switch and the starts of
// most reporting calls are not visible -- confirm against the full file.
2887 {
2888  if (!m_Feat.GetData().GetProt().IsSetEc()) {
2889  return;
2890  }
2891  for (auto it : m_Feat.GetData().GetProt().GetEc()) {
2892  if (NStr::IsBlank (it)) {
2893  PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
2894  } else if (!CProt_ref::IsValidECNumberFormat(it)) {
2896  (it) + " is not in proper EC_number format");
2897  } else {
2898  const string& ec_number = it;
2901  switch (status) {
2904  "EC_number " + ec_number + " was deleted");
2905  break;
2909  "EC_number " + ec_number + " was transferred and is no longer valid");
2910  break;
2912  {
      // The message depends on whether an 'n' followed by a digit is present.
      // c_str() is used so that pos + 1 == length reads the terminating NUL.
2913  size_t pos = NStr::Find (ec_number, "n");
2914  if (pos == string::npos || !isdigit (ec_number.c_str()[pos + 1])) {
2916  ec_number + " is not a legal value for qualifier EC_number");
2917  } else {
2919  ec_number + " is not a legal preliminary value for qualifier EC_number");
2920  }
2921  }
2922  break;
2923  default:
2924  break;
2925  }
2926  }
2927  }
2928 
2929 }
2930 
2931 
// Checks a single protein name: trailing bracket (possible organism name),
// "hypothetical protein XP_" accession mismatch, 'RefSeq' in the text, comment
// equal to the name, embedded PMID, nonstandard rubisco names, bad characters
// and SGML.
// @param prot_name  the name to check (the visible caller passes the first name)
// NOTE(review): the listing's numbering skips 2946, 2957, 2966, 2971, 2984 and
// 2994, presumably the starts of the reporting calls for the messages below --
// confirm against the full file.
2932 void CProtValidator::x_ValidateProteinName(const string& prot_name)
2933 {
2934  if (NStr::EndsWith(prot_name, "]")) {
      // The name is reported unless the text from the last '[' onward is at least
      // 5 characters long and starts with "[NAD".
2935  bool report_name = true;
2936  size_t pos = NStr::Find(prot_name, "[", NStr::eNocase, NStr::eReverseSearch);
2937  if (pos == string::npos) {
2938  // no disqualifying text
2939  } else if (prot_name.length() - pos < 5) {
2940  // no disqualifying text
2941  } else if (NStr::EqualCase(prot_name, pos, 4, "[NAD")) {
2942  report_name = false;
2943  }
2944  if (!m_Imp.IsEmbl() && !m_Imp.IsTPE()) {
2945  if (report_name) {
2947  "Protein name ends with bracket and may contain organism name");
2948  }
2949  }
2950  }
2951  if (NStr::StartsWith(prot_name, "hypothetical protein XP_") && m_LocationBioseq) {
2952  for (auto id_it : m_LocationBioseq.GetCompleteBioseq()->GetId()) {
      // substr(21) drops "hypothetical protein " (21 characters), leaving the
      // text that starts at "XP_" for comparison with the accession.
2953  if (id_it->IsOther()
2954  && id_it->GetOther().IsSetAccession()
2955  && !NStr::EqualNocase(id_it->GetOther().GetAccession(),
2956  prot_name.substr(21))) {
2958  "Hypothetical protein reference does not match accession");
2959  }
2960  }
2961  }
2962  if (!m_Imp.IsRefSeq() && NStr::FindNoCase(prot_name, "RefSeq") != string::npos) {
2963  PostErr(eDiag_Error, eErr_SEQ_FEAT_RefSeqInText, "Protein name contains 'RefSeq'");
2964  }
2965  if (m_Feat.IsSetComment() && NStr::EqualCase(m_Feat.GetComment(), prot_name)) {
2967  "Comment has same value as protein name");
2968  }
2969 
2970  if (s_StringHasPMID(prot_name)) {
2972  "Protein name has internal PMID");
2973  }
2974 
2975  if (m_Imp.DoRubiscoTest()) {
      // Candidate names contain both "ribulose" and "bisphosphate" (case-sensitive)
      // and neither "methyltransferase" nor "activase"; the three standard
      // spellings below are then accepted case-insensitively.
2976  if (NStr::FindCase(prot_name, "ribulose") != string::npos
2977  && NStr::FindCase(prot_name, "bisphosphate") != string::npos
2978  && NStr::FindCase(prot_name, "methyltransferase") == string::npos
2979  && NStr::FindCase(prot_name, "activase") == string::npos) {
2980  if (NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase")) {
2981  // allow standard name without large or small subunit designation - later need kingdom test
2982  } else if (!NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit")
2983  && !NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit")) {
2985  "Nonstandard ribulose bisphosphate protein name");
2986  }
2987  }
2988  }
2989 
2990 
2991 
2992  ValidateCharactersInField(prot_name, "Protein name");
2993  if (ContainsSgml(prot_name)) {
2995  "protein name " + prot_name + " has SGML");
2996  }
2997 
2998 }
2999 
3000 
// Compares the MolInfo completeness value with the 5'/3' partial flags of the
// protein feature location and reports when they disagree.
// NOTE(review): the listing's numbering skips 3001 (presumably the signature),
// 3011 (presumably the declaration of `prot`), 3014 (presumably the declaration
// of `mi_i`) and 3038 (presumably the start of the reporting call) -- confirm
// against the full file.
3002 {
3003  if (!m_LocationBioseq) {
3004  return;
3005  }
3006  const CBioseq& pbioseq = *(m_LocationBioseq.GetCompleteBioseq());
3007  // if there is a coding region for this bioseq, this type of error
3008  // will be handled there
3009  const CSeq_feat* cds = m_Imp.GetCDSGivenProduct(pbioseq);
3010  if (cds) return;
3012  if (! prot) return;
3013 
3015  if (! mi_i) return;
3016  const CMolInfo& mi = mi_i->GetMolinfo();
3017  if (! mi.IsSetCompleteness()) return;
3018  int completeness = mi.GetCompleteness();
3019 
3020  const CSeq_loc& prot_loc = prot->GetLocation();
3021  bool prot_partial5 = prot_loc.IsPartialStart(eExtreme_Biological);
3022  bool prot_partial3 = prot_loc.IsPartialStop(eExtreme_Biological);
3023 
      // Combinations accepted per completeness value:
      //   partial  -> at least one end partial
      //   no_left  -> 5' partial and 3' not partial
      //   no_right -> 3' partial and 5' not partial
      //   no_ends  -> both ends partial
      //   numerically outside partial..no_ends -> neither end partial
3024  bool conflict = false;
3025  if (completeness == CMolInfo::eCompleteness_partial && ((! prot_partial5) && (! prot_partial3))) {
3026  conflict = true;
3027  } else if (completeness == CMolInfo::eCompleteness_no_left && ((! prot_partial5) || prot_partial3)) {
3028  conflict = true;
3029  } else if (completeness == CMolInfo::eCompleteness_no_right && (prot_partial5 || (! prot_partial3))) {
3030  conflict = true;
3031  } else if (completeness == CMolInfo::eCompleteness_no_ends && ((! prot_partial5) || (! prot_partial3))) {
3032  conflict = true;
3033  } else if ((completeness < CMolInfo::eCompleteness_partial || completeness > CMolInfo::eCompleteness_no_ends) && (prot_partial5 || prot_partial3)) {
3034  conflict = true;
3035  }
3036 
3037  if (conflict) {
3039  "Molinfo completeness and protein feature partials conflict");
3040  }
3041 }
3042 
// Validation of an RNA feature: rRNA name characters/SGML, pseudo status (own or
// from a gene), product checks, missing names for rRNA/snRNA/scRNA/snoRNA, and
// unsupported "unknown" RNA type.
// NOTE(review): the listing's numbering skips 3043 (presumably the signature),
// 3045, 3049 (presumably the declaration and default of `rna_type`), 3059,
// 3065-3066, 3071 (presumably the declaration of `gene`), 3078, 3090 and 3098.
// The calls on those lines are not visible -- confirm against the full file.
3044 {
3046 
3047  const CRNA_ref& rna = m_Feat.GetData().GetRna();
3048 
3050  if (rna.IsSetType()) {
3051  rna_type = rna.GetType();
3052  }
3053 
3054  if (rna_type == CRNA_ref::eType_rRNA) {
3055  if (rna.CanGetExt() && rna.GetExt().IsName()) {
3056  const string& rna_name = rna.GetExt().GetName();
3057  ValidateCharactersInField (rna_name, "rRNA name");
3058  if (ContainsSgml(rna_name)) {
3060  "rRNA name " + rna_name + " has SGML");
3061  }
3062  }
3063  }
3064 
3067 
      // feat_pseudo: the feature itself is pseudo. pseudo: the feature or, failing
      // that, `gene` is pseudo. Both are passed on to the product check.
3068  bool feat_pseudo = s_IsPseudo(m_Feat);
3069  bool pseudo = feat_pseudo;
3070  if (!pseudo) {
3072  if (gene) {
3073  pseudo = s_IsPseudo(*gene);
3074  }
3075  }
3076 
      // NOTE(review): the statement inside this branch is not visible here.
3077  if (!pseudo) {
3079  }
3080 
3081  x_ValidateRnaProduct(feat_pseudo, pseudo);
3082 
      // These four RNA types need a non-blank name, unless pseudo.
3083  if (rna_type == CRNA_ref::eType_rRNA
3084  || rna_type == CRNA_ref::eType_snRNA
3085  || rna_type == CRNA_ref::eType_scRNA
3086  || rna_type == CRNA_ref::eType_snoRNA) {
3087  if (!rna.IsSetExt() || !rna.GetExt().IsName() || NStr::IsBlank(rna.GetExt().GetName())) {
3088  if (!pseudo) {
3089  string rna_typename = CRNA_ref::GetRnaTypeName(rna_type);
3091  rna_typename + " has no name");
3092  }
3093  }
3094  }
3095 
3096 
3097  if ( rna_type == CRNA_ref::eType_unknown ) {
3099  "RNA type 0 (unknown) not supported");
3100  }
3101 
3102 
3103 }
3104 
3105 
// Product-related checks for an RNA feature; does nothing when no product is set.
// @param feat_pseudo  passed by the visible caller as "the feature itself is pseudo"
// @param pseudo       passed by the visible caller as "feature or its gene is pseudo"
// NOTE(review): the listing's numbering skips 3112 (a call that is not visible
// here) and 3118, 3121 (presumably the starts of the reporting calls) -- confirm
// against the full file.
3106 void CRNAValidator::x_ValidateRnaProduct(bool feat_pseudo, bool pseudo)
3107 {
3108  if (!m_Feat.IsSetProduct()) {
3109  return;
3110  }
3111 
3113 
      // The pseudo-with-product messages are suppressed for RefSeq records and
      // when the exception text mentions "transcribed pseudogene".
3114  if ((!m_Feat.IsSetExcept_text()
3115  || NStr::FindNoCase (m_Feat.GetExcept_text(), "transcribed pseudogene") == string::npos)
3116  && !m_Imp.IsRefSeq()) {
3117  if (feat_pseudo) {
3119  "A pseudo RNA should not have a product");
3120  } else if (pseudo) {
3122  "An RNA overlapped by a pseudogene should not have a product");
3123  }
3124  }
3125 
3126 }
3127 
3128 
// Checks that the RNA type of the feature (mRNA, tRNA or rRNA) agrees with the
// MolInfo biomol of the product Bioseq. Other RNA types are not checked.
// NOTE(review): the listing's numbering skips 3129 (presumably the signature),
// 3134 (presumably the declaration of `di`) and 3168 (presumably the start of
// the reporting call) -- confirm against the full file.
3130 {
3131  if ( !m_Feat.GetData().GetRna().IsSetType() || !m_ProductBioseq ) {
3132  return;
3133  }
3135  if ( !di ) {
3136  return;
3137  }
3138  const CMolInfo& mol_info = di->GetMolinfo();
3139  if ( !mol_info.CanGetBiomol() ) {
3140  return;
3141  }
3142  int biomol = mol_info.GetBiomol();
3143 
      // A matching pair returns immediately; a known type with a non-matching
      // biomol breaks out of the switch and reaches the report after it.
3144  switch ( m_Feat.GetData().GetRna().GetType() ) {
3145 
3146  case CRNA_ref::eType_mRNA:
3147  if ( biomol == CMolInfo::eBiomol_mRNA ) {
3148  return;
3149  }
3150  break;
3151 
3152  case CRNA_ref::eType_tRNA:
3153  if ( biomol == CMolInfo::eBiomol_tRNA ) {
3154  return;
3155  }
3156  break;
3157 
3158  case CRNA_ref::eType_rRNA:
3159  if ( biomol == CMolInfo::eBiomol_rRNA ) {
3160  return;
3161  }
3162  break;
3163 
3164  default:
3165  return;
3166  }
3167 
3169  "Type of RNA does not match MolInfo of product Bioseq");
3170 }
3171 
3172 
// Returns true when the MolInfo found through `sd` has completeness partial,
// no_ends, no_left or no_right; returns false when there is no such descriptor,
// completeness is not set, or it has any other value.
// NOTE(review): the listing's numbering skips 3173 (presumably the signature)
// and 3175 (presumably the declaration of `sd`) -- confirm against the full file.
3174 {
3176  if ( !sd ) {
3177  return false;
3178  }
3179  const CMolInfo& molinfo = sd->GetMolinfo();
3180  if (!molinfo.IsSetCompleteness ()) {
3181  return false;
3182  }
3183  CMolInfo::TCompleteness completeness = molinfo.GetCompleteness();
3184  if (completeness == CMolInfo::eCompleteness_partial
3185  || completeness == CMolInfo::eCompleteness_no_ends
3186  || completeness == CMolInfo::eCompleteness_no_left
3187  || completeness == CMolInfo::eCompleteness_no_right) {
3188  return true;
3189  } else {
3190  return false;
3191  }
3192 }
3193 
3194 
// Checks performed when the RNA extension is a tRNA structure: the feature's RNA
// type, then (if an anticodon is available) its length, its containment in the
// feature location, and the per-interval anticodon checks.
// NOTE(review): the listing's numbering skips 3195 (presumably the signature),
// 3201-3202 (the rest of the condition and presumably the start of the reporting
// call), 3211, 3217 (the last argument of Compare), 3219 and 3224 -- confirm
// against the full file.
3196 {
3197  if (!m_Feat.GetData().GetRna().IsSetExt() || !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3198  return;
3199  }
3200  if ( !m_Feat.GetData().GetRna().IsSetType() ||
3203  "tRNA data structure on non-tRNA feature");
3204  }
3205 
3206  const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt ().GetTRNA ();
3207  if ( trna.CanGetAnticodon () ) {
3208  const CSeq_loc& anticodon = trna.GetAnticodon();
3209  size_t anticodon_len = GetLength(anticodon, &m_Scope);
3210  if ( anticodon_len != 3 ) {
3212  "Anticodon is not 3 bases in length");
3213  }
      // The anticodon must be contained in, or identical to, the feature location.
3214  ECompare comp = sequence::Compare(anticodon,
3215  m_Feat.GetLocation(),
3216  &m_Scope,
3218  if ( comp != eContained && comp != eSame ) {
3220  "Anticodon location not in tRNA");
3221  }
3222  x_ValidateAnticodon(anticodon);
3223  }
3225 
3226 }
3227 
3228 
// tRNA-specific checks: unparsed anticodon/product qualifiers, a name-type or
// missing extension, and adjacent location intervals less than 100 apart on a
// non-pseudo feature whose source lineage starts with "Bacteria; ".
// NOTE(review): the listing's numbering skips 3229 (presumably the signature),
// 3232 (the rest of the first condition), 3240, 3246, 3256, 3259, 3263, 3305
// (presumably the declaration of `gene`), 3318 (presumably the declaration of
// `bsh`), 3320 (presumably the declaration of `sd`) and 3326 -- confirm against
// the full file.
3230 {
3231  if (!m_Feat.GetData().GetRna().IsSetType() ||
3233  return;
3234  }
3235  const CRNA_ref& rna = m_Feat.GetData().GetRna();
3236 
3237  // check for unparsed qualifiers
      // NOTE(review): no IsSetQual()/IsSetVal() guards are visible before
      // GetQual()/GetVal() here -- confirm these accessors are safe when unset.
3238  for (auto& gbqual : m_Feat.GetQual()) {
3239  if ( NStr::CompareNocase(gbqual->GetQual (), "anticodon") == 0 ) {
3241  "Unparsed anticodon qualifier in tRNA");
3242  } else if (NStr::CompareNocase (gbqual->GetQual (), "product") == 0 ) {
      // Three product values are tolerated as unparsed qualifiers.
3243  if (NStr::CompareNocase (gbqual->GetVal (), "tRNA-fMet") != 0 &&
3244  NStr::CompareNocase (gbqual->GetVal (), "tRNA-iMet") != 0 &&
3245  NStr::CompareNocase (gbqual->GetVal (), "tRNA-Ile2") != 0) {
3247  "Unparsed product qualifier in tRNA");
3248  }
3249  }
3250  }
3251 
3252 
3253  /* tRNA with string extension */
3254  if ( rna.IsSetExt() &&
3255  rna.GetExt().Which () == CRNA_ref::C_Ext::e_Name ) {
3257  "Unparsed product qualifier in tRNA");
3258  } else if (!rna.IsSetExt() || rna.GetExt().Which() == CRNA_ref::C_Ext::e_not_set ) {
3260  "Missing encoded amino acid qualifier in tRNA");
3261  }
3262 
3264 
      // Walk consecutive location intervals, comparing each with the previous one.
      // NOTE(review): the first interval is read before `li` is tested for
      // validity -- confirm the location can never be empty here.
3265  bool isLessThan100 = false;
3266  const CSeq_loc& loc = m_Feat.GetLocation();
3267  CSeq_loc_CI li(loc);
3268 
3269  TSeqPos last_start = li.GetRange().GetFrom();
3270  TSeqPos last_stop = li.GetRange().GetTo();
3271  CRef<CSeq_id> last_id(new CSeq_id());
3272  last_id->Assign(li.GetSeq_id());
3273 
3274  ++li;
3275  while (li) {
3276  TSeqPos this_start = li.GetRange().GetFrom();
3277  TSeqPos this_stop = li.GetRange().GetTo();
      // The distance is taken in both orientations (start vs. previous stop, and
      // stop vs. previous start).
3278  if (abs ((int)this_start - (int)last_stop) < 100 || abs ((int)this_stop - (int)last_start) < 100) {
3279  if (li.GetSeq_id().Equals(*last_id)) {
3280  // definitely same bioseq, definitely report
3281  isLessThan100 = true;
3282  break;
3283  } else {
3284  // only report if definitely on same bioseq
3285  CBioseq_Handle last_bsh = m_Scope.GetBioseqHandle(*last_id);
3286  if (last_bsh) {
3287  for (auto id_it : last_bsh.GetId()) {
3288  if (id_it.GetSeqId()->Equals(li.GetSeq_id())) {
3289  isLessThan100 = true;
      // This break leaves only the inner id loop; the interval walk continues.
3290  break;
3291  }
3292  }
3293  }
3294  }
3295  }
3296  last_start = this_start;
3297  last_stop = this_stop;
3298  last_id->Assign(li.GetSeq_id());
3299  ++li;
3300  }
      // Pseudo status: the feature's own flag first; otherwise, when there is no
      // gene xref, the `gene` feature's flag; finally the gene-ref's pseudo flag.
3301  bool pseudo = m_Feat.IsSetPseudo() && m_Feat.GetPseudo() ;
3302  if ( !pseudo ) {
3303  const CGene_ref* grp = m_Feat.GetGeneXref();
3304  if ( grp == NULL ) {
3306  if (gene) {
3307  pseudo = gene->IsSetPseudo() && gene->GetPseudo();
3308  if ( !pseudo ) {
3309  grp = &(gene->GetData().GetGene());
3310  }
3311  }
3312  }
3313  if ( !pseudo && grp != NULL ) {
3314  pseudo = grp->GetPseudo();
3315  }
3316  }
3317  if (isLessThan100 && ! pseudo) {
3319  if (bsh) {
3321  if (sd) {
3322  const CSeqdesc::TSource& source = sd->GetSource();
3323  if (source.IsSetLineage()) {
3324  string lineage = source.GetLineage();
3325  if (NStr::StartsWith(lineage, "Bacteria; ")) {
3327  "tRNA intron in bacteria is less than 100 bp");
3328  }
3329  }
3330  }
3331  }
3332  }
3333 }
3334 
3335 
// Interval-level checks of a tRNA anticodon location: each interval or point in
// range, interval order, adjacency, duplicates, strand consistency between
// intervals, and agreement with the strand of the feature location.
// @param anticodon  the anticodon location to check
// NOTE(review): the listing's numbering skips 3357, 3385, 3401, 3408, 3411, 3417
// (presumably the `if` whose closing brace is on 3421), 3426, 3430 and 3434;
// these presumably start the reporting calls for the messages below -- confirm
// against the full file.
3336 void CRNAValidator::x_ValidateAnticodon(const CSeq_loc& anticodon)
3337 {
3338  bool ordered = true;
3339  bool adjacent = false;
3340  bool unmarked_strand = false;
3341  bool mixed_strand = false;
3342 
      // `prev` holds the previously accepted piece; it is default-constructed and
      // so fails the `prev &&` test on the first pass.
3343  CSeq_loc_CI prev;
3344  for (CSeq_loc_CI curr(anticodon); curr; ++curr) {
3345  bool chk = true;
      // Only interval and point pieces are examined; anything else is skipped
      // without becoming `prev`.
3346  if (curr.GetEmbeddingSeq_loc().IsInt()) {
3347  chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetInt(), &m_Scope);
3348  } else if (curr.GetEmbeddingSeq_loc().IsPnt()) {
3349  chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetPnt(), &m_Scope);
3350  } else {
3351  continue;
3352  }
3353 
3354  if ( !chk ) {
3355  string lbl;
3356  curr.GetEmbeddingSeq_loc().GetLabel(&lbl);
3358  "Anticodon location [" + lbl + "] out of range");
3359  }
3360 
3361  if ( prev && curr &&
3362  IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), &m_Scope) ) {
3363  CSeq_loc_CI::TRange prev_range = prev.GetRange();
3364  CSeq_loc_CI::TRange curr_range = curr.GetRange();
      // Order and adjacency are evaluated only while no out-of-order pair has
      // been seen; the expected direction is reversed on the minus strand.
3365  if ( ordered ) {
3366  if ( curr.GetStrand() == eNa_strand_minus ) {
3367  if (prev_range.GetTo() < curr_range.GetTo()) {
3368  ordered = false;
3369  }
3370  if (curr_range.GetTo() + 1 == prev_range.GetFrom()) {
3371  adjacent = true;
3372  }
3373  } else {
3374  if (prev_range.GetTo() > curr_range.GetTo()) {
3375  ordered = false;
3376  }
3377  if (prev_range.GetTo() + 1 == curr_range.GetFrom()) {
3378  adjacent = true;
3379  }
3380  }
3381  }
3382  ENa_strand curr_strand = curr.GetStrand();
3383  ENa_strand prev_strand = prev.GetStrand();
3384  if ( curr_range == prev_range && curr_strand == prev_strand ) {
3386  "Duplicate anticodon exons in location");
3387  }
      // A plus/unknown pair counts as "unmarked"; any other difference is "mixed".
3388  if ( curr_strand != prev_strand ) {
3389  if (curr_strand == eNa_strand_plus && prev_strand == eNa_strand_unknown) {
3390  unmarked_strand = true;
3391  } else if (curr_strand == eNa_strand_unknown && prev_strand == eNa_strand_plus) {
3392  unmarked_strand = true;
3393  } else {
3394  mixed_strand = true;
3395  }
3396  }
3397  }
3398  prev = curr;
3399  }
3400  if (adjacent) {
3402  "Adjacent intervals in Anticodon");
3403  }
3404 
      // Strand agreement with the feature: only minus vs. not-minus is compared,
      // and both mismatch directions use the same message text.
3405  ENa_strand loc_strand = m_Feat.GetLocation().GetStrand();
3406  ENa_strand ac_strand = anticodon.GetStrand();
3407  if (loc_strand == eNa_strand_minus && ac_strand != eNa_strand_minus) {
3409  "Anticodon strand and tRNA strand do not match.");
3410  } else if (loc_strand != eNa_strand_minus && ac_strand == eNa_strand_minus) {
3412  "Anticodon strand and tRNA strand do not match.");
3413  }
3414 
3415  // trans splicing exception turns off both mixed_strand and out_of_order messages
3416  bool trans_splice = false;
3418  if (NStr::FindNoCase(m_Feat.GetExcept_text(), "trans-splicing") != NPOS) {
3419  trans_splice = true;
3420  }
3421  }
3422  if (!trans_splice) {
3423  string loc_lbl;
3424  anticodon.GetLabel(&loc_lbl);
3425  if (mixed_strand) {
3427  "Mixed strands in Anticodon [" + loc_lbl + "]");
3428  }
3429  if (unmarked_strand) {
3431  "Mixed plus and unknown strands in Anticodon [" + loc_lbl + "]");
3432  }
3433  if (!ordered) {
3435  "Intervals out of order in Anticodon [" + loc_lbl + "]");
3436  }
3437  }
3438 }
3439 
3440 
// Codes accepted as a tRNA amino acid: '*' followed by the uppercase letters
// 'A' through 'Z'. Stored as int; in an ASCII execution character set the
// character literals below have the values 42 and 65..90.
int s_LegalNcbieaaValues[] = {
    '*',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
};
3444 
// Display names for amino acids, looked up by position (28 entries, indices
// 0..27). The index of the first entry on each row is noted on the right.
static const char* kAANames[] = {
    "---",   "Ala",   "Asx",   "Cys",     //  0
    "Asp",   "Glu",   "Phe",   "Gly",     //  4
    "His",   "Ile",   "Lys",   "Leu",     //  8
    "Met",   "Asn",   "Pro",   "Gln",     // 12
    "Arg",   "Ser",   "Thr",   "Val",     // 16
    "Trp",   "OTHER", "Tyr",   "Glx",     // 20
    "Sec",   "TERM",  "Pyl",   "Xle"      // 24
};
3450 
3451 
// Returns the display name from kAANames for an amino acid code, or "OTHER" when
// the code is outside the table or an exception is thrown along the way.
// @param aa        amino acid code; used as an index into kAANames
// @param is_ascii  when true, the (not visible) lines inside the branch below run
//                  first -- presumably they convert `aa` to a table index
// NOTE(review): the listing's numbering skips 3456-3457, the body of the
// `is_ascii` branch -- confirm against the full file.
3452 const char* GetAAName(unsigned char aa, bool is_ascii)
3453 {
3454  try {
3455  if (is_ascii) {
3458  }
      // Bounds-checked lookup: anything past the end of the table is "OTHER".
3459  return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
3460  } catch (const CException& ) {
3461  return "OTHER";
3462  } catch (const std::exception& ) {
3463  return "OTHER";
3464  }
3465 }
3466 
3467 
3468 static string GetGeneticCodeName (int gcode)
3469 {
3470  const CGenetic_code_table& code_table = CGen_code_table::GetCodeTable();
3471  const list<CRef<CGenetic_code> >& codes = code_table.Get();
3472 
3473  for ( list<CRef<CGenetic_code> >::const_iterator code_it = codes.begin(), code_it_end = codes.end(); code_it != code_it_end; ++code_it ) {
3474  if ((*code_it)->GetId() == gcode) {
3475  return (*code_it)->GetName();
3476  }
3477  }
3478  return "unknown";
3479 }
3480 
3481 
3483 {
3484  if (!m_Feat.IsSetData() || !m_Feat.GetData().IsRna() ||
3485  !m_Feat.GetData().GetRna().IsSetExt() ||
3486  !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3487  return;
3488  }
3489  const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt().GetTRNA();
3490 
3491  if (!trna.IsSetAa()) {
3492  PostErr (eDiag_Error, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3493  return;
3494  }
3495 
3496  unsigned char aa = 0, orig_aa;
3497  vector<char> seqData;
3498  string str;
3499 
3500  switch (trna.GetAa().Which()) {
3502  str = trna.GetAa().GetIupacaa();
3504  aa = seqData[0];
3505  break;
3507  str = trna.GetAa().GetNcbi8aa();
3509  aa = seqData[0];
3510  break;
3512  str = trna.GetAa().GetNcbi8aa();
3514  aa = seqData[0];
3515  break;
3517  seqData.push_back(trna.GetAa().GetNcbieaa());
3518  aa = seqData[0];
3519  break;
3520  default:
3521  NCBI_THROW (CCoreException, eCore, "Unrecognized tRNA aa coding");
3522  break;
3523  }
3524 
3525  // make sure the amino acid is valid
3526  bool found = false;
3527  for ( unsigned int i = 0; i < sizeof (s_LegalNcbieaaValues) / sizeof (int); ++i ) {
3528  if ( aa == s_LegalNcbieaaValues[i] ) {
3529  found = true;
3530  break;
3531  }
3532  }
3533  orig_aa = aa;
3534  if ( !found ) {
3535  aa = ' ';
3536  }
3537 
3538  if (m_Feat.GetData().GetRna().IsSetType() &&
3540  bool mustbemethionine = false;
3541  for (auto gbqual : m_Feat.GetQual()) {
3542  if (NStr::CompareNocase(gbqual->GetQual(), "product") == 0 &&
3543  (NStr::CompareNocase(gbqual->GetVal(), "tRNA-fMet") == 0 ||
3544  NStr::CompareNocase(gbqual->GetVal(), "tRNA-iMet") == 0)) {
3545  mustbemethionine = true;
3546  break;
3547  }
3548  }
3549  if (mustbemethionine) {
3550  if (aa != 'M') {
3551  string aanm = GetAAName(aa, true);
3553  "Initiation tRNA claims to be tRNA-" + aanm +
3554  ", but should be tRNA-Met");
3555  }
3556  }
3557  }
3558 
3559  // Retrive the Genetic code id for the tRNA
3560  int gcode = 1;
3561  if ( m_LocationBioseq ) {
3562  // need only the closest biosoure.
3564  if ( diter ) {
3565  gcode = diter->GetSource().GetGenCode();
3566  }
3567  }
3568 
3569  const string& ncbieaa = CGen_code_table::GetNcbieaa(gcode);
3570  if ( ncbieaa.length() != 64 ) {
3571  return;
3572  }
3573 
3574  string codename = GetGeneticCodeName (gcode);
3575  char buf[2];
3576  buf[0] = aa;
3577  buf[1] = 0;
3578  string aaname = buf;
3579  aaname += "/";
3580  aaname += GetAAName (aa, true);
3581 
3582  EDiagSev sev = (aa == 'U' || aa == 'O') ? eDiag_Warning : eDiag_Error;
3583 
3584  bool modified_codon_recognition = false;
3585  bool rna_editing = false;
3586  if ( m_Feat.IsSetExcept_text() ) {
3587  string excpt_text = m_Feat.GetExcept_text();
3588  if ( NStr::FindNoCase(excpt_text, "modified codon recognition") != NPOS ) {
3589  modified_codon_recognition = true;
3590  }
3591  if ( NStr::FindNoCase(excpt_text, "RNA editing") != NPOS ) {
3592  rna_editing = true;
3593  }
3594  }
3595 
3596  vector<string> recognized_codon_values;
3597  vector<unsigned char> recognized_taa_values;
3598 
3599  ITERATE( CTrna_ext::TCodon, iter, trna.GetCodon() ) {
3600  if (*iter == 255) continue;
3601  // test that codon value is in range 0 - 63
3602  if ( *iter > 63 ) {
3604  "tRNA codon value " + NStr::IntToString(*iter) +
3605  " is greater than maximum 63");
3606  continue;
3607  } else if (*iter < 0) {
3609  "tRNA codon value " + NStr::IntToString(*iter) +
3610  " is less than 0");
3611  continue;
3612  }
3613 
3614  if ( !modified_codon_recognition && !rna_editing ) {
3615  unsigned char taa = ncbieaa[*iter];
3616  string codon = CGen_code_table::IndexToCodon(*iter);
3617  recognized_codon_values.push_back (codon);
3618  recognized_taa_values.push_back (taa);
3619 
3620  if ( taa != aa ) {
3621  if ( (aa == 'U') && (taa == '*') && (*iter == 14) ) {
3622  // selenocysteine normally uses TGA (14), so ignore without requiring exception in record
3623  // TAG (11) is used for pyrrolysine in archaebacteria
3624  // TAA (10) is not yet known to be used for an exceptional amino acid
3625  } else {
3626  NStr::ReplaceInPlace (codon, "T", "U");
3627 
3629  "Codon recognized by tRNA (" + codon + ") does not match amino acid ("
3630  + aaname + ") specified by genetic code ("
3631  + NStr::IntToString (gcode) + "/" + codename + ")");
3632  }
3633  }
3634  }
3635  }
3636 
3637  // see if anticodon is compatible with codons recognized and amino acid
3638  string anticodon = "?";
3639  vector<string> codon_values;
3640  vector<unsigned char> taa_values;
3641 
3642  if (trna.IsSetAnticodon() && GetLength (trna.GetAnticodon(), &m_Scope) == 3) {
3643  try {
3644  anticodon = GetSequenceStringFromLoc(trna.GetAnticodon(), m_Scope);
3645  // get reverse complement sequence for location
3646  CRef<CSeq_loc> codon_loc(SeqLocRevCmpl(trna.GetAnticodon(), &m_Scope));
3647  string codon = GetSequenceStringFromLoc(*codon_loc, m_Scope);
3648  if (codon.length() > 3) {
3649  codon = codon.substr (0, 3);
3650  }
3651 
3652  // expand wobble base to known binding partners
3653  string wobble;
3654 
3655  char ch = anticodon.c_str()[0];
3656  switch (ch) {
3657  case 'A' :
3658  wobble = "ACT";
3659  break;
3660  case 'C' :
3661  wobble = "G";
3662  break;
3663  case 'G' :
3664  wobble = "CT";
3665  break;
3666  case 'T' :
3667  wobble = "AG";
3668  break;
3669  default :
3670  break;
3671  }
3672  if (!NStr::IsBlank(wobble)) {
3673  string::iterator str_it = wobble.begin();
3674  while (str_it != wobble.end()) {
3675  codon[2] = *str_it;
3676  int index = CGen_code_table::CodonToIndex (codon);
3677  if (index < 64 && index > -1) {
3678  unsigned char taa = ncbieaa[index];
3679  taa_values.push_back(taa);
3680  codon_values.push_back(codon);
3681  }
3682  ++str_it;
3683  }
3684  }
3685  NStr::ReplaceInPlace (anticodon, "T", "U");
3686  if (anticodon.length() > 3) {
3687  anticodon = anticodon.substr(0, 3);
3688  }
3689  } catch (const CException& ) {
3690  } catch (const std::exception& ) {
3691  }
3692 
3693  if (codon_values.size() > 0) {
3694  bool ok = false;
3695  // check that codons predicted from anticodon can transfer indicated amino acid
3696  for (size_t i = 0; i < codon_values.size(); i++) {
3697  if (!NStr::IsBlank (codon_values[i]) && aa == taa_values[i]) {
3698  ok = true;
3699  }
3700  }
3701  if (!ok) {
3702  if (aa == 'U' && NStr::Equal (anticodon, "UCA")) {
3703  // ignore TGA codon for selenocysteine
3704  } else if (aa == 'O' && NStr::Equal (anticodon, "CUA")) {
3705  // ignore TAG codon for pyrrolysine
3706  } else if (aa == 'I' && NStr::Equal (anticodon, "CAU")) {
3707  // ignore ATG predicted codon for Ile2
3708  } else if (!m_Feat.IsSetExcept_text()
3709  || (NStr::FindNoCase(m_Feat.GetExcept_text(), "modified codon recognition") == string::npos
3710  &&NStr::FindNoCase(m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3712  "Codons predicted from anticodon (" + anticodon
3713  + ") cannot produce amino acid (" + aaname + ")");
3714  }
3715  }
3716 
3717  // check that codons recognized match codons predicted from anticodon
3718  if (recognized_codon_values.size() > 0) {
3719  ok = false;
3720  for (size_t i = 0; i < codon_values.size() && !ok; i++) {
3721  for (size_t j = 0; j < recognized_codon_values.size() && !ok; j++) {
3722  if (NStr::Equal (codon_values[i], recognized_codon_values[j])) {
3723  ok = true;
3724  } else if ( NStr::Equal (codon_values[i], "ATG") && aa == 'I') {
3725  // allow ATG recognized codon (pre-RNA-editing) for Ile2
3726  ok = true;
3727  }
3728  }
3729  }
3730  if (!ok
3731  && (!m_Feat.IsSetExcept_text()
3732  || NStr::FindNoCase (m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3734  "Codon recognized cannot be produced from anticodon ("
3735  + anticodon + ")");
3736  }
3737  }
3738  }
3739  }
3740 
3741  if (!m_Feat.IsSetPseudo() || !m_Feat.GetPseudo()) {
3742  if (orig_aa == 0 || orig_aa == 255) {
3743  PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3744  } else {
3745  // verify that legal amino acid is indicated
3746  unsigned int idx;
3747  if (aa != '*') {
3748  idx = aa - 64;
3749  } else {
3750  idx = 25;
3751  }
3752  if (idx == 0 || idx >= 28) {
3753  PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
3754  }
3755  }
3756  }
3757 }
3758 
3759 
3761 {
3762  if (!m_Feat.GetData().GetRna().IsSetType() ||
3764  return;
3765  }
3766  TFeatScores scores;
3771  scores, m_Scope);
3772  bool found_bad = false;
3773  for (auto it : scores) {
3774  CRef<CSeq_loc> intersection = it.second->GetLocation().Intersect(m_Feat.GetLocation(),
3775  0 /* flags*/,
3776  nullptr /* synonym mapper */);
3777  if (intersection) {
3778  TSeqPos length = sequence::GetLength(*intersection, &m_Scope);
3779  if (length >= 5) {
3780  found_bad = true;
3781  break;
3782  }
3783  }
3784  }
3785  if (found_bad) {
3787  "tRNA-rRNA overlap");
3788  }
3792  if (cds) {
3794  "tRNA overlaps CDS");
3795  }
3796 }
3797 
3798 
3800 {
3801  size_t mismatches = 0;
3802  size_t problems = GetMRNATranslationProblems
3803  (m_Feat, mismatches, m_Imp.IgnoreExceptions(),
3806  m_Imp.IsGenomic(), &m_Scope);
3807  x_ReportRNATranslationProblems(problems, mismatches);
3808 }
3809 
3810 
// Reports the mRNA transcription problems flagged in `problems`.
//   problems   - bit mask tested against the eMRNAProblem_* flags below
//   mismatches - mismatch count quoted in the mismatch-related messages
// NOTE(review): in this listing several statements are only partly visible
// (only the message arguments of the reporting calls appear, and feat_len is
// used without a visible declaration) - consult the full source before editing.
3811 void CRNAValidator::x_ReportRNATranslationProblems(size_t problems, size_t mismatches)
3812 {
 // Transcription itself failed.
3813  if (problems & eMRNAProblem_TransFail) {
3815  "Unable to transcribe mRNA");
3816  }
 // Product could not be fetched; its id is rendered as a FASTA-style string
 // for the message.
3817  if (problems & eMRNAProblem_UnableToFetch) {
3818  const CSeq_id& product_id = GetId(m_Feat.GetProduct(), &m_Scope);
3819  string label = product_id.AsFastaString();
3821  "Unable to fetch mRNA transcript '" + label + "'");
3822  }
3823 
 // Start from the validator-wide RefSeq-conventions setting; when a location
 // bioseq is available, an element answering IsOther() also sets the flag.
 // NOTE(review): the loop header is not visible here - presumably it iterates
 // the ids of m_LocationBioseq; confirm.
3824  bool is_refseq = m_Imp.IsRefSeqConventions();
3825  if (m_LocationBioseq) {
3827  if ((*it)->IsOther()) {
3828  is_refseq = true;
3829  break;
3830  }
3831  }
3832  }
3833 
3835 
 // farstr is spliced into the messages below; it stays empty unless the
 // product is far.  sev starts as Error and may be lowered to Warning.
3836  string farstr;
3837  EDiagSev sev = eDiag_Error;
3838 
3839  // if not local bioseq product, lower severity (with the exception of Refseq)
3840  if (m_ProductIsFar && !is_refseq) {
3841  sev = eDiag_Warning;
3842  }
 // Far products are marked "(far) " in messages.  A set partial flag, combined
 // with further conditions not visible in this listing, also lowers severity.
3843  if (m_ProductIsFar) {
3844  farstr = "(far) ";
3845  if (m_Feat.IsSetPartial()
3848  sev = eDiag_Warning;
3849  }
3850  }
3851 
 // Transcript shorter than product, tail below the 95% polyA threshold.
3852  if (problems & eMRNAProblem_TranscriptLenLess) {
3854  "Transcript length [" + NStr::SizetToString(feat_len) +
3855  "] less than " + farstr + "product length [" +
3856  NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], and tail < 95% polyA");
3857  }
3858 
 // Transcript shorter than product, tail entirely polyA.
3859  if (problems & eMRNAProblem_PolyATail100) {
3861  "Transcript length [" + NStr::SizetToString(feat_len)
3862  + "] less than " + farstr + "product length ["
3863  + NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail is 100% polyA");
3864  }
 // Transcript shorter than product, tail at least 95% polyA.
3865  if (problems & eMRNAProblem_PolyATail95) {
3867  "Transcript length [" + NStr::SizetToString(feat_len) +
3868  "] less than " + farstr + "product length [" +
3869  NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail >= 95% polyA");
3870  }
 // Transcript longer than product.
 // NOTE(review): this message formats feat_len with IntToString while the
 // other messages use SizetToString - confirm feat_len's declared type.
3871  if (problems & eMRNAProblem_TranscriptLenMore) {
3873  "Transcript length [" + NStr::IntToString(feat_len) + "] " +
3874  "greater than " + farstr + "product length [" +
3876  }
 // Mismatches are only reported when the count is non-zero.
3877  if ((problems & eMRNAProblem_Mismatch) && mismatches > 0) {
3879  "There are " + NStr::SizetToString(mismatches) +
3880  " mismatches out of " + NStr::SizetToString(feat_len) +
3881  " bases between the transcript and " + farstr + "product sequence");
3882  }
 // An exception is present although the transcription test passes.
3883  if (problems & eMRNAProblem_UnnecessaryException) {
3885  "mRNA has exception but passes transcription test");
3886  }
 // Unclassified exception where the only difference is mismatches; the base
 // count quoted is the shorter of feature length and product length.
3887  if (problems & eMRNAProblem_ErroneousException) {
3888  size_t total = min(feat_len, m_ProductBioseq.GetInst_Length());
3890  "mRNA has unclassified exception but only difference is " + NStr::SizetToString(mismatches)
3891  + " mismatches out of " + NStr::SizetToString(total) + " bases");
3892  }
 // The "transcribed product replaced" exception is in effect.
3893  if (problems & eMRNAProblem_ProductReplaced) {
3895  "mRNA has transcribed product replaced exception");
3896  }
3897 }
3898 
3899 
3901 CRNAValidator(feat, scope, imp)
3902 {
3904  if (m_Gene) {
3906  } else {
3907  m_GeneIsPseudo = false;
3908  }
3910 }
3911 
3912 
3914 {
3916 
3919 
3920  x_ValidateMrna();
3921 
3922  if (!m_GeneIsPseudo && !m_FeatIsPseudo) {
3924  }
3926 }
3927 
3928 
3930 {
3931  bool pseudo = m_GeneIsPseudo;
3932  if (!pseudo) {
3933  pseudo = s_IsPseudo(m_Feat);
3934  }
3935  ValidateSplice(pseudo, false);
3936 
3937  const CRNA_ref& rna = m_Feat.GetData().GetRna();
3938 
3939  if (m_Feat.IsSetQual()) {
3940  for (auto it : m_Feat.GetQual()) {
3941  const CGb_qual& qual = *it;
3942  if (qual.CanGetQual()) {
3943  const string& key = qual.GetQual();
3944  if (NStr::EqualNocase(key, "protein_id")) {
3946  "protein_id should not be a gbqual on an mRNA feature");
3947  }
3948  else if (NStr::EqualNocase(key, "transcript_id")) {
3950  "transcript_id should not be a gbqual on an mRNA feature");
3951  }
3952  }
3953  }
3954  }
3955 
3956  if (rna.IsSetExt() && rna.GetExt().IsName()) {
3957  const string& rna_name = rna.GetExt().GetName();
3958  if (NStr::StartsWith(rna_name, "transfer RNA ") &&
3959  (!NStr::EqualNocase(rna_name, "transfer RNA nucleotidyltransferase")) &&
3960  (!NStr::EqualNocase(rna_name, "transfer RNA methyltransferase"))) {
3962  "mRNA feature product indicates it should be a tRNA feature");
3963  }
3964  ValidateCharactersInField(rna_name, "mRNA name");
3965  if (ContainsSgml(rna_name)) {
3967  "mRNA name " + rna_name + " has SGML");
3968  }
3969  }
3970 }
3971 
3972 
3974 {
3975  if (!m_Feat.IsSetProduct()) {
3976  return;
3977  }
3978  if ( !m_ProductBioseq) {
3979  if (m_LocationBioseq) {
3981  if (seh.IsSet() && seh.GetSet().IsSetClass()
3983  || seh.GetSet().GetClass() == CBioseq_set::eClass_other)) {
3985  "Product Bioseq of mRNA feature is not "
3986  "packaged in the record");
3987  }
3988  }
3989  } else {
3990 
3991  //CConstRef<CSeq_feat> mrna = m_Imp.GetmRNAGivenProduct (*(m_ProductBioseq.GetCompleteBioseq()));
3993  if (mrna && mrna.GetPointer() != &m_Feat) {
3995  "Identical transcript IDs found on multiple mRNAs");
3996  }
3997  }
3998 }
3999 
4000 
4001 static bool s_EqualGene_ref(const CGene_ref& genomic, const CGene_ref& mrna)
4002 {
4003  bool locus = (!genomic.CanGetLocus() && !mrna.CanGetLocus()) ||
4004  (genomic.CanGetLocus() && mrna.CanGetLocus() &&
4005  genomic.GetLocus() == mrna.GetLocus());
4006  bool allele = (!genomic.CanGetAllele() && !mrna.CanGetAllele()) ||
4007  (genomic.CanGetAllele() && mrna.CanGetAllele() &&
4008  genomic.GetAllele() == mrna.GetAllele());
4009  bool desc = (!genomic.CanGetDesc() && !mrna.CanGetDesc()) ||
4010  (genomic.CanGetDesc() && mrna.CanGetDesc() &&
4011  genomic.GetDesc() == mrna.GetDesc());
4012  bool locus_tag = (!genomic.CanGetLocus_tag() && !mrna.CanGetLocus_tag()) ||
4013  (genomic.CanGetLocus_tag() && mrna.CanGetLocus_tag() &&
4014  genomic.GetLocus_tag() == mrna.GetLocus_tag());
4015 
4016  return locus && allele && desc && locus_tag;
4017 }
4018 
4019 
4020 // check that there is no conflict between the gene on the genomic
4021 // and the gene on the mrna.
4023 {
4024  if (!m_ProductBioseq) {
4025  return;
4026  }
4027  const CGene_ref* genomicgrp = nullptr;
4028  if (m_Gene) {
4029  genomicgrp = &(m_Gene->GetData().GetGene());
4030  } else {
4031  genomicgrp = m_Feat.GetGeneXref();
4032  }
4033  if (!genomicgrp) {
4034  return;
4035  }
4037  if ( mrna_gene ) {
4038  const CGene_ref& mrnagrp = mrna_gene->GetData().GetGene();
4039  if ( !s_EqualGene_ref(*genomicgrp, mrnagrp) ) {
4041  "Gene on mRNA bioseq does not match gene on genomic bioseq",
4042  mrna_gene->GetOriginalFeature());
4043  }
4044  }
4045 
4046 }
4047 
4048 
4050 {
4053 }
4054 
4055 
4057 {
4059  const CBioSource& bsrc = m_Feat.GetData().GetBiosrc();
4060  if ( bsrc.IsSetIs_focus() ) {
4062  "Focus must be on BioSource descriptor, not BioSource feature.");
4063  }
4064 
4066 
4068  if ( !dbsrc_i ) {
4069  return;
4070  }
4071 
4072  const COrg_ref& org = bsrc.GetOrg();
4073  const CBioSource& dbsrc = dbsrc_i->GetSource();
4074  const COrg_ref& dorg = dbsrc.GetOrg();
4075 
4076  if ( org.CanGetTaxname() && !org.GetTaxname().empty() &&
4077  dorg.CanGetTaxname() ) {
4078  string taxname = org.GetTaxname();
4079  string dtaxname = dorg.GetTaxname();
4080  if ( NStr::CompareNocase(taxname, dtaxname) != 0 ) {
4081  if ( !dbsrc.IsSetIs_focus() && !m_Imp.IsTransgenic(dbsrc) ) {
4083  "BioSource descriptor must have focus or transgenic "
4084  "when BioSource feature with different taxname is "
4085  "present.");
4086  }
4087  }
4088  }
4089 }
4090 
4091 
4093 {
4096  if ( range.GetFrom() != range.GetTo() ) {
4097  EDiagSev sev = eDiag_Warning;
4098  if (m_Imp.IsRefSeq()) {
4099  sev = eDiag_Error;
4100  }
4102  "PolyA_site should be a single point");
4103  }
4104 
4105 }
4106 
4107 
4109 {
4111  if ( range.GetFrom() == range.GetTo() ) {
4112  EDiagSev sev = eDiag_Warning;
4113  if (m_Imp.IsRefSeq()) {
4114  sev = eDiag_Error;
4115  }
4116  PostErr (sev, eErr_SEQ_FEAT_PolyAsignalNotRange, "PolyA_signal should be a range");
4117  }
4118 }
4119 
4120 
4122  CSingleFeatValidator(feat, scope, imp)
4123 {
4125 }
4126 
4127 
4129 {
4131 
4132  if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
4134  "sig/mat/transit_peptide feature cannot be associated with a "
4135  "protein product of a coding region feature");
4136  } else {
4138  "Peptide processing feature should be converted to the "
4139  "appropriate protein feature subtype");
4140  }
4142 }
4143 
4144 
4146 {
4147  if (!m_CDS) {
4148  return;
4149  }
4150 
4151  const string& key = m_Feat.GetData().GetImp().GetKey();
4152 
4154  if (NStr::Equal(key, "sig_peptide") && in_frame == feature::eLocationInFrame_NotIn) {
4155  return;
4156  }
4157  switch (in_frame) {
4159  if (NStr::Equal(key, "sig_peptide")) {
4160  // ignore
4161  } else {
4163  "Start and stop of " + key + " are out of frame with CDS codons");
4164  }
4165  break;
4168  "Start and stop of " + key + " are out of frame with CDS codons");
4169  break;
4172  "Start of " + key + " is out of frame with CDS codons");
4173  break;
4176  "Stop of " + key + " is out of frame with CDS codons");
4177  break;
4179  break;
4180  }
4181 }
4182 
4183 
4185 {
4187  bool feat_pseudo = s_IsPseudo(m_Feat);
4188  bool pseudo = feat_pseudo;
4189  if (!pseudo) {
4191  if (gene) {
4192  pseudo = s_IsPseudo(*gene);
4193  }
4194  }
4195  if (m_Imp.IsValidateExons()) {
4196  ValidateSplice(pseudo, true);
4197  }
4198 }
4199 
4200 
4202 {
4204  bool feat_pseudo = s_IsPseudo(m_Feat);
4205  bool pseudo = feat_pseudo;
4206  if (!pseudo) {
4208  if (gene) {
4209  pseudo = s_IsPseudo(*gene);
4210  }
4211  }
4212 
4213  if (x_IsIntronShort(pseudo)) {
4215  "Introns should be at least 10 nt long");
4216  }
4217 
4219  && NStr::FindNoCase (m_Feat.GetExcept_text(), "nonconsensus splice site") != string::npos) {
4220  return;
4221  }
4222 
4223  const CSeq_loc& loc = m_Feat.GetLocation();
4224 
4225  bool partial5 = loc.IsPartialStart(eExtreme_Biological);
4226  bool partial3 = loc.IsPartialStop(eExtreme_Biological);
4227  if (partial5 && partial3) {
4228  return;
4229  }
4230 
4231  // suppress if contained by rRNA - different consensus splice site
4232  TFeatScores scores;
4237  scores, m_Scope);
4238  if (scores.size() > 0) {
4239  return;
4240  }
4241 
4242  // suppress if contained by tRNA - different consensus splice site
4243  scores.clear();
4248  scores, m_Scope);
4249  if (scores.size() > 0) {
4250  return;
4251  }
4252 
4253  // skip if more than one bioseq
4254  if (!IsOneBioseq(loc, &m_Scope)) {
4255  return;
4256  }
4257 
4258  // skip if organelle
4260  return;
4261  }
4262 
4264  string label;
4265  m_LocationBioseq.GetId().front().GetSeqId()->GetLabel(&label);
4267 
4268  ENa_strand strand = loc.GetStrand();
4269 
4270  if (eNa_strand_minus != strand && eNa_strand_plus != strand) {
4271  strand = eNa_strand_plus;
4272  }
4273 
4274  bool donor_in_gap = false;
4275  bool acceptor_in_gap = false;
4276 
4277  TSeqPos end5 = loc.GetStart (eExtreme_Biological);
4278  if (vec.IsInGap(end5)) {
4279  donor_in_gap = true;
4280  }
4281 
4282  TSeqPos end3 = loc.GetStop (eExtreme_Biological);
4283  if (vec.IsInGap(end3)) {
4284  acceptor_in_gap = true;
4285  }
4286 
4287  if (!partial5 && !partial3) {
4288  if (donor_in_gap && acceptor_in_gap) {
4289  return;
4290  }
4291  }
4292 
4293  Char donor[2]; // donor site signature
4294  Char acceptor[2]; // acceptor site signature
4295  bool donor_good = false; // flag == "true" indicates that donor signature is in @donor
4296  bool acceptor_good = false; // flag == "true" indicates that acceptor signature is in @acceptor
4297 
4298  // Read donor signature into @donor
4299  if (!partial5 && !donor_in_gap) {
4300  if (eNa_strand_minus == strand) {
4301  if (end5 > 0 && IsResidue (vec[end5 - 1]) && IsResidue (vec[end5])) {
4302  donor[0] = vec[end5 - 1];
4303  donor[1] = vec[end5];
4304  donor_good = true;
4305  }
4306  }
4307  else {
4308  if( end5 < seq_len - 1 && IsResidue (vec[end5]) && IsResidue (vec[end5 + 1])) {
4309  donor[0] = vec[end5];
4310  donor[1] = vec[end5 + 1];
4311  donor_good = true;
4312  }
4313  }
4314  }
4315 
4316  // Read acceptor signature into @acceptor
4317  if (!partial3 && !acceptor_in_gap) {
4318  if (eNa_strand_minus == strand) {
4319  if (end3 < seq_len - 1 && IsResidue (vec[end3]) && IsResidue (vec[end3 + 1])) {
4320  acceptor[0] = vec[end3];
4321  acceptor[1] = vec[end3 + 1];
4322  acceptor_good = true;
4323  }
4324  }
4325  else {
4326  if (end3 > 0 && IsResidue (vec[end3 - 1]) && IsResidue (vec[end3])) {
4327  acceptor[0] = vec[end3 - 1];
4328  acceptor[1] = vec[end3];
4329  acceptor_good = true;
4330  }
4331  }
4332  }
4333 
4334  // Check both ends of the intron.
4335  if (!partial5 && !partial3) {
4336  if (donor_good && acceptor_good) {
4337  if (CheckIntronSpliceSites(strand, donor, acceptor)) {
4338  return;
4339  }
4340  }
4341  }
4342 
4343  // Check 5'-most
4344  if (!partial5) {
4345  if (!donor_in_gap) {
4346  bool not_found = true;
4347 
4348  if (donor_good) {
4349  if (CheckIntronDonor(strand, donor)) {
4350  not_found = false;
4351  }
4352  }
4353  //
4354  if (not_found) {
4355  if ((strand == eNa_strand_minus && end5 == seq_len - 1) ||
4356  (strand == eNa_strand_plus && end5 == 0)) {
4357 
4359  "Splice donor consensus (GT) not found at start of terminal intron, position "
4360  + NStr::IntToString (end5 + 1) + " of " + label);
4361  }
4362  else {
4364  "Splice donor consensus (GT) not found at start of intron, position "
4365  + NStr::IntToString (end5 + 1) + " of " + label);
4366  }
4367  }
4368  }
4369  }
4370 
4371  // Check 3'-most
4372  if (!partial3) {
4373  if (!acceptor_in_gap) {
4374  bool not_found = true;
4375 
4376  if (acceptor_good) {
4377  if (CheckIntronAcceptor(strand, acceptor)) {
4378  not_found = false;
4379  }
4380  }
4381 
4382  if (not_found) {
4383  if ((strand == eNa_strand_minus && end3 == 0) ||
4384  (strand == eNa_strand_plus && end3 == seq_len - 1)) {
4386  "Splice acceptor consensus (AG) not found at end of terminal intron, position "
4387  + NStr::IntToString (end3 + 1) + " of " + label + ", but at end of sequence");
4388  }
4389  else {
4391  "Splice acceptor consensus (AG) not found at end of intron, position "
4392  + NStr::IntToString (end3 + 1) + " of " + label);
4393  }
4394  }
4395  }
4396  }
4397 
4398 }
4399 
4400 
4402 {
4403  if (!m_Feat.IsSetData()
4405  || !m_Feat.IsSetLocation()
4406  || pseudo) {
4407  return false;
4408  }
4409 
4410  const CSeq_loc& loc = m_Feat.GetLocation();
4411  bool is_short = false;
4412 
4413  if (! m_Imp.IsIndexerVersion()) {
4414  if (!m_LocationBioseq || IsOrganelle(m_LocationBioseq)) return is_short;
4415  }
4416 
4417  if (GetLength(loc, &m_Scope) < 11) {
4418  bool partial_left = loc.IsPartialStart(eExtreme_Positional);
4419  bool partial_right = loc.IsPartialStop(eExtreme_Positional);
4420 
4421  CBioseq_Handle bsh;
4422  if (partial_left && loc.GetStart(eExtreme_Positional) == 0) {
4423  // partial at beginning of sequence, ok
4424  } else if (partial_right &&
4425  (m_LocationBioseq) &&
4426  loc.GetStop(eExtreme_Positional) == (
4428  {
4429  // partial at end of sequence
4430  } else {
4431  is_short = true;
4432  }
4433  }
4434  return is_short;
4435 }
4436 
4437 
4439 {
4442  && (!m_Feat.IsSetQual() || m_Feat.GetQual().empty())
4443  && (!m_Feat.IsSetDbxref() || m_Feat.GetDbxref().empty())) {
4445  "A note or other qualifier is required for a misc_feature");
4446  }
4448  if (NStr::FindWord(m_Feat.GetComment(), "cspA") != NPOS) {
4450  if (cds) {
4451  string content_label;
4452  feature::GetLabel(*cds, &content_label, feature::fFGL_Content, &m_Scope);
4453  if (NStr::Equal(content_label, "cold-shock protein")) {
4455  "cspA misc_feature overlapped by cold-shock protein CDS");
4456  }
4457  }
4458  }
4459  }
4460 
4461 }
4462 
4463 
4465 {
4467 
4468  bool is_far_delta = false;
4471  if ( repr == CSeq_inst::eRepr_delta ) {
4472  is_far_delta = true;
4473  const CBioseq& seq = *(m_LocationBioseq.GetCompleteBioseq());
4474  const CSeq_inst& inst = seq.GetInst();
4475  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4476  if ( !(*sg) ) continue;
4477  if (( (**sg).Which() == CDelta_seq::e_Loc )) continue;
4478  is_far_delta = false;
4479  }
4480  }
4481  }
4482  if (! is_far_delta) {
4484  "An assembly_gap feature should only be on a contig record");
4485  }
4486  if (m_Feat.GetLocation().IsInt()) {
4487  TSeqPos from = m_Feat.GetLocation().GetInt().GetFrom();
4488  TSeqPos to = m_Feat.GetLocation().GetInt().GetTo();
4490  string sequence;
4491  bool is5 = false;
4492  bool is3 = false;
4493  long int count = 0;
4494  vec.GetSeqData(from - 1, from, sequence);
4495  if (NStr::Equal (sequence, "N")) {
4496  is5 = true;
4497  }
4498  vec.GetSeqData(to + 1, to + 2, sequence);
4499  if (NStr::Equal (sequence, "N")) {
4500  is3 = true;
4501  }
4502  EDiagSev sv = eDiag_Warning;
4503  if (m_Imp.IsGenomeSubmission()) {
4504  sv = eDiag_Error;
4505  }
4506  if (is5 && is3) {
4508  "Assembly_gap flanked by Ns on 5' and 3' sides");
4509  } else if (is5) {
4511  "Assembly_gap flanked by Ns on 5' side");
4512  } else if (is3) {
4514  "Assembly_gap flanked by Ns on 3' side");
4515  }
4516  vec.GetSeqData(from, to + 1, sequence);
4517  for (size_t i = 0; i < sequence.size(); i++) {
4518  if (sequence[i] != 'N') {
4519  count++;
4520  }
4521  }
4522  if (count > 0) {
4523  PostErr(eDiag_Error, eErr_SEQ_FEAT_AssemblyGapCoversSequence, "Assembly_gap extends into sequence");
4524  }
4525  }
4526 }
4527 
4528 
4530 {
4532  int loc_len = GetLength (m_Feat.GetLocation(), &m_Scope);
4533  // look for estimated length qualifier
4535  if ((*it)->IsSetQual() && NStr::EqualNocase ((*it)->GetQual(), "estimated_length")
4536  && (*it)->IsSetVal() && !NStr::EqualNocase ((*it)->GetVal(), "unknown")) {
4537  try {
4538  int estimated_length = NStr::StringToInt ((*it)->GetVal());
4539  if (estimated_length != loc_len) {
4541  "Gap feature estimated_length " + NStr::IntToString (estimated_length)
4542  + " does not match " + NStr::IntToString (loc_len) + " feature length");
4543  return;
4544  }
4545  } catch (const CException& ) {
4546  } catch (const std::exception& ) {
4547  }
4548  }
4549  }
4550  try {
4551  string s_data = GetSequenceStringFromLoc(m_Feat.GetLocation(), m_Scope);
4553  if ( !vec.empty() ) {
4554  string vec_data;
4555  vec.GetSeqData(0, vec.size(), vec_data);
4556  int num_n = 0;
4557  int num_real = 0;
4558  unsigned int num_gap = 0;
4559  int pos = 0;
4560  string::iterator it = vec_data.begin();
4561  while (it != vec_data.end()) {
4562  if (*it == 'N') {
4563  if (vec.IsInGap(pos)) {
4564  // gap not N
4565  num_gap++;
4566  } else {
4567  num_n++;
4568  }
4569  } else if (*it != '-') {
4570  num_real++;
4571  }
4572  ++it;
4573  ++pos;
4574  }
4575  if (num_real > 0 && num_n > 0) {
4577  "Gap feature over " + NStr::IntToString (num_real)
4578  + " real bases and " + NStr::IntToString (num_n)
4579  + " Ns");
4580  } else if (num_real > 0) {
4582  "Gap feature over " + NStr::IntToString (num_real)
4583  + " real bases");
4584  } else if (num_n > 0) {
4586  "Gap feature over " + NStr::IntToString (num_n)
4587  + " Ns");
4588  } else if (num_gap != GetLength (m_Feat.GetLocation(), &m_Scope)) {
4590  "Gap feature estimated_length " + NStr::IntToString (loc_len)
4591  + " does not match " + NStr::IntToString (num_gap)
4592  + " gap characters");
4593  }
4594  }
4595 
4596  } catch (const CException& ) {
4597  } catch (const std::exception& ) {
4598  }
4599 }
4600 
4601 
4603 {
4606 
4607  const string& key = m_Feat.GetData().GetImp().GetKey();
4608  if (NStr::IsBlank(key)) {
4610  "NULL feature key");
4611  return;
4612  }
4613 
4614  if (subtype == CSeqFeatData::eSubtype_imp || subtype == CSeqFeatData::eSubtype_bad) {
4615  if (NStr::Equal(key, "mRNA")) {
4616  subtype = CSeqFeatData::eSubtype_mRNA;
4617  } else if (NStr::Equal(key, "tRNA")) {
4618  subtype = CSeqFeatData::eSubtype_tRNA;
4619  } else if (NStr::Equal(key, "tRNA")) {
4620  subtype = CSeqFeatData::eSubtype_tRNA;
4621  } else if (NStr::Equal(key, "rRNA")) {
4622  subtype = CSeqFeatData::eSubtype_rRNA;
4623  } else if (NStr::Equal(key, "snRNA")) {
4624  subtype = CSeqFeatData::eSubtype_snRNA;
4625  } else if (NStr::Equal(key, "scRNA")) {
4626  subtype = CSeqFeatData::eSubtype_scRNA;
4627  } else if (NStr::Equal(key, "snoRNA")) {
4629  } else if (NStr::Equal(key, "misc_RNA")) {
4631  } else if (NStr::Equal(key, "precursor_RNA")) {
4633  } else if (NStr::EqualNocase (key, "mat_peptide")) {
4635  } else if (NStr::EqualNocase (key, "propeptide")) {
4637  } else if (NStr::EqualNocase (key, "sig_peptide")) {
4639  } else if (NStr::EqualNocase (key, "transit_peptide")) {
4641  } else if (NStr::EqualNocase (key, "preprotein")
4642  || NStr::EqualNocase(key, "proprotein")) {
4644  } else if (NStr::EqualNocase (key, "virion")) {
4646  } else if (NStr::EqualNocase(key, "mutation")) {
4648  } else if (NStr::EqualNocase(key, "allele")) {
4650  } else if (NStr::EqualNocase (key, "CDS")) {
4652  } else if (NStr::EqualNocase(key, "Import")) {
4654  "Feature key Import is no longer legal");
4655  return;
4656  }
4657  }
4658 
4659  switch ( subtype ) {
4660 
4665  "Unknown feature key " + key);
4666  break;
4667 
4672  "Feature key " + key + " is no longer legal");
4673  break;
4674 
4675 
4677  if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
4678  const CSeq_loc& loc = m_Feat.GetLocation();
4681  "Pre/pro protein feature cannot be associated with a "
4682  "protein product of a coding region feature");
4683  } else {
4685  "Peptide processing feature should be converted to the "
4686  "appropriate protein feature subtype");
4687  }
4688  break;
4689 
4698  // !!! what about other RNA types (preRNA, otherRNA)?
4700  "RNA feature should be converted to the appropriate RNA feature "
4701  "subtype, location should be converted manually");
4702  break;
4703 
4705  {
4706  // impfeat CDS must be pseudo; fail if not
4707  bool pseudo = sequence::IsPseudo(m_Feat, m_Scope);
4708  if ( !pseudo ) {
4710  "ImpFeat CDS should be pseudo");
4711  }
4712 
4714  if ( NStr::CompareNocase( (*gbqual)->GetQual(), "translation") == 0 ) {
4716  "ImpFeat CDS with /translation found");
4717  }
4718  }
4719  }
4720  break;
4723  "Unknown feature key " + key);
4724  break;
4727  && (!m_Feat.IsSetDbxref() || m_Feat.GetDbxref().empty())) {
4728  if (!m_Feat.IsSetQual() || m_Feat.GetQual().empty()) {
4730  "repeat_region has no qualifiers");
4731  } else if ( ! m_Imp.IsGenomeSubmission() ) {
4732  bool okay = false;
4734  if ( ! NStr::EqualNocase((*gbqual)->GetQual(), "rpt_type") ) {
4735  okay = true;
4736  break;
4737  }
4738  const string& val = (*gbqual)->GetVal();
4739  if ( ! NStr::Equal (val, "other") ) {
4740  okay = true;
4741  break;
4742  }
4743  }
4744  if ( ! okay ) {
4746  "repeat_region has no qualifiers except rpt_type other");
4747  }
4748  }
4749  }
4750  break;
4752  {
4753  vector<string> valid_types = CSeqFeatData::GetRegulatoryClassList();
4755  if ( NStr::CompareNocase( (*gbqual)->GetQual(), "regulatory_class") != 0 ) continue;
4756  const string& val = (*gbqual)->GetVal();
4757  bool missing = true;
4758  FOR_EACH_STRING_IN_VECTOR (itr, valid_types) {
4759  string str = *itr;
4760  if ( NStr::Equal (str, val) ) {
4761  missing = false;
4762  }
4763  }
4764  if ( missing ) {
4765  if ( NStr::Equal (val, "other") && !m_Feat.IsSetComment() ) {
4767  "The regulatory_class 'other' is missing the required /note");
4768  }
4769  }
4770  }
4771  }
4772  break;
4774  {
4777  if ( NStr::CompareNocase( (*gbqual)->GetQual(), "recombination_class") != 0 ) continue;
4778  const string& val = (*gbqual)->GetVal();
4779  if ( recomb_values.find(val.c_str()) == recomb_values.end() ) {
4780  if ( NStr::Equal (val, "other")) {
4781  if (!m_Feat.IsSetComment()) {
4783  "The recombination_class 'other' is missing the required /note");
4784  }
4785  } else {
4786  // Removed per VR-770. FlatFile will automatically
4787  // display the unrecognized recombination_class value
4788  // in the note and list the recombination_class as other
4789 // PostErr(eDiag_Info, eErr_SEQ_FEAT_InvalidQualifierValue,
4790 // "'" + val + "' is not a legal value for recombination_class", feat);
4791  }
4792  }
4793  }
4794  }
4795  break;
4796  default:
4797  break;
4798  }// end of switch statement
4799 }
4800 
4801 
4803 {
4804  if (!feat.IsSetData()) {
4805  return new CSingleFeatValidator(feat, scope, imp);
4806  } else if (feat.GetData().IsCdregion()) {
4807  return new CCdregionValidator(feat, scope, imp);
4808  } else if (feat.GetData().IsGene()) {
4809  return new CGeneValidator(feat, scope, imp);
4810  } else if (feat.GetData().IsProt()) {
4811  return new CProtValidator(feat, scope, imp);
4812  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
4813  return new CMRNAValidator(feat, scope, imp);
4814  } else if (feat.GetData().IsRna()) {
4815  return new CRNAValidator(feat, scope, imp);
4816  } else if (feat.GetData().IsPub()) {
4817  return new CPubFeatValidator(feat, scope, imp);
4818  } else if (feat.GetData().IsBiosrc()) {
4819  return new CSrcFeatValidator(feat, scope, imp);
4820  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) {
4821  return new CExonValidator(feat, scope, imp);
4822  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_intron) {
4823  return new CIntronValidator(feat, scope, imp);
4824  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_feature) {
4825  return new CMiscFeatValidator(feat, scope, imp);
4826  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_assembly_gap) {
4827  return new CAssemblyGapValidator(feat, scope, imp);
4828  } else if (feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_polyA_site) {
4829  return new CPolyASiteValidator(feat, scope, imp);
4830  }
4832  return new CPolyASignalValidator(feat, scope, imp);
4833  } else if (feat.GetData().IsImp()) {
4834  CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
4835  switch (subtype) {
4840  return new CPeptideValidator(feat, scope, imp);
4841  break;
4843  return new CGapFeatValidator(feat, scope, imp);
4844  break;
4845  default:
4846  return new CImpFeatValidator(feat, scope, imp);
4847  break;
4848  }
4849  } else {
4850  return new CSingleFeatValidator(feat, scope, imp);
4851  }
4852 }
4853 
4854 
4855 END_SCOPE(validator)
static CRef< CScope > m_Scope
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
EErrType
@ eErr_SEQ_FEAT_WrongQualOnImpFeat
@ eErr_SEQ_FEAT_NotSpliceConsensusAcceptor
@ eErr_SEQ_FEAT_rRNADoesNotHaveProduct
@ eErr_SEQ_FEAT_WholeLocation
@ eErr_SEQ_FEAT_MobileElementInvalidQualifier
@ eErr_SEQ_FEAT_DuplicateAnticodonInterval
@ eErr_SEQ_FEAT_ShortTRNAIntron
@ eErr_SEQ_FEAT_MinusStrandProtein
@ eErr_SEQ_FEAT_NotSpliceConsensusDonor
@ eErr_SEQ_FEAT_GeneXrefWithoutLocus
@ eErr_SEQ_FEAT_GenesInconsistent
@ eErr_SEQ_FEAT_PseudoRnaHasProduct
@ eErr_SEQ_FEAT_EcNumberDataMissing
@ eErr_SEQ_FEAT_InvalidProductOnGene
@ eErr_SEQ_FEAT_BadRRNAcomponentOverlapTRNA
@ eErr_SEQ_FEAT_mRNAUnnecessaryException
@ eErr_SEQ_FEAT_UnknownImpFeatQual
@ eErr_SEQ_FEAT_InvalidCompareBadAccession
@ eErr_SEQ_FEAT_InvalidCompareMissingVersion
@ eErr_SEQ_FEAT_InvalidRptUnitRange
@ eErr_GENERIC_SgmlPresentInText
@ eErr_SEQ_FEAT_BadAnticodonAA
@ eErr_SEQ_FEAT_RnaProductMismatch
@ eErr_SEQ_FEAT_FeatureBeginsOrEndsInGap
@ eErr_SEQ_FEAT_EcNumberInProteinName
@ eErr_SEQ_FEAT_InvalidTRNAdata
@ eErr_SEQ_FEAT_UnnecessaryException
@ eErr_SEQ_FEAT_AssemblyGapFeatureProblem
@ eErr_SEQ_FEAT_OldLocusTagWithoutLocusTag
@ eErr_SEQ_FEAT_NotSpliceConsensusAcceptorTerminalIntron
@ eErr_SEQ_FEAT_AnticodonMixedStrand
@ eErr_SEQ_FEAT_UnparsedtRNAProduct
@ eErr_SEQ_FEAT_InconsistentPseudogeneValue
@ eErr_SEQ_FEAT_GeneXrefWithoutGene
@ eErr_SEQ_FEAT_ReplacedEcNumber
@ eErr_SEQ_FEAT_PartialsInconsistent
@ eErr_SEQ_FEAT_InvalidQualifierValue
@ eErr_SEQ_FEAT_DuplicateGeneOntologyTerm
@ eErr_SEQ_FEAT_ProtRefHasNoData
@ eErr_SEQ_FEAT_NotSpliceConsensusDonorTerminalIntron
@ eErr_SEQ_FEAT_BadTrnaAA
@ eErr_SEQ_FEAT_WrongQualOnFeature
@ eErr_SEQ_FEAT_ProductFetchFailure
@ eErr_SEQ_FEAT_MismatchedAllele
@ eErr_SEQ_FEAT_RepeatSeqDoNotMatch
@ eErr_SEQ_FEAT_MissingQualOnImpFeat
@ eErr_SEQ_FEAT_InvalidRptUnitSeqCharacters
@ eErr_SEQ_FEAT_TranscriptLen
@ eErr_SEQ_FEAT_RubiscoProblem
@ eErr_SEQ_FEAT_InvalidAlleleDuplicates
@ eErr_SEQ_FEAT_ImpCDSnotPseudo
@ eErr_SEQ_FEAT_BadCDScomponentOverlapTRNA
@ eErr_SEQ_FEAT_BadEcNumberValue
@ eErr_SEQ_FEAT_EcNumberEmpty
@ eErr_SEQ_FEAT_ImpCDShasTranslation
@ eErr_SEQ_FEAT_PeptideFeatOutOfFrame
@ eErr_SEQ_FEAT_ProteinNameHasPMID
@ eErr_SEQ_FEAT_ImpFeatBadLoc
@ eErr_SEQ_FEAT_MissingQualOnFeature
@ eErr_SEQ_FEAT_PolyAsiteNotPoint
@ eErr_SEQ_FEAT_RepeatRegionNeedsNote
@ eErr_SEQ_FEAT_GeneXrefStrandProblem
@ eErr_SEQ_FEAT_PolyATail
@ eErr_SEQ_FEAT_MissingTrnaAA
@ eErr_GENERIC_NonAsciiAsn
@ eErr_SEQ_FEAT_UnparsedtRNAAnticodon
@ eErr_SEQ_FEAT_RefSeqInText
@ eErr_SEQ_FEAT_ErroneousException
@ eErr_SEQ_FEAT_ImproperBondLocation
@ eErr_SEQ_FEAT_InvalidPseudoQualifier
@ eErr_SEQ_FEAT_FeatureSeqIDCaseDifference
@ eErr_SEQ_FEAT_BadProductSeqId
@ eErr_SEQ_FEAT_PeptideFeatureLacksCDS
@ eErr_SEQ_FEAT_InvalidCompareRefSeqAccession
@ eErr_SEQ_FEAT_InvalidReplace
@ eErr_SEQ_FEAT_UnknownImpFeatKey
@ eErr_SEQ_FEAT_IdenticalMRNAtranscriptIDs
@ eErr_SEQ_FEAT_AssemblyGapCoversSequence
@ eErr_SEQ_FEAT_ShortIntron
@ eErr_SEQ_FEAT_SplitEcNumber
@ eErr_SEQ_FEAT_AssemblyGapAdjacentToNs
@ eErr_SEQ_FEAT_InvalidPunctuation
@ eErr_SEQ_FEAT_LocusTagProductMismatch
@ eErr_SEQ_FEAT_UnknownFeatureQual
@ eErr_SEQ_FEAT_TranscriptMismatches
@ eErr_SEQ_FEAT_IncorrectQualifierCapitalization
@ eErr_SEQ_FEAT_InvalidNumberQualifier
@ eErr_SEQ_FEAT_FeatureInsideGap
@ eErr_SEQ_FEAT_InvalidRNAFeature
@ eErr_SEQ_FEAT_tRNArange
@ eErr_SEQ_FEAT_GeneIdMismatch
@ eErr_SEQ_FEAT_MissingMRNAproduct
@ eErr_SEQ_FEAT_tRNAmRNAmixup
@ eErr_SEQ_FEAT_RNAtype0
@ eErr_SEQ_FEAT_UndesiredProteinName
@ eErr_SEQ_FEAT_MrnaTransFail
@ eErr_SEQ_FEAT_InvalidInferenceValue
@ eErr_SEQ_FEAT_GeneXrefNeeded
@ eErr_SEQ_FEAT_InvalidType
@ eErr_SEQ_FEAT_SerialInComment
@ eErr_SEQ_FEAT_BadTrailingCharacter
@ eErr_SEQ_FEAT_IntervalBeginsOrEndsInGap
@ eErr_SEQ_FEAT_ProteinNameEndsInBracket
@ eErr_SEQ_FEAT_BadInternalCharacter
@ eErr_SEQ_FEAT_BadProteinName
@ eErr_SEQ_FEAT_MissingLocation
@ eErr_SEQ_FEAT_ExceptionMissingText
@ eErr_SEQ_FEAT_BadAnticodonCodon
@ eErr_SEQ_FEAT_BadTrailingHyphen
@ eErr_SEQ_FEAT_OldLocusTagMismtach
@ eErr_SEQ_FEAT_PseudoRnaViaGeneHasProduct
@ eErr_SEQ_FEAT_DeletedEcNumber
@ eErr_SEQ_FEAT_FeatureIsMostlyNs
@ eErr_SEQ_FEAT_InvalidMatchingReplace
@ eErr_INTERNAL_Exception
@ eErr_SEQ_FEAT_BadEcNumberFormat
@ eErr_SEQ_FEAT_BothStrands
@ eErr_SEQ_FEAT_ExceptionProblem
@ eErr_SEQ_FEAT_RedundantFields
@ eErr_SEQ_FEAT_ColdShockProteinProblem
@ eErr_SEQ_FEAT_TrnaCodonWrong
@ eErr_SEQ_FEAT_NoNameForProtein
@ eErr_SEQ_FEAT_RptUnitRangeProblem
@ eErr_SEQ_FEAT_InvalidVariationReplace
@ eErr_SEQ_FEAT_SeqLocOrder
@ eErr_SEQ_FEAT_AnticodonStrandConflict
@ eErr_SEQ_FEAT_InvalidRepeatUnitLength
@ eErr_SEQ_FEAT_VectorContamination
@ eErr_SEQ_FEAT_AbuttingIntervals
@ eErr_SEQ_FEAT_EcNumberInProteinComment
@ eErr_SEQ_FEAT_UnnecessaryCitPubEquiv
@ eErr_SEQ_FEAT_PartialProblem
@ eErr_SEQ_FEAT_RegulatoryClassOtherNeedsNote
@ eErr_SEQ_FEAT_MiscFeatureNeedsNote
@ eErr_SEQ_FEAT_FocusOnBioSourceFeature
@ eErr_SEQ_FEAT_PolyAsignalNotRange
@ eErr_SEQ_DESCR_BioSourceNeedsFocus
@ eErr_SEQ_FEAT_BadTrnaCodon
@ eErr_SEQ_FEAT_FeatureCrossesGap
@ eErr_SEQ_FEAT_SelfReferentialProduct
@ eErr_SEQ_FEAT_GapFeatureProblem
@ eErr_SEQ_FEAT_HypotheticalProteinMismatch
@ eErr_SEQ_FEAT_MissingGeneXref
@ eErr_SEQ_FEAT_RecombinationClassOtherNeedsNote
@ eErr_SEQ_FEAT_MissingExceptionFlag
#define false
Definition: bool.h:36
bool IsOrganismEukaryote() const
Definition: cache_impl.hpp:103
int GetGenCode(int def=1) const
Definition: BioSource.cpp:73
CBioseq_Handle –.
CCoreException –.
Definition: ncbiexpt.hpp:1476
Definition: Dbtag.hpp:53
bool IsSkippable(void) const
Definition: Dbtag.cpp:281
void Validate() override
CFeat_CI –.
Definition: feat_ci.hpp:64
bool IsKnownGap(size_t offset)
bool IsGap(size_t offset)
bool IsUnknownGap(size_t offset)
map< size_t, EGapType > TGapTypeMap
CGapCache(const CSeq_loc &loc, CBioseq_Handle bsh)
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
static bool IsLegalMobileElementValue(const string &val)
Definition: Gb_qual.cpp:454
static bool IsValidPseudogeneValue(const string &val)
Definition: Gb_qual.cpp:303
static bool IsValidRptTypeValue(const string &val)
Definition: Gb_qual.cpp:258
static const TLegalRecombinationClassSet & GetSetOfLegalRecombinationClassValues(void)
Definition: Gb_qual.cpp:329
static const string & GetNcbieaa(int id)
static string IndexToCodon(int index)
static int CodonToIndex(char base1, char base2, char base3)
static const CGenetic_code_table & GetCodeTable(void)
CRef< feature::CFeatTree > GetFeatTreeFromCache(const CSeq_loc &loc, CScope &scope)
Definition: gene_cache.cpp:79
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
Definition: gene_cache.cpp:106
bool IsSuppressed(void) const
Definition: Gene_ref.cpp:75
bool x_IsIntronShort(bool pseudo)
CMRNAValidator(const CSeq_feat &feat, CScope &scope, CValidError_imp &imp)
CConstRef< CSeq_feat > m_Gene
void Validate() override
CMappedFeat –.
Definition: mapped_feat.hpp:59
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CPeptideValidator(const CSeq_feat &feat, CScope &scope, CValidError_imp &imp)
CConstRef< CSeq_feat > m_CDS
void x_ValidateSeqFeatLoc() override
void Validate() override
void x_ValidateProteinName(const string &prot_name)
static EECNumberFileStatus GetECNumAmbiguousStatus()
Definition: Prot_ref.cpp:85
static EECNumberFileStatus GetECNumSpecificStatus()
Definition: Prot_ref.cpp:88
@ eECFile_not_found
File was not found in expected directory.
Definition: Prot_ref.hpp:74
static bool IsECNumberSplit(const string &old_ecno)
Definition: Prot_ref.cpp:226
static bool IsValidECNumberFormat(const string &ecno)
Verify correct form of EC number.
Definition: Prot_ref.cpp:257
static EECNumberFileStatus GetECNumDeletedStatus()
Definition: Prot_ref.cpp:86
EECNumberStatus
Enzyme Commission number status.
Definition: Prot_ref.hpp:63
@ eEC_replaced
Obsolete synonym for some other EC number.
Definition: Prot_ref.hpp:66
@ eEC_unknown
Unrecognized; possibly malformed.
Definition: Prot_ref.hpp:68
@ eEC_deleted
Withdrawn, with no (single?) replacement.
Definition: Prot_ref.hpp:67
static EECNumberFileStatus GetECNumReplacedStatus()
Definition: Prot_ref.cpp:87
static EECNumberStatus GetECNumberStatus(const string &ecno)
Determine an EC number's validity and specificity.
Definition: Prot_ref.cpp:190
void Validate() override
void x_ReportRNATranslationProblems(size_t problems, size_t mismatches)
void x_ValidateAnticodon(const CSeq_loc &anticodon)
void x_ValidateRnaProduct(bool feat_pseudo, bool pseudo)
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
static string GetRnaTypeName(const CRNA_ref::EType rna_type)
Definition: RNA_ref.cpp:73
CScope –.
Definition: scope.hpp:92
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
bool IsLegalQualifier(EQualifier qual) const
Test whether a certain qualifier is legal for the feature.
@ eQual_mobile_element_type
static bool AllowStrandBoth(ESubtype subtype)
ESubtype GetSubtype(void) const
string GetKey(EVocabulary vocab=eVocabulary_full) const
const TQualifiers & GetMandatoryQualifiers(void) const
Get the list of all mandatory qualifiers for the feature.
static std::pair< EQualifier, CTempString > GetQualifierTypeAndValue(CTempString qual)
@ eSubtype_transit_peptide
@ eSubtype_bad
These no longer need to match the FEATDEF values in the C toolkit's objfdef.h.
static EQualifier GetQualifierType(CTempString qual)
convert qual string to enumerated value
static CTempString GetQualifierAsString(EQualifier qual)
Convert a qualifier from an enumerated value to a string representation or empty if not found.
static CTempString SubtypeValueToName(ESubtype eSubtype)
Turns an ESubtype into its string value which is NOT necessarily related to the identifier of the enum...
static const vector< string > & GetRegulatoryClassList()
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
@ e_Ncbieaa
Definition: sequtil.hpp:57
@ e_Ncbi8aa
Definition: sequtil.hpp:56
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
CSeq_ext –.
Definition: Seq_ext.hpp:66
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static bool IsExceptionTextInLegalList(const string &exception_text, bool allow_refseq)
Indicates whether this specific text occurs in the list of legal exceptions.
Definition: Seq_feat.cpp:542
const CGene_ref * GetGeneXref(void) const
See related function in util/feature.hpp.
Definition: Seq_feat.cpp:181
static bool IsExceptionTextRefSeqOnly(const string &exception_text)
Indicates whether this specific text is a RefSeq-only exception.
Definition: Seq_feat.cpp:553
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static TIndex GetMapToIndex(CSeq_data::E_Choice from_type, CSeq_data::E_Choice to_type, TIndex from_idx)
static bool x_IsMostlyNs(const CSeq_loc &loc, CBioseq_Handle bsh)
CBioseq_Handle x_GetFeatureProduct(bool look_far, bool &is_far)
void ValidateCharactersInField(string value, string field_name)
void PostErr(EDiagSev sv, EErrType et, const string &msg)
void x_ValidateLabelVal(const string &val)
CSingleFeatValidator(const CSeq_feat &feat, CScope &scope, CValidError_imp &imp)
void x_ReportAcceptorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem &problem, const string &label)
void x_ValidateRptUnitVal(const string &val, const string &key)
void x_ValidateReplaceQual(const string &key, const string &qual_str, const string &val)
CBioseq_Handle x_GetBioseqByLocation(const CSeq_loc &loc)
static bool s_IsPseudo(const CSeq_feat &feat)
void x_ValidateCompareVal(const string &val)
void x_ValidateRptUnitSeqVal(const string &val, const string &key)
void x_ReportDonorSpliceSiteReadErrors(const CSpliceProblems::TSpliceProblem &problem, const string &label)
static bool s_GeneRefsAreEquivalent(const CGene_ref &g1, const CGene_ref &g2, string &label)
void x_ValidateGbQual(const CGb_qual &qual)
void x_ValidateGeneFeaturePair(const CSeq_feat &gene)
void x_ValidateOldLocusTag(const string &old_locus_tag)
virtual void x_ValidateFeatComment()
bool x_HasNamedQual(const string &qual_name)
void x_ValidateLocusTagGeneralMatch(CConstRef< CSeq_feat > gene)
static bool x_HasSeqLocBond(const CSeq_feat &feat)
void x_ReportPseudogeneConflict(CConstRef< CSeq_feat > gene)
void x_ValidateRptUnitRangeVal(const string &val)
static bool x_BioseqHasNmAccession(CBioseq_Handle bsh)
void ValidateSplice(bool gene_pseudo, bool check_all)
void x_ReportSpliceProblems(const CSpliceProblems &problems, const string &label)
static TSeqPos x_FindStartOfGap(CBioseq_Handle bsh, TSeqPos pos, CScope *scope)
static size_t x_CalculateLocationGaps(CBioseq_Handle bsh, const CSeq_loc &loc, vector< TSeqPos > &gap_starts)
virtual void x_ValidateExceptText(const string &text)
static bool s_BioseqHasRefSeqThatStartsWithPrefix(CBioseq_Handle bsh, string prefix)
static void x_LocHasStrandBoth(const CSeq_loc &feat, bool &both, bool &both_rev)
const TSpliceProblemList & GetDonorProblems() const
void CalculateSpliceProblems(const CSeq_feat &feat, bool check_all, bool pseudo, CBioseq_Handle loc_handle)
vector< TSpliceProblem > TSpliceProblemList
bool IsExceptionUnnecessary() const
bool AreErrorsUnexpected() const
pair< size_t, TSeqPos > TSpliceProblem
const TSpliceProblemList & GetAcceptorProblems() const
const_iterator find(const key_type &key) const
Return a const_iterator pointing to the specified element, or to the end if the element is not found.
Definition: static_set.hpp:680
const_iterator end() const
Return the end of the controlled sequence.
Definition: static_set.hpp:647
vector< CSeq_feat_Handle > TSeq_feat_Handles
Definition: tse_handle.hpp:167
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
static EInferenceValidCode ValidateInference(string inference, bool fetch_accession, CScope *scope=nullptr)
bool DoRubiscoTest() const
bool ReportSpliceAsError() const
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:358
const CBioSourceKind & BioSourceKind() const
bool IsRemoteFetch() const
CConstRef< CSeq_feat > GetmRNAGivenProduct(const CBioseq &seq)
bool DoesAnyFeatLocHaveGI() const
bool IsLocusTagGeneralMatch() const
void ValidateDbxref(const CDbtag &xref, const CSerialObject &obj, bool biosource=false, const CSeq_entry *ctx=nullptr)
bool IsSerialNumberInComment(const string &comment)
bool IsGenomic() const
bool IsFarSequence(const CSeq_id &id)
Definition: validatorp.cpp:234
const CTSE_Handle & GetTSE_Handle()
bool ValidateInferenceAccessions() const
bool IsHugeFileMode() const
Definition: validatorp.cpp:211
void IncrementPseudogeneCount()
CConstRef< CSeq_feat > GetCDSGivenProduct(const CBioseq &seq)
CBioseq_Handle GetLocalBioseqHandle(const CSeq_id &id)
Definition: validatorp.cpp:257
bool IsRefSeq() const
bool IsGPS() const
bool IsINSDInSep() const
bool x_IsFarFetchFailure(const CSeq_loc &loc)
bool IsGenomeSubmission() const
void ValidateBioSource(const CBioSource &bsrc, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
void IncrementGeneXrefCount()
bool IsGpipe() const
bool IsFarFetchCDSproducts() const
bool IsValidateExons() const
bool IgnoreExceptions() const
bool IsRefSeqConventions() const
SValidatorContext & SetContext()
Definition: validatorp.cpp:194
bool IsIndexerVersion() const
CGeneCache & GetGeneCache()
bool IsSmallGenomeSet() const
bool IsEmbl() const
void ValidateSeqLoc(const CSeq_loc &loc, const CBioseq_Handle &seq, bool report_abutting, const string &prefix, const CSerialObject &obj, bool lowerSev=false)
bool IsTPE() const
bool IsFarFetchMRNAproducts() const
bool IsDdbj() const
bool IsTransgenic(const CBioSource &bsrc)
void ValidatePubdesc(const CPubdesc &pub, const CSerialObject &obj, const CSeq_entry *ctx=nullptr)
Definition: valid_pub.cpp:77
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
char value[7]
Definition: config.c:431
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static void chk(int check, const char *fmt,...)
Definition: ct_dynamic.c:49
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
Public API for finding the gene(s) on a given feature using the same criteria as the flatfile generat...
vector< TGoTermError > GetGoTermErrors(const CSeq_feat &feat)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define NULL
Definition: ncbistd.hpp:225
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Fatal
Fatal error – guarantees exit(or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
const float pi
Definition: math.hpp:54
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
virtual void WriteAsFasta(ostream &out) const
Implement serializable interface.
Definition: Seq_id.cpp:2163
string GetLabel(const CSeq_id &id)
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
bool IsSetStrand(EIsSetStrand flag=eIsSetStrand_Any) const
Check if strand is set for any/all part(s) of the seq-loc depending on the flag.
Definition: Seq_loc.cpp:858
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
CRef< CSeq_loc > Intersect(const CSeq_loc &other, TOpFlags flags, ISynonymMapper *syn_mapper) const
Find the intersection with the seq-loc, merge/sort resulting ranges depending on flags.
Definition: Seq_loc.cpp:5183
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
ELocationInFrame IsLocationInFrame(const CSeq_feat_Handle &cds, const CSeq_loc &loc)
Determines whether location loc is in frame with coding region cds.
Definition: feature.cpp:3818
ELocationInFrame
Definition: feature.hpp:531
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
@ eLocationInFrame_InFrame
Definition: feature.hpp:532
@ eLocationInFrame_BadStart
Definition: feature.hpp:533
@ eLocationInFrame_BadStop
Definition: feature.hpp:534
@ eLocationInFrame_BadStartAndStop
Definition: feature.hpp:535
@ eLocationInFrame_NotIn
Definition: feature.hpp:536
@ fFGL_Content
Include its content if there is any.
Definition: feature.hpp:73
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
bool IsValid(const CSeq_point &pt, CScope *scope)
Checks that point >= 0 and point < length of Bioseq.
Int8 TestForOverlapEx(const CSeq_loc &loc1, const CSeq_loc &loc2, EOverlapType type, CScope *scope=0, TOverlapFlags flags=fOverlap_Default)
Updated version of TestForOverlap64().
int SeqLocPartialCheck(const CSeq_loc &loc, CScope *scope)
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
bool IsOneBioseq(const CSeq_loc &loc, CScope *scope)
Returns true if all embedded CSeq_ids represent the same CBioseq, else false.
ECompare
bool IsSameBioseq(const CSeq_id &id1, const CSeq_id &id2, CScope *scope, CScope::EGetBioseqFlag get_flag=CScope::eGetBioseq_All)
Determines if two CSeq_ids represent the same CBioseq.
CSeq_loc * SeqLocRevCmpl(const CSeq_loc &loc, CScope *scope)
Get reverse complement of the seq-loc (?)
@ eSeqlocPartial_Internal
@ eSeqlocPartial_Other
@ eSeqlocPartial_Complete
@ eSeqlocPartial_Stop
@ eSeqlocPartial_Start
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eOverlap_Simple
any overlap of extremes
@ eOverlap_Interval
at least one pair of intervals must overlap
@ eOverlap_Contained
2nd contained within 1st extremes
@ eOverlap_Subset
2nd is a subset of 1st ranges
@ eContains
First CSeq_loc contains second.
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
bool IsPseudo(const CSeq_feat &feat, CScope &scope)
Determines whether given feature is pseudo, using gene associated with feature if necessary. Checks to...
Definition: sequence.cpp:1428
CConstRef< CSeq_feat > GetOverlappingGene(const CSeq_loc &loc, CScope &scope, ETransSplicing eTransSplicing=eTransSplicing_Auto)
Definition: sequence.cpp:1366
CConstRef< CSeq_feat > GetOverlappingCDS(const CSeq_loc &loc, CScope &scope)
Definition: sequence.cpp:1579
vector< TFeatScore > TFeatScores
Definition: sequence.hpp:353
void GetOverlappingFeatures(const CSeq_loc &loc, CSeqFeatData::E_Choice feat_type, CSeqFeatData::ESubtype feat_subtype, EOverlapType overlap_type, TFeatScores &feats, CScope &scope, const TBestFeatOpts opts=0, CGetOverlappingFeaturesPlugin *plugin=NULL)
Find all features overlapping the location.
Definition: sequence.cpp:945
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id, const CTSE_Handle &tse)
Get bioseq handle for sequence within one TSE.
Definition: scope.cpp:253
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeq_feat_Handle GetSeq_featHandle(const CSeq_feat &feat, EMissing action=eMissing_Default)
Definition: scope.cpp:200
bool IsSetExcept(void) const
const CTSE_Handle & GetTSE_Handle(void) const
Get CTSE_Handle of containing TSE.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
TClass GetClass(void) const
const TInst_Ext & GetInst_Ext(void) const
TBioseqCore GetBioseqCore(void) const
Get bioseq core structure.
bool IsSetInst_Ext(void) const
bool IsSetDbxref(void) const
const CSeqFeatData & GetData(void) const
TSeqPos GetBioseqLength(void) const
TSet GetSet(void) const
bool IsAa(void) const
bool IsSetExcept_text(void) const
TInst_Length GetInst_Length(void) const
const string & GetExcept_text(void) const
bool IsSetInst_Repr(void) const
bool IsSetClass(void) const
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
bool IsSet(void) const
const CSeq_feat::TDbxref & GetDbxref(void) const
bool IsNa(void) const
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const TId & GetId(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
const TInst & GetInst(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
const CSeq_loc & GetLocation(void) const
const CSeq_feat & GetOriginalFeature(void) const
Get original feature with unmapped location/product.
CSeqMap::ESegmentType GetType(void) const
Definition: seq_map_ci.hpp:651
bool IsUnknownLength(void) const
return true if current segment is a gap of unknown length
Definition: seq_map_ci.cpp:302
TSeqPos GetPosition(void) const
return position of current segment in sequence
Definition: seq_map_ci.hpp:665
TSeqPos GetLength(void) const
return length of current segment
Definition: seq_map_ci.hpp:672
CConstRef< CSeq_feat > GetSeq_feat(void) const
Get current seq-feat.
ENa_strand GetStrand(void) const
Definition: seq_vector.hpp:336
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap. Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos TSeqLength
Definition: seq_map.hpp:74
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool empty(void) const
Definition: seq_vector.hpp:284
@ fDefaultFlags
Definition: seq_map.hpp:140
@ eSeqGap
gap
Definition: seq_map.hpp:97
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static SIZE_TYPE FindWord(const CTempString str, const CTempString word, ECase use_case=eCase, EDirection direction=eForwardSearch)
Find given word in the string.
Definition: ncbistr.cpp:3056
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
Definition: ncbistr.hpp:5490
static int Compare(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Compare of a substring with another string.
Definition: ncbistr.hpp:5297
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3182
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eReverseSearch
Search in a backward direction.
Definition: ncbistr.hpp:1947
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
static const char label[]
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetIs_focus(void) const
to distinguish biological focus Check if a value has been assigned to Is_focus data member.
Definition: BioSource_.hpp:552
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool IsSetSyn(void) const
synonyms for locus Check if a value has been assigned to Syn data member.
Definition: Gene_ref_.hpp:756
const TSyn & GetSyn(void) const
Get the Syn member data.
Definition: Gene_ref_.hpp:768
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Gene_ref_.hpp:599
bool IsSetPseudo(void) const
pseudogene Check if a value has been assigned to Pseudo data member.
Definition: Gene_ref_.hpp:681
bool CanGetLocus(void) const
Check if it is safe to call GetLocus method.
Definition: Gene_ref_.hpp:499
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool CanGetLocus_tag(void) const
Check if it is safe to call GetLocus_tag method.
Definition: Gene_ref_.hpp:787
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
bool IsSetAllele(void) const
Official allele designation Check if a value has been assigned to Allele data member.
Definition: Gene_ref_.hpp:540
bool CanGetAllele(void) const
Check if it is safe to call GetAllele method.
Definition: Gene_ref_.hpp:546
bool CanGetDesc(void) const
Check if it is safe to call GetDesc method.
Definition: Gene_ref_.hpp:593
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Gene_ref_.hpp:706
const TAllele & GetAllele(void) const
Get the Allele member data.
Definition: Gene_ref_.hpp:552
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetDb(void) const
Name of database or system. Check if a value has been assigned to Db data member.
Definition: Dbtag_.hpp:208
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsSetTag(void) const
Appropriate tag. Check if a value has been assigned to Tag data member.
Definition: Dbtag_.hpp:255
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool CanGetTaxname(void) const
Check if it is safe to call GetTaxname method.
Definition: Org_ref_.hpp:366
EProcessed
processing status
Definition: Prot_ref_.hpp:95
const TName & GetName(void) const
Get the Name member data.
Definition: Prot_ref_.hpp:378
bool IsSetEc(void) const
E.C. number(s). Check if a value has been assigned to Ec data member.
Definition: Prot_ref_.hpp:438
const TEc & GetEc(void) const
Get the Ec member data.
Definition: Prot_ref_.hpp:450
@ eProcessed_signal_peptide
Definition: Prot_ref_.hpp:99
@ eProcessed_transit_peptide
Definition: Prot_ref_.hpp:100
const TPub & GetPub(void) const
Get the variant data.
Definition: Pub_set_.hpp:386
list< CRef< CPub > > TPub
Definition: Pub_set_.hpp:159
bool IsPub(void) const
Check if variant Pub is selected.
Definition: Pub_set_.hpp:380
const TAnticodon & GetAnticodon(void) const
Get the Anticodon member data.
Definition: Trna_ext_.hpp:649
TType GetType(void) const
Get the Type member data.
Definition: RNA_ref_.hpp:529
TNcbi8aa GetNcbi8aa(void) const
Get the variant data.
Definition: Trna_ext_.hpp:543
const TAa & GetAa(void) const
Get the Aa member data.
Definition: Trna_ext_.hpp:603
const TCodon & GetCodon(void) const
Get the Codon member data.
Definition: Trna_ext_.hpp:624
bool IsSetAa(void) const
Check if a value has been assigned to Aa data member.
Definition: Trna_ext_.hpp:591
bool IsTRNA(void) const
Check if variant TRNA is selected.
Definition: RNA_ref_.hpp:498
bool IsSetAnticodon(void) const
Location of anticodon. Check if a value has been assigned to Anticodon data member.
Definition: Trna_ext_.hpp:637
EType
type of RNA feature
Definition: RNA_ref_.hpp:95
bool IsSetExt(void) const
Generic fields for ncRNA, tmRNA, miscRNA. Check if a value has been assigned to Ext data member.
Definition: RNA_ref_.hpp:604
TNcbieaa GetNcbieaa(void) const
Get the variant data.
Definition: Trna_ext_.hpp:516
bool IsGen(void) const
Check if variant Gen is selected.
Definition: RNA_ref_.hpp:504
TIupacaa GetIupacaa(void) const
Get the variant data.
Definition: Trna_ext_.hpp:489
list< int > TCodon
Definition: Trna_ext_.hpp:306
bool CanGetAnticodon(void) const
Check if it is safe to call GetAnticodon method.
Definition: Trna_ext_.hpp:643
const TGen & GetGen(void) const
Get the variant data.
Definition: RNA_ref_.cpp:156
bool IsSetType(void) const
Check if a value has been assigned to Type data member.
Definition: RNA_ref_.hpp:510
bool IsSetClass(void) const
For ncRNAs, the class of non-coding RNA; examples: antisense_RNA, guide_RNA, snRNA. Check if a value has been assigned to Class data member.
Definition: RNA_gen_.hpp:247
E_Choice Which(void) const
Which variant is currently selected.
Definition: Trna_ext_.hpp:454
const TExt & GetExt(void) const
Get the Ext member data.
Definition: RNA_ref_.hpp:616
const TTRNA & GetTRNA(void) const
Get the variant data.
Definition: RNA_ref_.cpp:134
const TClass & GetClass(void) const
Get the Class member data.
Definition: RNA_gen_.hpp:259
@ e_not_set
No variant selected.
Definition: RNA_ref_.hpp:133
@ e_Name
for naming "other" type
Definition: RNA_ref_.hpp:134
@ eType_scRNA
will become ncRNA, with RNA-gen.class = scRNA
Definition: RNA_ref_.hpp:102
@ eType_snoRNA
will become ncRNA, with RNA-gen.class = snoRNA
Definition: RNA_ref_.hpp:103
@ eType_snRNA
will become ncRNA, with RNA-gen.class = snRNA
Definition: RNA_ref_.hpp:101
bool CanGetDbxref(void) const
Check if it is safe to call GetDbxref method.
Definition: Seq_feat_.hpp:1327
const TVal & GetVal(void) const
Get the Val member data.
Definition: Gb_qual_.hpp:259
const TKey & GetKey(void) const
Get the Key member data.
Definition: Imp_feat_.hpp:259
bool IsSetLoc(void) const
Original location string. Check if a value has been assigned to Loc data member.
Definition: Imp_feat_.hpp:294
bool IsSetComment(void) const
Check if a value has been assigned to Comment data member.
Definition: Seq_feat_.hpp:1037
vector< CRef< CDbtag > > TDbxref
Definition: Seq_feat_.hpp:123
const TPub & GetPub(void) const
Get the variant data.
bool IsSetData(void) const
The specific data. Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsSetQual(void) const
Qualifiers. Check if a value has been assigned to Qual data member.
Definition: Seq_feat_.hpp:1135
E_Choice Which(void) const
Which variant is currently selected.
bool IsBond(void) const
Check if variant Bond is selected.
bool IsProt(void) const
Check if variant Prot is selected.
bool IsCdregion(void) const
Check if variant Cdregion is selected.
bool IsImp(void) const
Check if variant Imp is selected.
const TCit & GetCit(void) const
Get the Cit member data.
Definition: Seq_feat_.hpp:1240
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Seq_feat_.hpp:1147
bool IsSetPartial(void) const
incomplete in some way? Check if a value has been assigned to Partial data member.
Definition: Seq_feat_.hpp:943
bool IsSetKey(void) const
Check if a value has been assigned to Key data member.
Definition: Imp_feat_.hpp:247
bool IsSetXref(void) const
Cite other relevant features. Check if a value has been assigned to Xref data member.
Definition: Seq_feat_.hpp:1296
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
E_Choice
Choice variants.
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsPub(void) const
Check if variant Pub is selected.
bool IsSetExcept_text(void) const
Explain if except=TRUE. Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
const TDbxref & GetDbxref(void) const
Get the Dbxref member data.
Definition: Seq_feat_.hpp:1333
bool CanGetVal(void) const
Check if it is safe to call GetVal method.
Definition: Gb_qual_.hpp:253
bool IsHet(void) const
Check if variant Het is selected.
bool IsSetExp_ev(void) const
Check if a value has been assigned to Exp_ev data member.
Definition: Seq_feat_.hpp:1249
const TBiosrc & GetBiosrc(void) const
Get the variant data.
bool CanGetExcept_text(void) const
Check if it is safe to call GetExcept_text method.
Definition: Seq_feat_.hpp:1399
TPseudo GetPseudo(void) const
Get the Pseudo member data.
Definition: Seq_feat_.hpp:1365
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
bool IsSetQual(void) const
Check if a value has been assigned to Qual data member.
Definition: Gb_qual_.hpp:200
bool CanGetExcept(void) const
Check if it is safe to call GetExcept method.
Definition: Seq_feat_.hpp:996
bool IsSetPseudo(void) const
annotated on pseudogene? Check if a value has been assigned to Pseudo data member.
Definition: Seq_feat_.hpp:1346
const TComment & GetComment(void) const
Get the Comment member data.
Definition: Seq_feat_.hpp:1049
bool IsSetCit(void) const
Citations for this feature. Check if a value has been assigned to Cit data member.
Definition: Seq_feat_.hpp:1228
bool IsBiosrc(void) const
Check if variant Biosrc is selected.
const TGene & GetGene(void) const
Get the variant data.
const Tdata & Get(void) const
Get the member data.
TPartial GetPartial(void) const
Get the Partial member data.
Definition: Seq_feat_.hpp:962
const TProt & GetProt(void) const
Get the variant data.
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
const TXref & GetXref(void) const
Get the Xref member data.
Definition: Seq_feat_.hpp:1308
vector< CRef< CSeqFeatXref > > TXref
Definition: Seq_feat_.hpp:122
vector< CRef< CGb_qual > > TQual
Definition: Seq_feat_.hpp:117
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Gb_qual_.hpp:212
const TRna & GetRna(void) const
Get the variant data.
bool IsSetDbxref(void) const
Support for xref to other databases. Check if a value has been assigned to Dbxref data member.
Definition: Seq_feat_.hpp:1321
bool IsSetVal(void) const
Check if a value has been assigned to Val data member.
Definition: Gb_qual_.hpp:247
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
bool CanGetQual(void) const
Check if it is safe to call GetQual method.
Definition: Gb_qual_.hpp:206
const TLoc & GetLoc(void) const
Get the Loc member data.
Definition: Imp_feat_.hpp:306
bool IsRna(void) const
Check if variant Rna is selected.
TExp_ev GetExp_ev(void) const
Get the Exp_ev member data.
Definition: Seq_feat_.hpp:1268
const TImp & GetImp(void) const
Get the variant data.
bool IsSetLocation(void) const
Feature made from (this location). Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
@ e_Het
cofactor, prosthetic grp, etc, bound to seq
@ e_Region
named region (globin locus)
@ e_Seq
to annotate origin from another seq
@ e_Txinit
transcription initiation
@ e_Num
a numbering system
@ e_Pub
publication applies to this seq
@ e_User
user defined structure
@ e_Rsite
restriction site (for maps really)
@ e_Comment
just a comment
@ e_Non_std_residue
non-standard residue here in seq
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
const TName & GetName(void) const
Get the Name member data.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
TFrom GetFrom(void) const
Get the From member data.
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
bool CanGetName(void) const
Check if it is safe to call GetName method.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
bool CanGetAccession(void) const
Check if it is safe to call GetAccession method.
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
TTo GetTo(void) const
Get the To member data.
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
bool IsSetName(void) const
Check if a value has been assigned to Name data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_other
Definition: Na_strand_.hpp:70
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both_rev
in reverse orientation
Definition: Na_strand_.hpp:69
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Null
not placed
Definition: Seq_loc_.hpp:98
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
bool IsSetCompleteness(void) const
Check if a value has been assigned to Completeness data member.
Definition: MolInfo_.hpp:569
bool CanGetBiomol(void) const
Check if it is safe to call GetBiomol method.
Definition: MolInfo_.hpp:428
ERepr
representation class
Definition: Seq_inst_.hpp:91
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsSetExt(void) const
Extensions for special types. Check if a value has been assigned to Ext data member.
Definition: Seq_inst_.hpp:826
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_inst_.hpp:838
TBiomol GetBiomol(void) const
Get the Biomol member data.
Definition: MolInfo_.hpp:447
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_right
missing 3' or COOH end
Definition: MolInfo_.hpp:159
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ e_Loc
point to a sequence
Definition: Delta_seq_.hpp:89
char * buf
int i
int len
const TrnaAa taa[]
Definition: loadfeat.cpp:126
static void text(MDB_val *v)
Definition: mdb_dump.c:62
range(_Ty, _Ty) -> range< _Ty >
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::KEY key
const CharType(& source)[N]
Definition: pointer.h:1149
#define abs(a)
Definition: ncbi_heapmgr.c:130
const char * tag
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
T max(T x_, T y_)
T min(T x_, T y_)
CSeqVector GetSequenceFromLoc(const CSeq_loc &loc, CScope &scope, CBioseq_Handle::EVectorCoding coding=CBioseq_Handle::eCoding_Iupac)
Definition: utilities.cpp:203
bool IsResidue(unsigned char residue)
Definition: utilities.hpp:88
bool HasECnumberPattern(const string &str)
Definition: utilities.cpp:1088
bool IsNTNCNWACAccession(const string &acc)
Definition: utilities.cpp:2755
string GetBioseqIdLabel(const CBioseq &sq)
Definition: utilities.cpp:981
bool HasBadCharacter(const string &str)
Definition: utilities.cpp:755
CBioseq_set_Handle GetGenProdSetParent(const CBioseq_set_Handle &set)
Definition: utilities.cpp:570
bool EndsWithBadCharacter(const string &str)
Definition: utilities.cpp:768
string GetSequenceStringFromLoc(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:175
bool IsOrganelle(int genome)
Definition: utilities.cpp:2831
bool s_StringHasPMID(const string &str)
Definition: utilities.cpp:727
CSeqVector GetSequenceFromFeature(const CSeq_feat &feat, CScope &scope, CBioseq_Handle::EVectorCoding coding=CBioseq_Handle::eCoding_Iupac, bool product=false)
Definition: utilities.cpp:214
EAccessionFormatError ValidateAccessionString(const string &accession, bool require_version)
Definition: utilities.cpp:624
EAccessionFormatError
Definition: utilities.hpp:105
@ eAccessionFormat_missing_version
Definition: utilities.hpp:111
@ eAccessionFormat_valid
Definition: utilities.hpp:106
@ eAccessionFormat_bad_version
Definition: utilities.hpp:112
static const char * expected[]
Definition: bcp.c:42
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
Definition: pcre_exec.c:513
static const char * suffix[]
Definition: pcregrep.c:408
static const char * prefix[]
Definition: pcregrep.c:405
int offset
Definition: replacements.h:160
#define FOR_EACH_SEQID_ON_BIOSEQ(Itr, Var)
FOR_EACH_SEQID_ON_BIOSEQ EDIT_EACH_SEQID_ON_BIOSEQ.
Definition: seq_macros.hpp:308
#define FOR_EACH_GBQUAL_ON_FEATURE
#define FOR_EACH_GBQUAL_ON_SEQFEAT(Itr, Var)
FOR_EACH_GBQUAL_ON_SEQFEAT EDIT_EACH_GBQUAL_ON_SEQFEAT.
#define FOR_EACH_STRING_IN_VECTOR(Itr, Var)
FOR_EACH_STRING_IN_VECTOR EDIT_EACH_STRING_IN_VECTOR.
#define FOR_EACH_CHAR_IN_STRING(Itr, Var)
FOR_EACH_CHAR_IN_STRING EDIT_EACH_CHAR_IN_STRING.
bool ContainsSgml(const string &str)
static string s_AsciiString(const string &src)
static bool s_LocationStrandsIncompatible(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
CSingleFeatValidator * FeatValidatorFactory(const CSeq_feat &feat, CScope &scope, CValidError_imp &imp)
bool s_HasNamedQual(const CSeq_feat &feat, const string &qual)
const string kInferenceMessage[]
static bool s_StringConsistsOf(string str, string consist)
static string GetGeneticCodeName(int gcode)
const string kOrigProteinId
int s_LegalNcbieaaValues[]
static const char *const sc_BadProtNameText[]
CStaticArraySet< const char *, PCase_CStr > TBadProtNameSet
static bool s_EqualGene_ref(const CGene_ref &genomic, const CGene_ref &mrna)
static bool s_IsBioseqPartial(CBioseq_Handle bsh)
const char * GetAAName(unsigned char aa, bool is_ascii)
DEFINE_STATIC_ARRAY_MAP(TBadProtNameSet, sc_BadProtName, sc_BadProtNameText)
static const char * kAANames[]
bool HasGeneIdXref(const CMappedFeat &sf, const CObject_id &tag, bool &has_parent_gene_id)
static bool s_RptUnitIsBaseRange(string str, TSeqPos &from, TSeqPos &to)
static bool xf_IsDeltaLitOnly(CBioseq_Handle bsh)
bool CheckIntronAcceptor(ENa_strand strand, TConstSpliceSite acceptor)
bool CheckIntronSpliceSites(ENa_strand strand, TConstSpliceSite donor, TConstSpliceSite acceptor)
bool CheckIntronDonor(ENa_strand strand, TConstSpliceSite donor)
static const char * str(char *buf, int n)
Definition: stats.c:84
SAnnotSelector –.
Selector used in CSeqMap methods returning iterators.
Definition: seq_map_ci.hpp:113
atomic_bool CheckECNumFileStatus
size_t GetMRNATranslationProblems(const CSeq_feat &feat, size_t &mismatches, bool ignore_exceptions, CBioseq_Handle nuc, CBioseq_Handle rna, bool far_fetch, bool is_gpipe, bool is_genomic, CScope *scope)
@ eMRNAProblem_UnnecessaryException
@ eMRNAProblem_UnableToFetch
@ eMRNAProblem_TranscriptLenLess
@ eMRNAProblem_PolyATail95
@ eMRNAProblem_TranscriptLenMore
@ eMRNAProblem_TransFail
@ eMRNAProblem_ProductReplaced
@ eMRNAProblem_Mismatch
@ eMRNAProblem_ErroneousException
@ eMRNAProblem_PolyATail100
Modified on Thu Mar 28 17:11:39 2024 by modify_doxy.py rev. 669887