NCBI C++ ToolKit
single_feat_validator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: single_feat_validator.cpp 102723 2024-07-01 17:41:38Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  * validation of Seq_feat
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
42 
43 #include <serial/serialbase.hpp>
44 
45 #include <objmgr/seqdesc_ci.hpp>
48 #include <util/sgml_entity.hpp>
50 #include <atomic>
51 
54 BEGIN_SCOPE(validator)
55 using namespace sequence;
56 
58  : m_Feat(feat), m_Scope(scope), m_Imp(imp), m_ProductIsFar(false)
59 {
60 
61 }
62 
63 
65 {
66  if (!m_Feat.IsSetLocation()) {
68  "The feature is missing a location");
69  return;
70  }
71 
73  bool lowerSev = false;
75  if ( m_Feat.IsSetDbxref() ) {
77  const CDbtag& dbtag = **it;
78  if ( dbtag.GetDb() == "dbSNP" ) {
79  lowerSev = true;
80  }
81  }
82  }
83  }
85  (m_Feat.GetData().IsGene() || !m_Imp.IsGpipe()),
86  "Location", m_Feat, lowerSev);
87 
90 
91  if (m_Feat.IsSetProduct()) {
94  PostErr(eDiag_Error, eErr_SEQ_FEAT_SelfReferentialProduct, "Self-referential feature product");
95  }
97  }
98 
100 
102 
106  }
107 
109 
111 
112  if (m_Feat.IsSetQual()) {
113  for (auto it = m_Feat.GetQual().begin(); it != m_Feat.GetQual().end(); it++) {
114  x_ValidateGbQual(**it);
115  }
116  }
117 
119 
120  if (m_Feat.IsSetExp_ev() && m_Feat.GetExp_ev() > 0 &&
121  !x_HasNamedQual("inference") &&
122  !x_HasNamedQual("experiment") &&
125  "Inference or experiment qualifier missing but obsolete experimental evidence qualifier set");
126  }
127 
129 
132 
134 
136 
138 
140 }
141 
142 void CSingleFeatValidator::PostErr(EDiagSev sv, EErrType et, const string& msg)
143 {
144  m_Imp.PostErr(sv, et, msg, m_Feat);
145 }
146 
147 
150 {
151  if (loc.IsInt() || loc.IsWhole()) {
152  return m_Scope.GetBioseqHandle(loc);
153  }
154  CBioseq_Handle rval;
156  for (CSeq_loc_CI citer(loc); citer; ++citer) {
157  const CSeq_id& this_id = citer.GetSeq_id();
158  if (!prev || !prev->Equals(this_id)) {
159  rval = m_Scope.GetBioseqHandle(this_id);
160  if (rval) {
161  break;
162  }
163  prev.Reset(&this_id);
164  }
165  }
166  return rval;
167 }
168 
169 
171 {
172  if (!m_Feat.IsSetProduct()) {
173  return;
174  }
175  const CSeq_id& sid = GetId(m_Feat.GetProduct(), &m_Scope);
176 
177  switch (sid.Which()) {
178  case CSeq_id::e_Genbank:
179  case CSeq_id::e_Embl:
180  case CSeq_id::e_Ddbj:
181  case CSeq_id::e_Tpg:
182  case CSeq_id::e_Tpe:
183  case CSeq_id::e_Tpd:
184  {
185  const CTextseq_id* tsid = sid.GetTextseq_Id();
186  if (tsid) {
187  if (!tsid->CanGetAccession() && tsid->CanGetName()) {
188  if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
190  "Feature product should not put an accession in the Textseq-id 'name' slot");
191  } else {
193  "Feature product should not use "
194  "Textseq-id 'name' slot");
195  }
196  }
197  }
198  }
199  break;
200 
201  default:
202  break;
203  }
204 
205  if (m_ProductBioseq) {
207 
208  for (auto id : m_ProductBioseq.GetCompleteBioseq()->GetId()) {
209  if (id->Which() == sid.Which()) {
210  // check to make sure capitalization is the same
211  string from_seq = id->AsFastaString();
212  string from_loc = sid.AsFastaString();
213  if (!NStr::EqualCase(from_seq, from_loc) &&
214  NStr::EqualNocase(from_seq, from_loc)) {
216  "Capitalization change from product location on feature to product sequence");
217  }
218  }
219  switch (id->Which()) {
220  case CSeq_id::e_Genbank:
221  case CSeq_id::e_Embl:
222  case CSeq_id::e_Ddbj:
223  case CSeq_id::e_Tpg:
224  case CSeq_id::e_Tpe:
225  case CSeq_id::e_Tpd:
226  {
227  const CTextseq_id* tsid = id->GetTextseq_Id();
228  if (tsid) {
229  if (!tsid->IsSetAccession() && tsid->IsSetName()) {
230  if (ValidateAccessionString(tsid->GetName(), false) == eAccessionFormat_valid) {
232  "Protein bioseq has Textseq-id 'name' that "
233  "looks like it is derived from a nucleotide "
234  "accession");
235  } else {
237  "Protein bioseq has Textseq-id 'name' and no accession");
238  }
239  }
240  }
241  }
242  break;
243  default:
244  break;
245  }
246  }
247  }
248 }
249 
250 
252 {
253  // check for bond locations - only allowable in bond feature and under special circumstances for het
254  bool is_seqloc_bond = false;
255  if (feat.IsSetData()) {
256  if (feat.GetData().IsHet()) {
257  // heterogen can have mix of bonds with just "a" point specified */
258  for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
259  if (it.GetEmbeddingSeq_loc().IsBond()
260  && (!it.GetEmbeddingSeq_loc().GetBond().IsSetA()
261  || it.GetEmbeddingSeq_loc().GetBond().IsSetB())) {
262  is_seqloc_bond = true;
263  break;
264  }
265  }
266  } else if (!feat.GetData().IsBond()) {
267  for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
268  if (it.GetEmbeddingSeq_loc().IsBond()) {
269  is_seqloc_bond = true;
270  break;
271  }
272  }
273  }
274  } else {
275  for (CSeq_loc_CI it(feat.GetLocation()); it; ++it) {
276  if (it.GetEmbeddingSeq_loc().IsBond()) {
277  is_seqloc_bond = true;
278  break;
279  }
280  }
281  }
282  return is_seqloc_bond;
283 }
284 
285 
287 {
289  return;
290  }
291  bool both, both_rev;
292  x_LocHasStrandBoth(m_Feat.GetLocation(), both, both_rev);
293  if (both || both_rev) {
294  string suffix;
295  if (both && both_rev) {
296  suffix = "(forward and reverse)";
297  } else if (both) {
298  suffix = "(forward)";
299  } else if (both_rev) {
300  suffix = "(reverse)";
301  }
302 
304 
306  label + " may not be on both " + suffix + " strands");
307  }
308 }
309 
310 
311 void CSingleFeatValidator::x_LocHasStrandBoth(const CSeq_loc& loc, bool& both, bool& both_rev)
312 {
313  both = false;
314  both_rev = false;
315  for (CSeq_loc_CI it(loc); it; ++it) {
316  if (it.IsSetStrand()) {
317  ENa_strand s = it.GetStrand();
318  if (s == eNa_strand_both && !both) {
319  both = true;
320  } else if (s == eNa_strand_both_rev && !both_rev) {
321  both_rev = true;
322  }
323  }
324  if (both && both_rev) {
325  break;
326  }
327  }
328 }
329 
330 
331 bool HasGeneIdXref(const CMappedFeat& sf, const CObject_id& tag, bool& has_parent_gene_id)
332 {
333  has_parent_gene_id = false;
334  if (!sf.IsSetDbxref()) {
335  return false;
336  }
338  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID")) {
339  has_parent_gene_id = true;
340  if ((*it)->IsSetTag() && (*it)->GetTag().Equals(tag)) {
341  return true;
342  }
343  }
344  }
345  return false;
346 }
347 
348 
350 {
351  if (!m_Feat.IsSetDbxref()) {
352  return;
353  }
354 
355  // no tse, no feat-handle
356  auto tse = m_Imp.GetTSE_Handle();
357  if (!tse) {
358  return;
359  }
360 
361  CRef<feature::CFeatTree> feat_tree;
364  if ((*it)->IsSetDb() && NStr::EqualNocase((*it)->GetDb(), "GeneID") &&
365  (*it)->IsSetTag()) {
366  if (!feat_tree) {
368  }
369  if (feat_tree) {
370  CMappedFeat parent = feat_tree->GetParent(mf);
371  while (parent) {
372  bool has_parent_gene_id = false;
373  if (!HasGeneIdXref(parent, (*it)->GetTag(), has_parent_gene_id)) {
374  if (has_parent_gene_id ||
377  "GeneID mismatch");
378  }
379  }
380  parent = feat_tree->GetParent(parent);
381  }
382  }
383  }
384  }
385 
386 }
387 
388 
390 {
391  if (!m_Feat.IsSetCit()) {
392  return;
393  }
394 
395  if (m_Feat.GetCit().IsPub()) {
397  if ((*pi)->IsEquiv()) {
399  "Citation on feature has unexpected internal Pub-equiv");
400  return;
401  }
402  }
403  }
404 }
405 
406 
407 const string kInferenceMessage[] = {
408  "unknown error",
409  "empty inference string",
410  "bad inference prefix",
411  "bad inference body",
412  "single inference field",
413  "spaces in inference",
414  "possible comment in inference",
415  "same species misused",
416  "the value in the accession field is not legal. The only allowed value is accession.version, eg AF123456.1. Problem =",
417  "bad inference accession version",
418  "accession.version not public",
419  "bad accession type",
420  "unrecognized database",
421 };
422 
423 
425 {
426  if (!qual.IsSetQual()) {
427  return;
428  }
429  /* first check for anything other than replace */
430  if (!qual.IsSetVal() || NStr::IsBlank(qual.GetVal())) {
431  if (NStr::EqualNocase(qual.GetQual(), "replace")) {
432  /* ok for replace */
433  } else {
435  "Qualifier other than replace has just quotation marks");
436  if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
437  PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
438  }
439  }
440  if (NStr::EqualNocase(qual.GetQual(), "inference")) {
442  "Inference qualifier problem - empty inference string ()");
443  } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
444  PostErr(eDiag_Warning, eErr_SEQ_FEAT_InvalidPseudoQualifier, "/pseudogene value should not be empty");
445  }
446  } else if (NStr::EqualNocase(qual.GetQual(), "EC_number")) {
449  qual.GetVal() + " is not in proper EC_number format");
450  } else {
451  string ec_number = qual.GetVal();
454  switch (status) {
457  "EC_number " + ec_number + " was deleted");
458  break;
462  "EC_number " + ec_number + " was replaced");
463  break;
465  {
466  size_t pos = NStr::Find(ec_number, "n");
467  if (pos == string::npos || !isdigit(ec_number.c_str()[pos + 1])) {
469  ec_number + " is not a legal value for qualifier EC_number");
470  } else {
472  ec_number + " is not a legal preliminary value for qualifier EC_number");
473  }
474  }
475  break;
476  default:
477  break;
478  }
479  }
480  } else if (NStr::EqualNocase(qual.GetQual(), "inference")) {
481  /* TODO: Validate inference */
482  if (! m_Imp.IgnoreInferences()) {
483  string val;
484  if (qual.IsSetVal()) {
485  val = qual.GetVal();
486  }
489  if (NStr::IsBlank(val)) {
490  val = "?";
491  }
493  "Inference qualifier problem - " + kInferenceMessage[(int)rsult] + " ("
494  + val + ")");
495  }
496  }
497  } else if (NStr::EqualNocase(qual.GetQual(), "pseudogene")) {
501  "/pseudogene value should not be '" + qual.GetVal() + "'", m_Feat);
502  }
503  } else if (NStr::EqualNocase(qual.GetQual(), "number")) {
504  bool has_space = false;
505  bool has_char_after_space = false;
506  ITERATE(string, it, qual.GetVal()) {
507  if (isspace((unsigned char)(*it))) {
508  has_space = true;
509  } else if (has_space) {
510  // non-space after space
511  has_char_after_space = true;
512  break;
513  }
514  }
515  if (has_char_after_space) {
517  "Number qualifiers should not contain spaces");
518  }
519  }
520  if (qual.IsSetVal() && ContainsSgml(qual.GetVal())) {
522  "feature qualifier " + qual.GetVal() + " has SGML");
523  }
524 
525 }
526 
527 
529 {
530 
531  bool expected{true};
532  if (m_Imp.SetContext().CheckECNumFileStatus.compare_exchange_strong(expected,false)) {
535  "Unable to find EC number file 'ecnum_ambiguous.txt' in data directory");
536  }
539  "Unable to find EC number file 'ecnum_deleted.txt' in data directory");
540  }
543  "Unable to find EC number file 'ecnum_replaced.txt' in data directory");
544  }
547  "Unable to find EC number file 'ecnum_specific.txt' in data directory");
548  }
549  }
550 }
551 
552 
554 {
555  vector<TGoTermError> errors = GetGoTermErrors(m_Feat);
556  for (auto it : errors) {
558  it.first, it.second);
559  }
560 }
561 
562 
563 bool CSingleFeatValidator::x_HasNamedQual(const string& qual_name)
564 {
565  if (!m_Feat.IsSetQual()) {
566  return false;
567  }
569  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), qual_name)) {
570  return true;
571  }
572  }
573  return false;
574 }
575 
576 
578 {
579  if (!m_Feat.IsSetComment()) {
580  return;
581  }
582  const string& comment = m_Feat.GetComment();
583  if (m_Imp.IsSerialNumberInComment(comment)) {
585  "Feature comment may refer to reference by serial number - "
586  "attach reference specific comments to the reference "
587  "REMARK instead.", m_Feat);
588  }
589  if (ContainsSgml(comment)) {
591  "feature comment " + comment + " has SGML",
592  m_Feat);
593  }
594 }
595 
596 
598 {
599  unsigned int partial_prod = eSeqlocPartial_Complete,
600  partial_loc = eSeqlocPartial_Complete;
601 
602  bool is_partial = m_Feat.IsSetPartial() && m_Feat.GetPartial();
603  partial_loc = SeqLocPartialCheck(m_Feat.GetLocation(), &m_Scope);
604 
605  if (m_ProductBioseq) {
607  }
608 
609  if ((partial_loc != eSeqlocPartial_Complete) ||
610  (partial_prod != eSeqlocPartial_Complete) ||
611  is_partial) {
612 
613  // a feature on a partial sequence should be partial -- it often isn't
614  if (!is_partial &&
615  partial_loc != eSeqlocPartial_Complete &&
616  m_Feat.IsSetLocation() &&
617  m_Feat.GetLocation().IsWhole()) {
619  "On partial Bioseq, SeqFeat.partial should be TRUE");
620  }
621  // a partial feature, with complete location, but partial product
622  else if (is_partial &&
623  partial_loc == eSeqlocPartial_Complete &&
624  m_Feat.IsSetProduct() &&
625  m_Feat.GetProduct().IsWhole() &&
626  partial_prod != eSeqlocPartial_Complete) {
627  if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
628  // suppress in gpipe genomic
629  } else {
631  "When SeqFeat.product is a partial Bioseq, SeqFeat.location "
632  "should also be partial");
633  }
634  }
635  // gene on segmented set is now 'order', should also be partial
636  else if (m_Feat.GetData().IsGene() &&
637  !is_partial &&
638  partial_loc == eSeqlocPartial_Internal) {
640  "Gene of 'order' with otherwise complete location should "
641  "have partial flag set");
642  }
643  // inconsistent combination of partial/complete product,location,partial flag - part 1
644  else if (partial_prod == eSeqlocPartial_Complete && m_Feat.IsSetProduct()) {
645  // if not local bioseq product, lower severity
646  EDiagSev sev = eDiag_Warning;
647  bool is_far_fail = false;
649  sev = eDiag_Info;
651  is_far_fail = true;
652  }
653  }
654 
655  string str("Inconsistent: Product= complete, Location= ");
656  str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
657  str += "Feature.partial= ";
658  str += is_partial ? "TRUE" : "FALSE";
659  if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
660  // suppress for genomic gpipe
661  } else if (is_far_fail) {
663  } else {
665  }
666  }
667  // inconsistent combination of partial/complete product,location,partial flag - part 2
668  else if (partial_loc == eSeqlocPartial_Complete || !is_partial) {
669  string str("Inconsistent: ");
670  if (m_Feat.IsSetProduct()) {
671  str += "Product= ";
672  str += (partial_prod != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
673  }
674  str += "Location= ";
675  str += (partial_loc != eSeqlocPartial_Complete) ? "partial, " : "complete, ";
676  str += "Feature.partial= ";
677  str += is_partial ? "TRUE" : "FALSE";
678  if (m_Imp.IsGenomic() && m_Imp.IsGpipe()) {
679  // suppress for genomic gpipe
680  } else {
682  }
683  }
684  // 5' or 3' partial location giving unclassified partial product
685  else if ((((partial_loc & eSeqlocPartial_Start) != 0) ||
686  ((partial_loc & eSeqlocPartial_Stop) != 0)) &&
687  ((partial_prod & eSeqlocPartial_Other) != 0) &&
688  is_partial) {
690  "5' or 3' partial location should not have unclassified"
691  " partial in product molinfo descriptor");
692  }
693 
694  // note - in analogous C Toolkit function there is additional code for ensuring
695  // that partial intervals are partial at splice sites, gaps, or the ends of the
696  // sequence. This has been moved to CValidError_bioseq::ValidateFeatPartialInContext.
697  }
698 
699 }
700 
701 
703 {
704  if (x_HasSeqLocBond(m_Feat)) {
706  "Bond location should only be on bond features");
707  }
708 
709  // feature location should not be whole
710  if (m_Feat.GetLocation().IsWhole()) {
711  string prefix = "Feature";
712  if (m_Feat.IsSetData()) {
713  if (m_Feat.GetData().IsCdregion()) {
714  prefix = "CDS";
716  prefix = "mRNA";
717  }
718  }
719  PostErr(eDiag_Warning, eErr_SEQ_FEAT_WholeLocation, prefix + " may not have whole location");
720  }
721 
722  if (m_LocationBioseq) {
723  // look for mismatch in capitalization for IDs
724  CNcbiOstrstream os;
725  const CSeq_id *id = m_Feat.GetLocation().GetId();
726  if (id) {
727  id->WriteAsFasta(os);
728  string loc_id = CNcbiOstrstreamToString(os);
730  if ((*it)->IsGi() || (*it)->IsGibbsq() || (*it)->IsGibbmt()) {
731  continue;
732  }
733  CNcbiOstrstream os2;
734  (*it)->WriteAsFasta(os2);
735  string bs_id = CNcbiOstrstreamToString(os2);
736  if (NStr::EqualNocase(loc_id, bs_id) && !NStr::EqualCase(loc_id, bs_id)) {
738  "Sequence identifier in feature location differs in capitalization with identifier on Bioseq");
739  }
740  }
741  }
742  // look for protein features on the minus strand
746  "Feature on protein indicates negative strand");
747  }
748 
749  if (!m_Feat.GetData().IsImp()
750  || !m_Feat.GetData().GetImp().IsSetKey()
751  || !NStr::EqualNocase(m_Feat.GetData().GetImp().GetKey(), "gap")) {
752  try {
753  vector<TSeqPos> gap_starts;
754  size_t rval = x_CalculateLocationGaps(m_LocationBioseq, m_Feat.GetLocation(), gap_starts);
755  bool mostly_raw_ns = x_IsMostlyNs(m_Feat.GetLocation(), m_LocationBioseq);
756 
757  if ((rval & eLocationGapMostlyNs) || mostly_raw_ns) {
759  "Feature contains more than 50% Ns");
760  }
761  for (auto gap_start : gap_starts) {
763  "Feature begins or ends in gap starting at " + NStr::NumericToString(gap_start + 1));
764  }
765  if (rval & eLocationGapContainedInGap &&
768  "Feature inside sequence gap");
769  }
770  if (m_Feat.GetData().IsCdregion() || m_Feat.GetData().IsRna()) {
773  "Internal interval begins or ends in gap");
774  }
775  if (rval & eLocationGapCrossesUnknownGap) {
777  "Feature crosses gap of unknown length");
778  }
779  }
780  } catch (const CException &e) {
782  string("Exception while checking for intervals in gaps. EXCEPTION: ") +
783  e.what());
784  } catch (const std::exception&) {
785  }
786  }
787  }
788 
789 }
790 
791 
793 {
796  return true;
797  } else {
798  return false;
799  }
800 }
801 
802 
803 class CGapCache {
804 public:
805  CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh);
807  bool IsUnknownGap(size_t offset);
808  bool IsKnownGap(size_t offset);
809  bool IsGap(size_t offset);
810 
811 private:
812  typedef enum {
818  size_t m_NumUnknown;
819  size_t m_NumKnown;
820 };
821 
822 CGapCache::CGapCache(const CSeq_loc& loc, CBioseq_Handle bsh)
823 {
824  TSeqPos start = loc.GetStart(eExtreme_Positional);
825  TSeqPos stop = loc.GetStop(eExtreme_Positional);
826  CRange<TSeqPos> range(start, stop);
828  TSeqPos pos = start;
829  while (map_iter && pos <= stop) {
830  TSeqPos map_end = map_iter.GetPosition() + map_iter.GetLength();
831  if (map_iter.GetType() == CSeqMap::eSeqGap) {
832  for (; pos < map_end && pos <= stop; pos++) {
833  if (map_iter.IsUnknownLength()) {
834  m_Map[pos - start] = eGapType_unknown;
835  m_NumUnknown++;
836  } else {
837  m_Map[pos - start] = eGapType_known;
838  m_NumKnown++;
839  }
840  }
841  } else {
842  pos = map_end;
843  }
844  ++map_iter;
845  }
846 }
847 
848 bool CGapCache::IsGap(size_t pos)
849 {
850  if (m_Map.find(pos) != m_Map.end()) {
851  return true;
852  } else {
853  return false;
854  }
855 }
856 
857 
858 bool CGapCache::IsKnownGap(size_t pos)
859 {
860  TGapTypeMap::iterator it = m_Map.find(pos);
861  if (it == m_Map.end()) {
862  return false;
863  } else if (it->second == eGapType_known) {
864  return true;
865  } else {
866  return false;
867  }
868 }
869 
870 
871 bool CGapCache::IsUnknownGap(size_t pos)
872 {
873  TGapTypeMap::iterator it = m_Map.find(pos);
874  if (it == m_Map.end()) {
875  return false;
876  } else if (it->second == eGapType_unknown) {
877  return true;
878  } else {
879  return false;
880  }
881 }
882 
883 
885 
886 {
887  if ( bsh.IsSetInst_Ext() ) {
888  const CBioseq_Handle::TInst_Ext& ext = bsh.GetInst_Ext();
889  if ( ext.IsDelta() ) {
890  ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
891  if ( (*it)->IsLoc() ) {
892  return false;
893  }
894  }
895  }
896  }
897  return true;
898 }
899 
900 
901 size_t CSingleFeatValidator::x_CalculateLocationGaps(CBioseq_Handle bsh, const CSeq_loc& loc, vector<TSeqPos>& gap_starts)
902 {
903  size_t rval = eLocationGapNoProblems;
904  if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst().GetRepr() != CSeq_inst::eRepr_delta) {
905  return rval;
906  }
907  // look for features inside gaps, crossing unknown gaps, or starting or ending in gaps
908  // ignore gap features for this
909  int num_n = 0;
910  int num_real = 0;
911  int num_gap = 0;
912  int num_unknown_gap = 0;
913  bool first_in_gap = false, last_in_gap = false;
914  bool local_first_gap = false, local_last_gap = false;
915  bool startsOrEndsInGap = false;
916  bool first = true;
917 
918  for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
919  CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
920  CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
921  if (!vec.empty()) {
922  CBioseq_Handle ph;
923  bool match = false;
924  for (auto id_it : bsh.GetBioseqCore()->GetId()) {
925  if (id_it->Equals(loc_it.GetSeq_id())) {
926  match = true;
927  break;
928  }
929  }
930  if (match) {
931  ph = bsh;
932  } else {
933  ph = bsh.GetScope().GetBioseqHandle(*this_loc);
934  }
935  try {
936  CGapCache gap_cache(*this_loc, ph);
937  string vec_data;
938  vec.GetSeqData(0, vec.size(), vec_data);
939 
940  local_first_gap = false;
941  local_last_gap = false;
942  TSeqLength len = loc_it.GetRange().GetLength();
943  ENa_strand strand = loc_it.GetStrand();
944 
945  size_t pos = 0;
946  string::iterator it = vec_data.begin();
947  while (it != vec_data.end() && pos < len) {
948  bool is_gap = false;
949  bool unknown_length = false;
950  if (strand == eNa_strand_minus) {
951  if (gap_cache.IsKnownGap(len - pos - 1)) {
952  is_gap = true;
953  } else if (gap_cache.IsUnknownGap(len - pos - 1)) {
954  is_gap = true;
955  unknown_length = true;
956  }
957  } else {
958  if (gap_cache.IsKnownGap(pos)) {
959  is_gap = true;
960  } else if (gap_cache.IsUnknownGap(pos)) {
961  is_gap = true;
962  unknown_length = true;
963  }
964 
965  }
966  if (is_gap) {
967  if (pos == 0) {
968  local_first_gap = true;
969  } else if (pos == len - 1) {
970  local_last_gap = true;
971  }
972  if (unknown_length) {
973  num_unknown_gap++;
974  } else {
975  num_gap++;
976  }
977  } else if (*it == 'N') {
978  num_n++;
979  } else {
980  num_real++;
981  }
982  ++it;
983  ++pos;
984  }
985  } catch (CException&/* ex*/) {
986  /*
987  PostErr(eDiag_Fatal, eErr_INTERNAL_Exception,
988  string("Exception while checking for intervals in gaps. EXCEPTION: ") +
989  ex.what(), feat);
990  */
991  }
992  }
993  if (first) {
994  first_in_gap = local_first_gap;
995  first = false;
996  }
997  last_in_gap = local_last_gap;
998  if (local_first_gap || local_last_gap) {
999  startsOrEndsInGap = true;
1000  }
1001  }
1002 
1003  if (num_real == 0 && num_n == 0) {
1004  TSeqPos start = loc.GetStart(eExtreme_Positional);
1005  TSeqPos stop = loc.GetStop(eExtreme_Positional);
1006  if ((start == 0 || CSeqMap_CI(bsh, SSeqMapSelector(), start - 1).GetType() != CSeqMap::eSeqGap)
1007  && (stop == bsh.GetBioseqLength() - 1 || CSeqMap_CI(bsh, SSeqMapSelector(), stop + 1).GetType() != CSeqMap::eSeqGap)) {
1009  }
1010  }
1011 
1012 
1013  if (num_gap == 0 && num_unknown_gap == 0 && num_n == 0) {
1014  // ignore features that do not cover any gap characters
1015  } else if (first_in_gap || last_in_gap) {
1016  if (num_real > 0) {
1017  TSeqPos gap_start = x_FindStartOfGap(bsh,
1018  first_in_gap ? loc.GetStart(eExtreme_Biological)
1019  : loc.GetStop(eExtreme_Biological), &(bsh.GetScope()));
1020  gap_starts.push_back(gap_start);
1021  } else {
1023  }
1024  } else if (num_real == 0 && num_gap == 0 && num_unknown_gap == 0 && num_n >= 50) {
1026  } else if (startsOrEndsInGap) {
1028  } else if (num_unknown_gap > 0) {
1030  }
1031 
1032  if (num_n > num_real && xf_IsDeltaLitOnly(bsh)) {
1033  rval |= eLocationGapMostlyNs;
1034  }
1035 
1036  return rval;
1037 }
1038 
1039 
1041 {
1042  if (!bsh || !bsh.IsNa() || !bsh.IsSetInst_Repr()
1044  || !bsh.GetInst().IsSetExt()
1045  || !bsh.GetInst().GetExt().IsDelta()) {
1046  return bsh.GetInst_Length();
1047  }
1048  TSeqPos offset = 0;
1049 
1051  TSeqPos len = 0;
1052  if ((*it)->IsLiteral()) {
1053  len = (*it)->GetLiteral().GetLength();
1054  } else if ((*it)->IsLoc()) {
1055  len = sequence::GetLength((*it)->GetLoc(), scope);
1056  }
1057  if (pos >= offset && pos < offset + len) {
1058  return offset;
1059  } else {
1060  offset += len;
1061  }
1062  }
1063  return offset;
1064 }
1065 
1066 
1068 {
1069  if (!bsh.IsNa() || !bsh.IsSetInst_Repr() || bsh.GetInst_Repr() != CSeq_inst::eRepr_raw) {
1070  return false;
1071  }
1072  int num_n = 0;
1073  int real_bases = 0;
1074 
1075  for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
1076  CConstRef<CSeq_loc> this_loc = loc_it.GetRangeAsSeq_loc();
1077  CSeqVector vec = GetSequenceFromLoc(*this_loc, bsh.GetScope());
1078  if (!vec.empty()) {
1079  CBioseq_Handle ph;
1080  bool match = false;
1081  for (auto id_it : bsh.GetBioseqCore()->GetId()) {
1082  if (id_it->Equals(loc_it.GetSeq_id())) {
1083  match = true;
1084  break;
1085  }
1086  }
1087  if (match) {
1088  ph = bsh;
1089  } else {
1090  ph = bsh.GetScope().GetBioseqHandle(*this_loc);
1091  }
1093  string vec_data;
1094  try {
1095  vec.GetSeqData(0, vec.size(), vec_data);
1096 
1097  int pos = 0;
1098  string::iterator it = vec_data.begin();
1099  while (it != vec_data.end()) {
1100  if (*it == 'N') {
1101  CSeqMap_CI map_iter(ph, SSeqMapSelector(), offset + pos);
1102  if (map_iter.GetType() == CSeqMap::eSeqGap) {
1103  } else {
1104  num_n++;
1105  }
1106  } else {
1107  if ((unsigned)(*it + 1) <= 256 && isalpha(*it)) {
1108  real_bases++;
1109  }
1110  }
1111  ++it;
1112  ++pos;
1113  }
1114  } catch (const CException& ) {
1115  } catch (const std::exception& ) {
1116  }
1117  }
1118  }
1119 
1120  return (num_n > real_bases);
1121 }
1122 
1123 
1125 {
1126  CBioseq_Handle prot_handle;
1127  is_far = false;
1128  if (!m_Feat.IsSetProduct()) {
1129  return prot_handle;
1130  }
1131  const CSeq_id* protid = nullptr;
1132  try {
1133  protid = &sequence::GetId(m_Feat.GetProduct(), &m_Scope);
1134  } catch (CException&) {}
1135 
1136  if (!protid) {
1137  return prot_handle;
1138  }
1139 
1140  if (m_Imp.IsHugeFileMode()) {
1141  if (look_far && m_Imp.IsFarSequence(*protid)) {
1142  prot_handle = m_Scope.GetBioseqHandle(*protid);
1143  if (prot_handle) {
1144  is_far = true;
1145  }
1146  return prot_handle;
1147  }
1148  return m_Imp.GetLocalBioseqHandle(*protid);
1149  }
1150 
1151  // try "local" scope
1153  if (!prot_handle) {
1154  prot_handle = m_Scope.GetBioseqHandleFromTSE(*protid, m_Imp.GetTSE_Handle());
1155  }
1156  if (!prot_handle && look_far) {
1157  prot_handle = m_Scope.GetBioseqHandle(*protid);
1158  if (prot_handle) {
1159  is_far = true;
1160  }
1161  }
1162 
1163  return prot_handle;
1164 }
1165 
1166 
1168 {
1169  bool look_far = false;
1170 
1171  if (m_Feat.IsSetData()) {
1172  if (m_Feat.GetData().IsCdregion()) {
1173  look_far = m_Imp.IsFarFetchCDSproducts();
1174  } else if (m_Feat.GetData().IsRna()) {
1175  look_far = m_Imp.IsFarFetchMRNAproducts();
1176  } else {
1177  look_far = m_Imp.IsRemoteFetch();
1178  }
1179  }
1180 
1181  return x_GetFeatureProduct(look_far, is_far);
1182 }
1183 
1184 
1186 {
1188  (!m_Feat.IsSetExcept() || !m_Feat.GetExcept())) {
1190  "Exception text is present, but exception flag is not set");
1191  } else if (m_Feat.IsSetExcept() && m_Feat.GetExcept() &&
1194  "Exception flag is set, but exception text is empty");
1195  }
1196  if (m_Feat.IsSetExcept_text() && !m_Feat.GetExcept_text().empty()) {
1198  }
1199 }
1200 
1201 
1203 {
1204  if (text.empty()) return;
1205 
1206  EDiagSev sev = eDiag_Error;
1207  bool found = false;
1208 
1209  string str;
1210 
1211  bool reasons_in_cit = false;
1212  bool annotated_by_transcript_or_proteomic = false;
1213  bool redundant_with_comment = false;
1214  bool refseq_except = false;
1215  vector<string> exceptions;
1216  NStr::Split(text, ",", exceptions, 0);
1217  ITERATE(vector<string>, it, exceptions) {
1218  found = false;
1219  str = NStr::TruncateSpaces(*it);
1220  if (NStr::IsBlank(*it)) {
1221  continue;
1222  }
1224 
1225  if (found) {
1226  if (NStr::EqualNocase(str, "reasons given in citation")) {
1227  reasons_in_cit = true;
1228  } else if (NStr::EqualNocase(str, "annotated by transcript or proteomic data")) {
1229  annotated_by_transcript_or_proteomic = true;
1230  }
1231  }
1232  if (!found) {
1233  if (m_LocationBioseq) {
1234  bool check_refseq = false;
1235  if (m_Imp.IsRefSeqConventions()) {
1236  check_refseq = true;
1237  } else if (GetGenProdSetParent(m_LocationBioseq)) {
1238  check_refseq = true;
1239  } else {
1241  if ((*id_it)->IsOther()) {
1242  check_refseq = true;
1243  break;
1244  }
1245  }
1246  }
1247 
1248  if (check_refseq) {
1250  found = true;
1251  refseq_except = true;
1252  }
1253  }
1254  }
1255  }
1256  if (!found) {
1257  // lower to warning for genomic refseq
1258  const CSeq_id *id = m_Feat.GetLocation().GetId();
1259  if ((id && IsNTNCNWACAccession(*id)) ||
1261  sev = eDiag_Warning;
1262  }
1264  str + " is not a legal exception explanation");
1265  }
1266  if (m_Feat.IsSetComment() && NStr::Find(m_Feat.GetComment(), str) != string::npos) {
1267  if (!NStr::EqualNocase(str, "ribosomal slippage") &&
1268  !NStr::EqualNocase(str, "trans-splicing") &&
1269  !NStr::EqualNocase(str, "RNA editing") &&
1270  !NStr::EqualNocase(str, "artificial location")) {
1271  redundant_with_comment = true;
1272  } else if (NStr::EqualNocase(m_Feat.GetComment(), str)) {
1273  redundant_with_comment = true;
1274  }
1275  }
1276  }
1277  if (redundant_with_comment) {
1279  "Exception explanation text is also found in feature comment");
1280  }
1281  if (refseq_except) {
1282  bool found_just_the_exception = CSeq_feat::IsExceptionTextRefSeqOnly(str);
1283 
1284  if (!found_just_the_exception) {
1286  "Genome processing exception should not be combined with other explanations");
1287  }
1288  }
1289 
1290  if (reasons_in_cit && !m_Feat.IsSetCit()) {
1292  "Reasons given in citation exception does not have the required citation");
1293  }
1294  if (annotated_by_transcript_or_proteomic) {
1295  bool has_inference = false;
1297  if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "inference")) {
1298  has_inference = true;
1299  break;
1300  }
1301  }
1302  if (!has_inference) {
1304  "Annotated by transcript or proteomic data exception does not have the required inference qualifier");
1305  }
1306  }
1307 }
1308 
1309 
1310 const string kOrigProteinId = "orig_protein_id";
1311 
1313 {
1314  if (!m_Feat.GetData().IsRna()) {
1315  return true;
1316  } else {
1317  return false;
1318  }
1319 }
1320 
1321 
1323 {
1324  if (!m_Feat.IsSetQual()) {
1325  return;
1326  }
1327  string key;
1328  bool is_imp = false;
1330 
1331  if (m_Feat.IsSetData() && m_Feat.GetData().IsImp()) {
1332  is_imp = true;
1333  key = m_Feat.GetData().GetImp().GetKey();
1334  if (ftype == CSeqFeatData::eSubtype_imp && NStr::EqualNocase (key, "gene")) {
1336  } else if (ftype == CSeqFeatData::eSubtype_imp) {
1338  } else if (ftype == CSeqFeatData::eSubtype_Imp_CDS
1340  || ftype == CSeqFeatData::eSubtype_org) {
1342  }
1343  }
1344  else {
1345  key = m_Feat.GetData().GetKey();
1346  if (NStr::Equal (key, "Gene")) {
1347  key = "gene";
1348  }
1349  }
1350 
1351  for (auto gbq : m_Feat.GetQual()) {
1352  const string& qual_str = gbq->GetQual();
1353 
1354  if ( NStr::Equal (qual_str, "gsdb_id")) {
1355  continue;
1356  }
1357  auto gbqual_and_value = CSeqFeatData::GetQualifierTypeAndValue(qual_str);
1358  auto gbqual = gbqual_and_value.first;
1359  bool same_case = (gbqual == CSeqFeatData::eQual_bad) || NStr::EqualCase(gbqual_and_value.second, qual_str);
1360 
1361  if ( !same_case ) {
1363  qual_str + " is improperly capitalized");
1364  }
1365 
1366  if ( gbqual == CSeqFeatData::eQual_bad ) {
1367  if (is_imp) {
1368  if (!gbq->IsSetQual() || NStr::IsBlank(gbq->GetQual())) {
1370  "NULL qualifier");
1371  }
1372  else {
1374  "Unknown qualifier " + qual_str);
1375  }
1376  } else if (NStr::Equal(qual_str, kOrigProteinId)) {
1377  if (x_ReportOrigProteinId()) {
1379  }
1380  } else {
1382  if (chs == CSeqFeatData::e_Gene) {
1383  if (NStr::Equal(qual_str, "gen_map")
1384  || NStr::Equal(qual_str, "cyt_map")
1385  || NStr::Equal(qual_str, "rad_map")) {
1386  continue;
1387  }
1388  } else if (chs == CSeqFeatData::e_Cdregion) {
1389  if (NStr::Equal(qual_str, "orig_transcript_id")) {
1390  continue;
1391  }
1392  } else if (chs == CSeqFeatData::e_Rna) {
1393  if (NStr::Equal(qual_str, "orig_transcript_id")) {
1394  continue;
1395  }
1396  }
1397  PostErr(eDiag_Warning, eErr_SEQ_FEAT_UnknownFeatureQual, "Unknown qualifier " + qual_str);
1398  }
1399  } else {
1400  if ( ftype != CSeqFeatData::eSubtype_bad && !CSeqFeatData::IsLegalQualifier(ftype, gbqual) ) {
1403  "Wrong qualifier " + qual_str + " for feature " +
1404  key);
1405  }
1406  else if (ftype == CSeqFeatData::eSubtype_misc_feature &&
1407  gbqual == CSeqFeatData::eQual_feat_class && !m_Imp.IsRefSeq()) {
1409  "feat_class qualifier is only legal for RefSeq");
1410  }
1411 
1412  if (gbq->IsSetVal() && !NStr::IsBlank(gbq->GetVal())) {
1413  // validate value of gbqual
1414  const string& val = gbq->GetVal();
1415  switch (gbqual) {
1416 
1418  if (NStr::Find(val, ",") != NPOS) {
1420  "Compound '" + val + "' must be split into separate instances of qualifier " + qual_str);
1421  }
1424  val + " is not a legal value for qualifier " + qual_str);
1425  }
1426  break;
1427 
1430  break;
1431 
1434  break;
1435 
1438  break;
1439 
1442  break;
1443 
1445  if (is_imp) {
1446  x_ValidateReplaceQual(key, qual_str, val);
1447  }
1448  break;
1449 
1452  if (is_imp && !CGb_qual::IsLegalMobileElementValue(val)) {
1454  val + " is not a legal value for qualifier " + qual_str);
1455  }
1456  break;
1457 
1460  break;
1461 
1463  if (is_imp && ftype == CSeqFeatData::eSubtype_misc_feature
1464  && NStr::EqualCase (val, "Vector Contamination")) {
1466  "Vector Contamination region should be trimmed from sequence");
1467  }
1468  break;
1469 
1471  if (!is_imp) {
1473  if (chs == CSeqFeatData::e_Gene) {
1475  "A product qualifier is not used on a gene feature");
1476  }
1477  }
1478  break;
1479 
1480  // for VR-825
1483  "locus-tag values should be on genes");
1484  break;
1485  default:
1486  break;
1487  } // end of switch statement
1488  }
1489  }
1490  }
1491 }
1492 
1493 
1494 void CSingleFeatValidator::x_ValidateRptUnitVal (const string& val, const string& key)
1495 {
1496  bool /* found = false, */ multiple_rpt_unit = false;
1497  ITERATE(string, it, val) {
1498  if ( *it <= ' ' ) {
1499  /* found = true; */
1500  } else if ( *it == '(' || *it == ')' ||
1501  *it == ',' || *it == '.' ||
1502  isdigit((unsigned char)(*it)) ) {
1503  multiple_rpt_unit = true;
1504  }
1505  }
1506  /*
1507  if ( found ||
1508  (!multiple_rpt_unit && val.length() > 48) ) {
1509  error = true;
1510  }
1511  */
1512  if ( NStr::CompareNocase(key, "repeat_region") == 0 &&
1513  !multiple_rpt_unit ) {
1514  if (val.length() <= GetLength(m_Feat.GetLocation(), &m_Scope) ) {
1515  bool just_nuc_letters = true;
1516  static const string nuc_letters = "ACGTNacgtn";
1517  ITERATE(string, it, val) {
1518  if ( nuc_letters.find(*it) == NPOS ) {
1519  just_nuc_letters = false;
1520  break;
1521  }
1522  }
1523 
1524  if ( just_nuc_letters ) {
1526  if ( !vec.empty() ) {
1527  string vec_data;
1528  vec.GetSeqData(0, vec.size(), vec_data);
1529  if (NStr::FindNoCase (vec_data, val) == string::npos) {
1531  "repeat_region /rpt_unit and underlying "
1532  "sequence do not match");
1533  }
1534  }
1535  }
1536  } else {
1538  "Length of rpt_unit_seq is greater than feature length");
1539  }
1540  }
1541 }
1542 
1543 
1544 void CSingleFeatValidator::x_ValidateRptUnitSeqVal (const string& val, const string& key)
1545 {
1546  // do validation common to rpt_unit
1548 
1549  // do the validation specific to rpt_unit_seq
1550  const char *cp = val.c_str();
1551  bool badchars = false;
1552  while (*cp != 0 && !badchars) {
1553  if (*cp < ' ') {
1554  badchars = true;
1555  } else if (*cp != '(' && *cp != ')'
1556  && !isdigit (*cp) && !isalpha (*cp)
1557  && *cp != ',' && *cp != ';') {
1558  badchars = true;
1559  }
1560  cp++;
1561  }
1562  if (badchars) {
1564  "/rpt_unit_seq has illegal characters");
1565  }
1566 }
1567 
1568 
1569 static bool s_RptUnitIsBaseRange (string str, TSeqPos& from, TSeqPos& to)
1570 
1571 {
1572  if (str.length() > 25) {
1573  return false;
1574  }
1575  size_t pos = NStr::Find (str, "..");
1576  if (pos == string::npos) {
1577  return false;
1578  }
1579 
1580  int tmp_from, tmp_to;
1581  try {
1582  tmp_from = NStr::StringToInt (str.substr(0, pos));
1583  from = tmp_from;
1584  tmp_to = NStr::StringToInt (str.substr (pos + 2));
1585  to = tmp_to;
1586  } catch (const CException& ) {
1587  return false;
1588  } catch (const std::exception& ) {
1589  return false;
1590  }
1591  if (tmp_from < 0 || tmp_to < 0) {
1592  return false;
1593  }
1594  return true;
1595 }
1596 
1597 
1599 {
1600  TSeqPos from = kInvalidSeqPos, to = kInvalidSeqPos;
1601  if (!s_RptUnitIsBaseRange(val, from, to)) {
1603  "/rpt_unit_range is not a base range");
1604  } else {
1606  if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1608  "/rpt_unit_range is not within sequence length");
1609  } else {
1610  bool nulls_between = false;
1611  for ( CTypeConstIterator<CSeq_loc> lit = ConstBegin(m_Feat.GetLocation()); lit; ++lit ) {
1612  if ( lit->Which() == CSeq_loc::e_Null ) {
1613  nulls_between = true;
1614  }
1615  }
1616  if (nulls_between) {
1617  bool in_range = false;
1618  for ( CSeq_loc_CI it(m_Feat.GetLocation()); it; ++it ) {
1619  range = it.GetEmbeddingSeq_loc().GetTotalRange();
1620  if (from - 1 < range.GetFrom() || from - 1> range.GetTo() || to - 1 < range.GetFrom() || to - 1 > range.GetTo()) {
1621  } else {
1622  in_range = true;
1623  }
1624  }
1625  if (! in_range) {
1627  "/rpt_unit_range is not within ordered intervals");
1628  }
1629  }
1630  }
1631  }
1632 }
1633 
1634 
1636 {
1637  bool only_digits = true,
1638  has_spaces = false;
1639 
1640  ITERATE(string, it, val) {
1641  if ( isspace((unsigned char)(*it)) ) {
1642  has_spaces = true;
1643  }
1644  if ( !isdigit((unsigned char)(*it)) ) {
1645  only_digits = false;
1646  }
1647  }
1648  if (only_digits || has_spaces) {
1649  PostErr (eDiag_Error, eErr_SEQ_FEAT_InvalidQualifierValue, "Illegal value for qualifier label");
1650  }
1651 }
1652 
1653 
1655 {
1656  if (!NStr::StartsWith (val, "(")) {
1657  EAccessionFormatError valid_accession = ValidateAccessionString (val, true);
1658  if (valid_accession == eAccessionFormat_missing_version) {
1660  val + " accession missing version for qualifier compare");
1661  } else if (valid_accession == eAccessionFormat_bad_version) {
1663  val + " accession has bad version for qualifier compare");
1664  } else if (valid_accession != eAccessionFormat_valid) {
1666  val + " is not a legal accession for qualifier compare");
1667  } else if (m_Imp.IsINSDInSep() && NStr::Find (val, "_") != string::npos) {
1669  "RefSeq accession " + val + " cannot be used for qualifier compare");
1670  }
1671  }
1672 }
1673 
1674 
// Report whether every character of str also occurs in consist.
//
// An empty str is trivially accepted. The whole length of str is examined, so
// an embedded NUL character counts as an ordinary (and normally disallowed)
// character instead of silently ending the scan.
//
// Both arguments are taken by const reference; the function only reads them.
static bool s_StringConsistsOf (const string& str, const string& consist)
{
    return str.find_first_not_of (consist) == string::npos;
}
1689 
1690 
1691 void CSingleFeatValidator::x_ValidateReplaceQual(const string& key, const string& qual_str, const string& val)
1692 {
1693  if (m_LocationBioseq) {
1694  if (m_LocationBioseq.IsNa()) {
1695  if (NStr::Equal(key, "variation")) {
1696  if (!s_StringConsistsOf (val, "acgtACGT")) {
1698  val + " is not a legal value for qualifier " + qual_str
1699  + " - should only be composed of acgt unambiguous nucleotide bases");
1700  }
1701  } else if (!s_StringConsistsOf (val, "acgtmrwsykvhdbn")) {
1703  val + " is not a legal value for qualifier " + qual_str
1704  + " - should only be composed of acgtmrwsykvhdbn nucleotide bases");
1705  }
1706  } else if (m_LocationBioseq.IsAa()) {
1707  if (!s_StringConsistsOf (val, "acdefghiklmnpqrstuvwy*")) {
1709  val + " is not a legal value for qualifier " + qual_str
1710  + " - should only be composed of acdefghiklmnpqrstuvwy* amino acids");
1711  }
1712  }
1713 
1714  // if no point in location with fuzz, info if text matches sequence
1715  bool has_fuzz = false;
1716  for( objects::CSeq_loc_CI it(m_Feat.GetLocation()); it && !has_fuzz; ++it) {
1717  if (it.IsPoint() && (it.GetFuzzFrom() || it.GetFuzzTo())) {
1718  has_fuzz = true;
1719  }
1720  }
1721  if (!has_fuzz && val.length() == GetLength (m_Feat.GetLocation(), &m_Scope)) {
1722  try {
1724  string bases;
1725  nuc_vec.GetSeqData(0, nuc_vec.size(), bases);
1726  if (NStr::EqualNocase(val, bases)) {
1728  "/replace already matches underlying sequence (" + val + ")");
1729  }
1730  } catch (const CException& ) {
1731  } catch (const std::exception& ) {
1732  }
1733  }
1734  }
1735 }
1736 
1737 
1739 {
1740  if (HasBadCharacter (value)) {
1742  field_name + " contains undesired character");
1743  }
1744  if (EndsWithBadCharacter (value)) {
1746  field_name + " ends with undesired character");
1747  }
1748  if (NStr::EndsWith (value, "-")) {
1751  field_name + " ends with hyphen");
1752  }
1753 }
1754 
1755 
1756 void CSingleFeatValidator::ValidateSplice(bool gene_pseudo, bool check_all)
1757 {
1758  if (!m_LocationBioseq) {
1759  return;
1760  }
1761 
1762  CSpliceProblems splice_problems;
1763  splice_problems.CalculateSpliceProblems(m_Feat, check_all, gene_pseudo, m_LocationBioseq);
1764 
1765  if (splice_problems.AreErrorsUnexpected()) {
1767  x_ReportSpliceProblems(splice_problems, label);
1768  }
1769 
1770  if (splice_problems.IsExceptionUnnecessary()) {
1772  "feature has exception but passes splice site test");
1773  }
1774 }
1775 
1776 
1778 {
1779  EDiagSev sev = eDiag_Warning;
1780  if (m_Imp.IsGpipe() && m_Imp.IsGenomic()) {
1781  sev = eDiag_Info;
1782  } else if ((m_Imp.IsGPS() || m_Imp.IsRefSeq()) && !m_Imp.ReportSpliceAsError()) {
1783  sev = eDiag_Warning;
1784  }
1785  return sev;
1786 }
1787 
1788 
1790 {
1791  if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1793  "Bad sequence at splice donor after exon ending at position "
1794  + NStr::IntToString(problem.second + 1) + " of " + label);
1795  } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1797  "Splice donor consensus (GT) not found after exon ending at position "
1798  + NStr::IntToString(problem.second + 1) + " of " + label);
1799  }
1800 
1801 }
1802 
1803 
1805 {
1806  if (problem.first == CSpliceProblems::eSpliceSiteRead_BadSeq) {
1808  "Bad sequence at splice acceptor before exon starting at position "
1809  + NStr::IntToString(problem.second + 1) + " of " + label);
1810  } else if (problem.first == CSpliceProblems::eSpliceSiteRead_WrongNT) {
1812  "Splice acceptor consensus (AG) not found before exon starting at position "
1813  + NStr::IntToString(problem.second + 1) + " of " + label);
1814  }
1815 
1816 }
1817 
1818 
1820 (const CSpliceProblems& problems, const string& label)
1821 {
1822  const CSpliceProblems::TSpliceProblemList& donor_problems = problems.GetDonorProblems();
1823  for (auto it = donor_problems.begin(); it != donor_problems.end(); it++) {
1825  }
1826  const CSpliceProblems::TSpliceProblemList& acceptor_problems = problems.GetAcceptorProblems();
1827  for (auto it = acceptor_problems.begin(); it != acceptor_problems.end(); it++) {
1829  }
1830 }
1831 
1832 
1834 {
1835  if (bsh) {
1836  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
1837  if ((*it)->IsOther() && (*it)->GetOther().IsSetAccession()
1838  && NStr::StartsWith ((*it)->GetOther().GetAccession(), "NM_")) {
1839  return true;
1840  }
1841  }
1842  }
1843  return false;
1844 }
1845 
1846 
1848 {
1849  if (m_Feat.GetData().IsImp()) {
1850  return;
1851  }
1852  string key = m_Feat.GetData().GetKey();
1853 
1855 
1856  // look for mandatory qualifiers
1857  EDiagSev sev = eDiag_Warning;
1858 
1859  for (auto required : CSeqFeatData::GetMandatoryQualifiers(subtype))
1860  {
1861  bool found = false;
1862  if (m_Feat.IsSetQual()) {
1863  for (auto qual : m_Feat.GetQual()) {
1864  if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
1865  found = true;
1866  break;
1867  }
1868  }
1869  }
1870 
1871  if (!found) {
1872  if (required == CSeqFeatData::eQual_citation) {
1873  if (m_Feat.IsSetCit()) {
1874  found = true;
1875  } else if (m_Feat.IsSetComment() && NStr::EqualNocase (key, "conflict")) {
1876  // RefSeq allows conflict with accession in comment instead of sfp->cit
1878  if ((*it)->IsOther()) {
1879  found = true;
1880  break;
1881  }
1882  }
1883  }
1884  }
1885  }
1886  if (!found && (NStr::EqualNocase (key, "conflict") || NStr::EqualNocase (key, "old_sequence"))) {
1887  if (m_Feat.IsSetQual()) {
1888  for (auto qual : m_Feat.GetQual()) {
1889  if (qual->IsSetQual() && NStr::EqualNocase(qual->GetQual(), "compare")
1890  && qual->IsSetVal() && !NStr::IsBlank(qual->GetVal())) {
1891  found = true;
1892  break;
1893  }
1894  }
1895  }
1896  }
1897  if (!found && required == CSeqFeatData::eQual_ncRNA_class) {
1898  sev = eDiag_Error;
1899  if (m_Feat.GetData().IsRna() && m_Feat.GetData().GetRna().IsSetExt()
1900  && m_Feat.GetData().GetRna().GetExt().IsGen()
1903  found = true;
1904  }
1905  }
1906 
1907  if (!found) {
1909  "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
1910  " for feature " + key);
1911  }
1912  }
1913 }
1914 
1915 
1916 static bool s_LocationStrandsIncompatible (const CSeq_loc& loc1, const CSeq_loc& loc2, CScope * scope)
1917 {
1918  ENa_strand strand1 = loc1.GetStrand();
1919  ENa_strand strand2 = loc2.GetStrand();
1920 
1921  if (strand1 == strand2) {
1922  return false;
1923  }
1924  if ((strand1 == eNa_strand_unknown || strand1 == eNa_strand_plus) &&
1925  (strand2 == eNa_strand_unknown || strand2 == eNa_strand_plus)) {
1926  return false;
1927  }
1928  if (strand1 == eNa_strand_other) {
1929  ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1930  if (comp == eContains) {
1931  return false;
1932  }
1933  } else if (strand2 == eNa_strand_other) {
1934  ECompare comp = Compare(loc1, loc2, scope, fCompareOverlapping);
1935  if (comp == eContained) {
1936  return false;
1937  }
1938  }
1939 
1940  return true;
1941 }
1942 
1943 
1945 {
1946  bool bad_strand = s_LocationStrandsIncompatible(gene.GetLocation(), m_Feat.GetLocation(), &m_Scope);
1947  if (bad_strand) {
1949  "Gene cross-reference is not on expected strand");
1950  }
1951 
1952 }
1953 
1954 
1956 {
1957  bool equivalent = false;
1958  if (g1.IsSetLocus_tag()
1959  && g2.IsSetLocus_tag()) {
1961  g2.GetLocus_tag())) {
1962  label = g1.GetLocus_tag();
1963  equivalent = true;
1964  }
1965  } else if (g1.IsSetLocus()
1966  && g2.IsSetLocus()) {
1967  if (NStr::EqualNocase(g1.GetLocus(),
1968  g2.GetLocus())) {
1969  label = g1.GetLocus();
1970  equivalent = true;
1971  }
1972  } else if (g1.IsSetSyn()
1973  && g2.IsSetSyn()) {
1974  if (NStr::EqualNocase (g1.GetSyn().front(),
1975  g2.GetSyn().front())) {
1976  label = g1.GetSyn().front();
1977  equivalent = true;
1978  }
1979  }
1980  return equivalent;
1981 }
1982 
1983 
1984 // Check for redundant gene Xref
1985 // Do not call if feat is gene
1987 {
1988  if (m_Feat.IsSetData() && m_Feat.GetData().IsGene()) {
1989  return;
1990  }
1991  auto tse = m_Imp.GetTSE_Handle();
1992  if (!tse) {
1993  return;
1994  }
1995 
1996  // first, look for gene by feature id xref
1997  bool has_gene_id_xref = false;
1998  if (m_Feat.IsSetXref()) {
2000  if ((*xref)->IsSetId() && (*xref)->GetId().IsLocal()) {
2001  CTSE_Handle::TSeq_feat_Handles gene_feats =
2002  tse.GetFeaturesWithId(CSeqFeatData::eSubtype_gene, (*xref)->GetId().GetLocal());
2003  if (gene_feats.size() > 0) {
2004  has_gene_id_xref = true;
2005  ITERATE(CTSE_Handle::TSeq_feat_Handles, gene, gene_feats) {
2006  x_ValidateGeneFeaturePair(*(gene->GetSeq_feat()));
2007  }
2008  }
2009  }
2010  }
2011  }
2012  if (has_gene_id_xref) {
2013  return;
2014  }
2015 
2016  // if we can't get the bioseq on which the gene is located, we can't check for
2017  // overlapping/ambiguous/redundant conditions
2018  if (!m_LocationBioseq) {
2019  return;
2020  }
2021 
2022  const CGene_ref* gene_xref = m_Feat.GetGeneXref();
2023 
2024  size_t num_genes = 0;
2025  size_t max = 0;
2026  size_t num_trans_spliced = 0;
2027  bool equivalent = false;
2028  /*
2029  CFeat_CI gene_it(bsh, CSeqFeatData::e_Gene);
2030  */
2031 
2032  //CFeat_CI gene_it(*m_Scope, feat.GetLocation(), SAnnotSelector (CSeqFeatData::e_Gene));
2033  CFeat_CI gene_it(m_LocationBioseq,
2037  CFeat_CI prev_gene;
2038  string label = "?";
2039  size_t num_match_by_locus = 0;
2040  size_t num_match_by_locus_tag = 0;
2041 
2042  for ( ; gene_it; ++gene_it) {
2043  if (gene_xref && gene_xref->IsSetLocus() &&
2044  gene_it->GetData().GetGene().IsSetLocus() &&
2045  NStr::Equal(gene_xref->GetLocus(), gene_it->GetData().GetGene().GetLocus())) {
2046  num_match_by_locus++;
2047  x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2048  }
2049  if (gene_xref && gene_xref->IsSetLocus_tag() &&
2050  gene_it->GetData().GetGene().IsSetLocus_tag() &&
2051  NStr::Equal(gene_xref->GetLocus_tag(), gene_it->GetData().GetGene().GetLocus_tag())) {
2052  num_match_by_locus_tag++;
2053  x_ValidateGeneFeaturePair(*(gene_it->GetSeq_feat()));
2054  if ((!gene_xref->IsSetLocus() || NStr::IsBlank(gene_xref->GetLocus())) &&
2055  gene_it->GetData().GetGene().IsSetLocus() &&
2056  !NStr::IsBlank(gene_it->GetData().GetGene().GetLocus())) {
2058  "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists");
2059  }
2060  }
2061 
2062  if (TestForOverlapEx (gene_it->GetLocation(), m_Feat.GetLocation(),
2063  gene_it->GetLocation().IsInt() ? eOverlap_Contained : eOverlap_Subset, &m_Scope) >= 0) {
2064  size_t len = GetLength(gene_it->GetLocation(), &m_Scope);
2065  if (len < max || num_genes == 0) {
2066  num_genes = 1;
2067  max = len;
2068  num_trans_spliced = 0;
2069  if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2070  NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2071  num_trans_spliced++;
2072  }
2073  equivalent = false;
2074  prev_gene = gene_it;
2075  } else if (len == max) {
2076  equivalent |= s_GeneRefsAreEquivalent(gene_it->GetData().GetGene(), prev_gene->GetData().GetGene(), label);
2077  num_genes++;
2078  if (gene_it->IsSetExcept() && gene_it->IsSetExcept_text() &&
2079  NStr::FindNoCase (gene_it->GetExcept_text(), "trans-splicing") != string::npos) {
2080  num_trans_spliced++;
2081  }
2082  }
2083  }
2084  }
2085 
2086  if (!gene_xref) {
2087  // if there is no gene xref, then there should be 0 or 1 overlapping genes
2088  // so that mapping by overlap is unambiguous
2089  if (num_genes > 1 &&
2092  if (m_Imp.IsSmallGenomeSet() && num_genes == num_trans_spliced) {
2093  /* suppress for trans-spliced genes on small genome set */
2094  } else if (equivalent) {
2096  "Feature overlapped by "
2097  + NStr::SizetToString(num_genes)
2098  + " identical-length equivalent genes but has no cross-reference");
2099  } else {
2101  "Feature overlapped by "
2102  + NStr::SizetToString(num_genes)
2103  + " identical-length genes but has no cross-reference");
2104  }
2105  } else if (num_genes == 1
2106  && prev_gene->GetData().GetGene().IsSetAllele()
2107  && !NStr::IsBlank(prev_gene->GetData().GetGene().GetAllele())) {
2108  const string& allele = prev_gene->GetData().GetGene().GetAllele();
2109  // overlapping gene should not conflict with allele qualifier
2110  FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2111  const CGb_qual& qual = **qual_iter;
2112  if ( qual.IsSetQual() &&
2113  NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2114  if ( qual.CanGetVal() &&
2115  NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2117  "Redundant allele qualifier (" + allele +
2118  ") on gene and feature");
2121  "Mismatched allele qualifier on gene (" + allele +
2122  ") and feature (" + qual.GetVal() +")");
2123  }
2124  }
2125  }
2126  }
2127  } else if ( !gene_xref->IsSuppressed() ) {
2128  // we are counting features with gene xrefs
2130 
2131  // make sure overlapping gene and gene xref do not conflict
2132  if (gene_xref->IsSetAllele() && !NStr::IsBlank(gene_xref->GetAllele())) {
2133  const string& allele = gene_xref->GetAllele();
2134 
2135  FOR_EACH_GBQUAL_ON_FEATURE (qual_iter, m_Feat) {
2136  const CGb_qual& qual = **qual_iter;
2137  if ( qual.CanGetQual() &&
2138  NStr::Compare(qual.GetQual(), "allele") == 0 ) {
2139  if ( qual.CanGetVal() &&
2140  NStr::CompareNocase(qual.GetVal(), allele) == 0 ) {
2142  "Redundant allele qualifier (" + allele +
2143  ") on gene and feature");
2146  "Mismatched allele qualifier on gene (" + allele +
2147  ") and feature (" + qual.GetVal() +")");
2148  }
2149  }
2150  }
2151  }
2152 
2153  if (num_match_by_locus == 0 && num_match_by_locus_tag == 0) {
2154  // find gene on bioseq to match genexref
2155  if ((gene_xref->IsSetLocus_tag() &&
2156  !NStr::IsBlank(gene_xref->GetLocus_tag())) ||
2157  (gene_xref->IsSetLocus() &&
2158  !NStr::IsBlank(gene_xref->GetLocus()))) {
2160  if (!gene && m_LocationBioseq && m_LocationBioseq.IsAa()) {
2162  if (cds) {
2163  if (cds->IsSetLocation()) {
2164  const CSeq_loc& loc = cds->GetLocation();
2165  const CSeq_id* id = loc.GetId();
2166  if (id) {
2168  if (nbsh) {
2169  gene = m_Imp.GetGeneCache().GetGeneFromCache(cds, m_Scope);
2170  }
2171  }
2172  }
2173  }
2174  }
2175  string label2;
2176  if (gene && !CSingleFeatValidator::s_GeneRefsAreEquivalent(*gene_xref, gene->GetData().GetGene(), label2)) {
2177  gene.Reset();
2178  }
2179  if (gene_xref->IsSetLocus_tag() &&
2180  !NStr::IsBlank(gene_xref->GetLocus_tag()) &&
2181  !gene) {
2183  "Feature has gene locus_tag cross-reference but no equivalent gene feature exists");
2184  } else if (gene_xref->IsSetLocus() &&
2185  !NStr::IsBlank(gene_xref->GetLocus()) &&
2186  !gene) {
2188  "Feature has gene locus cross-reference but no equivalent gene feature exists");
2189  }
2190  }
2191  }
2192  }
2193 
2194 }
2195 
2196 
2198 {
2199  if (m_Feat.GetData().IsGene()) {
2200  return;
2201  }
2203 
2204  if (m_Feat.IsSetQual()) {
2205  // check old locus tag on feature and overlapping gene
2206  for (auto it : m_Feat.GetQual()) {
2207  if (it->IsSetQual() && NStr::Equal(it->GetQual(), "old_locus_tag")
2208  && it->IsSetVal() && !NStr::IsBlank(it->GetVal())) {
2209  x_ValidateOldLocusTag(it->GetVal());
2210  }
2211  }
2212  }
2213 }
2214 
2215 
2217 {
2218  if (ref.IsSetPseudo() && ref.GetPseudo()) {
2219  return true;
2220  } else {
2221  return false;
2222  }
2223 }
2224 
2225 
2226 bool s_HasNamedQual(const CSeq_feat& feat, const string& qual)
2227 {
2228  bool rval = false;
2229  if (feat.IsSetQual()) {
2230  for (auto it : feat.GetQual()) {
2231  if (it->IsSetQual() && NStr::EqualNocase(it->GetQual(), qual)) {
2232  rval = true;
2233  break;
2234  }
2235  }
2236  }
2237  return rval;
2238 }
2239 
2240 
2242 {
2243  if (feat.IsSetPseudo() && feat.GetPseudo()) {
2244  return true;
2245  } else if (s_HasNamedQual(feat, "pseudogene")) {
2246  return true;
2247  } else if (feat.IsSetData() && feat.GetData().IsGene() &&
2248  s_IsPseudo(feat.GetData().GetGene())) {
2249  return true;
2250  } else {
2251  return false;
2252  }
2253 }
2254 
2255 
2256 void CSingleFeatValidator::x_ValidateOldLocusTag(const string& old_locus_tag)
2257 {
2258  if (NStr::IsBlank(old_locus_tag)) {
2259  return;
2260  }
2261  bool pseudo = s_IsPseudo(m_Feat);
2262  const CGene_ref* grp = m_Feat.GetGeneXref();
2263  if ( !grp) {
2264  // check overlapping gene
2266  if ( overlap ) {
2267  if (s_IsPseudo(*overlap)) {
2268  pseudo = true;
2269  }
2270  string gene_old_locus_tag;
2271 
2272  FOR_EACH_GBQUAL_ON_SEQFEAT (it, *overlap) {
2273  if ((*it)->IsSetQual() && NStr::Equal ((*it)->GetQual(), "old_locus_tag")
2274  && (*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
2275  gene_old_locus_tag = (*it)->GetVal();
2276  break;
2277  }
2278  }
2279  if (!NStr::IsBlank (gene_old_locus_tag)
2280  && !NStr::EqualNocase (gene_old_locus_tag, old_locus_tag)) {
2282  "Old locus tag on feature (" + old_locus_tag
2283  + ") does not match that on gene (" + gene_old_locus_tag + ")");
2284  }
2285  grp = &(overlap->GetData().GetGene());
2286  }
2287  }
2288  if (grp && s_IsPseudo(*grp)) {
2289  pseudo = true;
2290  }
2291  if (!grp || !grp->IsSetLocus_tag() || NStr::IsBlank (grp->GetLocus_tag())) {
2292  if (! pseudo) {
2294  "old_locus_tag without inherited locus_tag");
2295  }
2296  }
2297 }
2298 
2299 
2301 {
2302  if (!m_Feat.GetData().IsImp()) {
2303  return;
2304  }
2305  const string& key = m_Feat.GetData().GetImp().GetKey();
2306  // validate the feature's location
2307  if ( m_Feat.GetData().GetImp().IsSetLoc() ) {
2308  const string& imp_loc = m_Feat.GetData().GetImp().GetLoc();
2309  if ( imp_loc.find("one-of") != string::npos ) {
2311  "ImpFeat loc " + imp_loc +
2312  " has obsolete 'one-of' text for feature " + key);
2313  } else if ( m_Feat.GetLocation().IsInt() ) {
2314  const CSeq_interval& seq_int = m_Feat.GetLocation().GetInt();
2315  string temp_loc = NStr::IntToString(seq_int.GetFrom() + 1) +
2316  ".." + NStr::IntToString(seq_int.GetTo() + 1);
2317  if ( imp_loc != temp_loc ) {
2319  "ImpFeat loc " + imp_loc + " does not equal feature location " +
2320  temp_loc + " for feature " + key);
2321  }
2322  }
2323  }
2324 
2325 }
2326 
2327 
2329 {
2330  if (!m_Feat.GetData().IsImp()) {
2331  return;
2332  }
2333  const string& key = m_Feat.GetData().GetImp().GetKey();
2334 
2335  // Make sure a feature has its mandatory qualifiers
2336  for (auto required : CSeqFeatData::GetMandatoryQualifiers(m_Feat.GetData().GetSubtype())) {
2337  bool found = false;
2338  if (m_Feat.IsSetQual()) {
2339  for (auto qual : m_Feat.GetQual()) {
2340  if (qual->IsSetQual() && CSeqFeatData::GetQualifierType(qual->GetQual()) == required) {
2341  found = true;
2342  break;
2343  }
2344  }
2345  if (!found && required == CSeqFeatData::eQual_citation) {
2346  if (m_Feat.IsSetCit()) {
2347  found = true;
2348  }
2349  else if (m_Feat.IsSetComment() && !NStr::IsBlank(m_Feat.GetComment())) {
2350  // RefSeq allows conflict with accession in comment instead of sfp->cit
2351  if (m_LocationBioseq) {
2353  if ((*it)->IsOther()) {
2354  found = true;
2355  break;
2356  }
2357  }
2358  }
2359  }
2360  if (!found
2361  && (NStr::EqualNocase(key, "conflict")
2362  || NStr::EqualNocase(key, "old_sequence"))) {
2363  // compare qualifier can now substitute for citation qualifier for conflict and old_sequence
2365  if ((*qual)->IsSetQual() && CSeqFeatData::GetQualifierType((*qual)->GetQual()) == CSeqFeatData::eQual_compare) {
2366  found = true;
2367  break;
2368  }
2369  }
2370  }
2371  }
2372  }
2373  if (!found) {
2375  "Missing qualifier " + CSeqFeatData::GetQualifierAsString(required) +
2376  " for feature " + key);
2377  }
2378  }
2379 }
2380 
2381 
2383 {
2384  switch ( m_Feat.GetData().Which () ) {
2385  case CSeqFeatData::e_Gene:
2387  case CSeqFeatData::e_Prot:
2388  case CSeqFeatData::e_Rna:
2389  case CSeqFeatData::e_Pub:
2390  case CSeqFeatData::e_Imp:
2392  case CSeqFeatData::e_Org:
2394  case CSeqFeatData::e_Seq:
2396  case CSeqFeatData::e_Bond:
2397  case CSeqFeatData::e_Site:
2398  case CSeqFeatData::e_Rsite:
2399  case CSeqFeatData::e_User:
2401  case CSeqFeatData::e_Num:
2404  case CSeqFeatData::e_Het:
2405  case CSeqFeatData::e_Clone:
2407  break;
2408  default:
2410  "Invalid SeqFeat type [" + NStr::IntToString(m_Feat.GetData().Which()) + "]");
2411  break;
2412  }
2413 }
2414 
2415 
2417 {
2418  bool rval = false;
2419  FOR_EACH_SEQID_ON_BIOSEQ (it, *(bsh.GetBioseqCore())) {
2420  if ((*it)->IsOther() && (*it)->GetTextseq_Id()->IsSetAccession()
2421  && NStr::StartsWith ((*it)->GetTextseq_Id()->GetAccession(), prefix)) {
2422  rval = true;
2423  break;
2424  }
2425  }
2426  return rval;
2427 }
2428 
2429 
2431 {
2432  if (!m_Feat.IsSetData() ||
2435  return;
2436  }
2437  string sfp_pseudo;
2438  string gene_pseudo;
2439  bool has_sfp_pseudo = false;
2440  bool has_gene_pseudo = false;
2441  if (m_Feat.IsSetQual()) {
2442  for (auto it : m_Feat.GetQual()) {
2443  if (it->IsSetQual() &&
2444  NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2445  it->IsSetVal()) {
2446  sfp_pseudo = it->GetVal();
2447  has_sfp_pseudo = true;
2448  }
2449  }
2450  }
2451  if (gene && gene->IsSetQual()) {
2452  for (auto it : gene->GetQual()) {
2453  if (it->IsSetQual() &&
2454  NStr::EqualNocase(it->GetQual(), "pseudogene") &&
2455  it->IsSetVal()) {
2456  gene_pseudo = it->GetVal();
2457  has_gene_pseudo = true;
2458  }
2459  }
2460  }
2461 
2462  if (!has_sfp_pseudo && !has_gene_pseudo) {
2463  return;
2464  } else if (!has_sfp_pseudo) {
2465  return;
2466  } else if (has_sfp_pseudo && !has_gene_pseudo) {
2467  string msg = m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2468  msg += " has pseudogene qualifier, gene does not";
2470  msg);
2471  } else if (!NStr::EqualNocase(sfp_pseudo, gene_pseudo)) {
2472  string msg = "Different pseudogene values on ";
2473  msg += m_Feat.GetData().IsCdregion() ? "CDS" : "mRNA";
2474  msg += " (" + sfp_pseudo + ") and gene (" + gene_pseudo + ")";
2476  msg);
2477  }
2478 }
2479 
2480 
// Compares the gene locus_tag with the general Seq-ids found on the product Bioseq.
// NOTE(review): this listing has gaps in its embedded numbering -- the signature line
// (2482) and the line that opens the error report (2524) are missing. `gene` is
// presumably a parameter holding the overlapping gene feature; confirm against the
// repository copy before relying on this description.
2481 // grp is taken from the feature's gene xref; the overlapping gene is the fallback
2483 {
     // The whole check is optional: m_Imp decides whether it runs.
2484  if (!m_Imp.IsLocusTagGeneralMatch()) {
2485  return;
2486  }
     // No product means there are no product IDs to compare against.
2487  if (!m_Feat.IsSetProduct()) {
2488  return;
2489  }
2490 
2491  CTempString locus_tag = kEmptyStr;
2492  // obtain the gene-ref from the feature or the overlapping gene
2493  const CGene_ref* grp = m_Feat.GetGeneXref();
     // Priority: a suppressed xref ends the check; otherwise a non-blank locus_tag on
     // the xref wins, then a non-blank locus_tag on `gene`; with neither, nothing to do.
2494  if (grp && grp->IsSuppressed()) {
2495  return;
2496  } else if (grp && grp->IsSetLocus_tag() &&
2497  !NStr::IsBlank(grp->GetLocus_tag())) {
2498  locus_tag = grp->GetLocus_tag();
2499  } else if (gene && gene->GetData().GetGene().IsSetLocus_tag() &&
2500  !NStr::IsBlank(gene->GetData().GetGene().GetLocus_tag())) {
2501  locus_tag = gene->GetData().GetGene().GetLocus_tag();
2502  } else {
2503  return;
2504  }
2505 
2506  if (!m_ProductBioseq) {
2507  return;
2508  }
2509 
     // Only general IDs that have a db set and that IsSkippable() does not exclude
     // take part in the comparison.
2510  for (auto id : m_ProductBioseq.GetId()) {
2511  CConstRef<CSeq_id> seqid = id.GetSeqId();
2512  if (!seqid || !seqid->IsGeneral()) {
2513  continue;
2514  }
2515  const CDbtag& dbt = seqid->GetGeneral();
2516  if (!dbt.IsSetDb() || dbt.IsSkippable()) {
2517  continue;
2518  }
2519 
2520  if (dbt.IsSetTag() && dbt.GetTag().IsStr()) {
     // Compare only the text before the first '-'. When there is no '-', find()
     // yields npos and substr(0, npos) keeps the entire tag.
2521  size_t pos = dbt.GetTag().GetStr().find('-');
2522  string str = dbt.GetTag().GetStr().substr(0, pos);
     // Case-insensitive comparison; a mismatch produces the message below.
2523  if (!NStr::EqualNocase(locus_tag, str)) {
2525  "Gene locus_tag does not match general ID of product");
2526  }
2527  }
2528  }
2529 }
2530 
2531 
/// Build a copy of @a src that is safe to embed in a diagnostic message.
///
/// Bytes with values 32..127 are copied unchanged; every other byte
/// (control characters below 32 and anything with the high bit set) is
/// replaced by '#'.  The result always has the same length as the input.
///
/// @param src  Arbitrary byte string, possibly containing non-ASCII data.
/// @return     Sanitized copy of @a src.
static string s_AsciiString(const string& src)
{
    string dst;
    // Output length is known in advance: allocate once instead of growing.
    dst.reserve(src.size());

    for (char ch : src) {
        // Inspect the byte as unsigned so high-bit bytes are not negative
        // on platforms where plain char is signed.
        const unsigned char chu = static_cast<unsigned char>(ch);
        if (chu > 31 && chu < 128) {
            dst += ch;
        } else {
            dst += '#';
        }
    }

    return dst;
}
2547 
2548 
// Scans strings reached through the iterator `it` and reports the first offending
// character found in each one.
// NOTE(review): the signature line (2549), the declaration of `it` (2551) and the line
// that opens the error report (2560) are missing from this listing; what `it` iterates
// over cannot be determined from here.
2550 {
2552 
2553  for (; it; ++it) {
2554  const string& str = *it;
2555  FOR_EACH_CHAR_IN_STRING(c_it, str) {
2556  const char& ch = *c_it;
2557  unsigned char chu = ch;
     // Offending: any byte above 127, or a control character below 32 other than
     // tab, CR and LF. Where plain char is signed, `ch > 127` is never true and
     // high-bit bytes are caught by `ch < 32` instead, because they are negative.
2558  if (ch > 127 || (ch < 32 && ch != '\t' && ch != '\r' && ch != '\n')) {
     // The message carries the numeric byte value plus a sanitized copy of the string.
2559  string txt = s_AsciiString(str);
2561  "Non-ASCII character '" + NStr::NumericToString(chu) + "' found in feature (" + txt + ")");
     // Presumably leaves the per-character loop so each string is reported at most
     // once -- depends on how FOR_EACH_CHAR_IN_STRING expands; confirm.
2562  break;
2563  }
2564  }
2565  }
2566 }
2567 
// Validation of a protein feature: reads the Prot-ref through m_Feat.GetData().GetProt()
// and runs a series of independent checks on its names, description, EC numbers and
// db xrefs.
// NOTE(review): the embedded numbering has many gaps here (2568, 2570, 2581, 2588, 2594,
// 2598-2599, 2603, 2609, 2623, 2626, 2629, 2632, 2637, 2639). The signature, the opening
// line of every error report, and several whole statements are missing from this
// listing, so only the surviving conditions and message texts are described.
2569 {
2571 
2572  x_CheckForEmpty();
2573 
2574  const CProt_ref& prot = m_Feat.GetData().GetProt();
     // Names that exactly match one of four placeholder spellings are flagged when an
     // EC number is set and no processed state is set. Note the comparison is
     // case-sensitive, hence both capitalizations are listed.
2575  for (auto it : prot.GetName()) {
2576  if (prot.IsSetEc() && !prot.IsSetProcessed()
2577  && (NStr::EqualCase (it, "Hypothetical protein")
2578  || NStr::EqualCase (it, "hypothetical protein")
2579  || NStr::EqualCase (it, "Unknown protein")
2580  || NStr::EqualCase (it, "unknown protein"))) {
2582  "Unknown or hypothetical protein should not have EC number");
2583  }
2584 
2585  }
2586 
2587  if (prot.IsSetDesc() && ContainsSgml(prot.GetDesc())) {
2589  "protein description " + prot.GetDesc() + " has SGML");
2590  }
2591 
     // A feature comment that merely repeats the description adds nothing.
2592  if (prot.IsSetDesc() && m_Feat.IsSetComment()
2593  && NStr::EqualCase(prot.GetDesc(), m_Feat.GetComment())) {
2595  "Comment has same value as protein description");
2596  }
2597 
     // NOTE(review): the condition guarding the next message (2598-2599) is missing.
2600  "Apparent EC number in protein comment");
2601  }
2602 
2604 
2605  // only look for EC numbers in first protein name
2606  // only look for brackets and hypothetical protein XP_ in first protein name
2607  if (prot.IsSetName() && prot.GetName().size() > 0) {
2608  if (HasECnumberPattern(prot.GetName().front())) {
2610  "Apparent EC number in protein title");
2611  }
2612  x_ValidateProteinName(prot.GetName().front());
2613  }
2614 
2615  if ( prot.CanGetDb () ) {
2616  m_Imp.ValidateDbxref(prot.GetDb(), m_Feat);
2617  }
     // A missing name is reported unless the feature is a signal or transit peptide.
     // The message is picked by which other field is populated, checked in the order
     // description, activity, EC number; the last branch covers none of them.
2618  if ( (!prot.IsSetName() || prot.GetName().empty()) &&
2619  (!prot.IsSetProcessed()
2620  || (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide
2621  && prot.GetProcessed() != CProt_ref::eProcessed_transit_peptide))) {
2622  if (prot.IsSetDesc() && !NStr::IsBlank (prot.GetDesc())) {
2624  "Protein feature has description but no name");
2625  } else if (prot.IsSetActivity() && !prot.GetActivity().empty()) {
2627  "Protein feature has function but no name");
2628  } else if (prot.IsSetEc() && !prot.GetEc().empty()) {
2630  "Protein feature has EC number but no name");
2631  } else {
2633  "Protein feature has no name");
2634  }
2635  }
2636 
2638 
2640 }
2641 
2642 
// Reports a protein feature whose name, description, EC list, activity list and db list
// are all empty. Signal and transit peptides are exempt from the check.
// NOTE(review): the signature (2643), the declaration of `processed` (2646) and the line
// that opens the error report (2674) are missing from this listing. The initial value
// of `processed` when the field is unset is therefore not visible here.
2644 {
2645  const CProt_ref& prot = m_Feat.GetData().GetProt();
2647 
2648  if ( prot.IsSetProcessed() ) {
2649  processed = prot.GetProcessed();
2650  }
2651 
     // Start from "empty" and clear the flag as soon as any field carries content.
2652  bool empty = true;
2653  if ( processed != CProt_ref::eProcessed_signal_peptide &&
2654  processed != CProt_ref::eProcessed_transit_peptide ) {
     // Only the first name is inspected; an empty first name counts as no name.
2655  if ( prot.IsSetName() &&
2656  !prot.GetName().empty() &&
2657  !prot.GetName().front().empty() ) {
2658  empty = false;
2659  }
2660  if ( prot.CanGetDesc() && !prot.GetDesc().empty() ) {
2661  empty = false;
2662  }
2663  if ( prot.CanGetEc() && !prot.GetEc().empty() ) {
2664  empty = false;
2665  }
2666  if ( prot.CanGetActivity() && !prot.GetActivity().empty() ) {
2667  empty = false;
2668  }
2669  if ( prot.CanGetDb() && !prot.GetDb().empty() ) {
2670  empty = false;
2671  }
2672 
2673  if ( empty ) {
2675  "There is a protein feature where all fields are empty");
2676  }
2677  }
2678 }
2679 
2680 
// Table of protein names treated as uninformative (placeholders, frequent misspellings
// of "conserved hypothetical protein", bare organism or generic words, and so on).
// NOTE(review): the two lines after the table (2849-2850) are missing from this listing;
// they presumably build the `sc_BadProtName` set that is searched elsewhere in this file.
// If that set requires sorted input, the entry order below matters -- confirm before
// adding or reordering entries.
2681 // note - list bad protein names in lower case, as search term is converted to lower case
2682 // before looking for exact match
2683 static const char* const sc_BadProtNameText [] = {
2684  "'hypothetical protein",
2685  "alpha",
2686  "alternative",
2687  "alternatively spliced",
2688  "bacteriophage hypothetical protein",
2689  "beta",
2690  "cellular",
2691  "cnserved hypothetical protein",
2692  "conesrved hypothetical protein",
2693  "conserevd hypothetical protein",
2694  "conserved archaeal protein",
2695  "conserved domain protein",
2696  "conserved hypohetical protein",
2697  "conserved hypotehtical protein",
2698  "conserved hypotheical protein",
2699  "conserved hypothertical protein",
2700  "conserved hypothetcial protein",
2701  "conserved hypothetical",
2702  "conserved hypothetical exported protein",
2703  "conserved hypothetical integral membrane protein",
2704  "conserved hypothetical membrane protein",
2705  "conserved hypothetical phage protein",
2706  "conserved hypothetical prophage protein",
2707  "conserved hypothetical protein",
2708  "conserved hypothetical protein - phage associated",
2709  "conserved hypothetical protein fragment 3",
2710  "conserved hypothetical protein, fragment",
2711  "conserved hypothetical protein, putative",
2712  "conserved hypothetical protein, truncated",
2713  "conserved hypothetical protein, truncation",
2714  "conserved hypothetical protein.",
2715  "conserved hypothetical protein; possible membrane protein",
2716  "conserved hypothetical protein; putative membrane protein",
2717  "conserved hypothetical proteins",
2718  "conserved hypothetical protien",
2719  "conserved hypothetical transmembrane protein",
2720  "conserved hypotheticcal protein",
2721  "conserved hypthetical protein",
2722  "conserved in bacteria",
2723  "conserved membrane protein",
2724  "conserved protein",
2725  "conserved protein of unknown function",
2726  "conserved protein of unknown function ; putative membrane protein",
2727  "conserved unknown protein",
2728  "conservedhypothetical protein",
2729  "conserverd hypothetical protein",
2730  "conservered hypothetical protein",
2731  "consrved hypothetical protein",
2732  "converved hypothetical protein",
2733  "cytokine",
2734  "delta",
2735  "drosophila",
2736  "duplicated hypothetical protein",
2737  "epsilon",
2738  "gamma",
2739  "hla",
2740  "homeodomain",
2741  "homeodomain protein",
2742  "homolog",
2743  "hyopthetical protein",
2744  "hypotethical",
2745  "hypotheical protein",
2746  "hypothertical protein",
2747  "hypothetcical protein",
2748  "hypothetical",
     // NOTE(review): the next entry is the only one in this table that is not in
     // ascending byte order relative to its neighbours ("hypothetical protein" sorts
     // after "hypothetical conserved protein"). Possibly the HTML rendering collapsed
     // repeated whitespace inside the literal -- compare with the repository copy.
2749  "hypothetical protein",
2750  "hypothetical conserved protein",
2751  "hypothetical exported protein",
2752  "hypothetical novel protein",
2753  "hypothetical orf",
2754  "hypothetical phage protein",
2755  "hypothetical prophage protein",
2756  "hypothetical protein (fragment)",
2757  "hypothetical protein (multi-domain)",
2758  "hypothetical protein (phage associated)",
2759  "hypothetical protein - phage associated",
2760  "hypothetical protein fragment",
2761  "hypothetical protein fragment 1",
2762  "hypothetical protein predicted by genemark",
2763  "hypothetical protein predicted by glimmer",
2764  "hypothetical protein predicted by glimmer/critica",
2765  "hypothetical protein, conserved",
2766  "hypothetical protein, phage associated",
2767  "hypothetical protein, truncated",
2768  "hypothetical protein-putative conserved hypothetical protein",
2769  "hypothetical protein.",
2770  "hypothetical proteins",
2771  "hypothetical protien",
2772  "hypothetical transmembrane protein",
2773  "hypothetoical protein",
2774  "hypothteical protein",
2775  "identified by sequence similarity; putative; orf located~using blastx/framed",
2776  "identified by sequence similarity; putative; orf located~using blastx/glimmer/genemark",
2777  "ion channel",
2778  "membrane protein, putative",
2779  "mouse",
2780  "narrowly conserved hypothetical protein",
2781  "novel protein",
2782  "orf",
2783  "orf, conserved hypothetical protein",
2784  "orf, hypothetical",
2785  "orf, hypothetical protein",
2786  "orf, hypothetical, fragment",
2787  "orf, partial conserved hypothetical protein",
2788  "orf; hypothetical protein",
2789  "orf; unknown function",
2790  "partial",
2791  "partial cds, hypothetical",
2792  "partially conserved hypothetical protein",
2793  "phage hypothetical protein",
2794  "phage-related conserved hypothetical protein",
2795  "phage-related protein",
2796  "plasma",
2797  "possible hypothetical protein",
2798  "precursor",
2799  "predicted coding region",
2800  "predicted protein",
2801  "predicted protein (pseudogene)",
2802  "predicted protein family",
2803  "product uncharacterised protein family",
2804  "protein family",
2805  "protein of unknown function",
2806  "pseudogene",
2807  "putative",
2808  "putative conserved protein",
2809  "putative exported protein",
2810  "putative hypothetical protein",
2811  "putative membrane protein",
2812  "putative orf; unknown function",
2813  "putative phage protein",
2814  "putative protein",
2815  "rearranged",
2816  "repeats containing protein",
2817  "reserved",
2818  "ribosomal protein",
2819  "similar to",
2820  "small",
2821  "small hypothetical protein",
2822  "transmembrane protein",
2823  "trna",
2824  "trp repeat",
2825  "trp-repeat protein",
2826  "truncated conserved hypothetical protein",
2827  "truncated hypothetical protein",
2828  "uncharacterized conserved membrane protein",
2829  "uncharacterized conserved protein",
2830  "uncharacterized conserved secreted protein",
2831  "uncharacterized protein",
2832  "uncharacterized protein conserved in archaea",
2833  "uncharacterized protein conserved in bacteria",
2834  "unique hypothetical",
2835  "unique hypothetical protein",
2836  "unknown",
2837  "unknown cds",
2838  "unknown function",
2839  "unknown gene",
2840  "unknown protein",
2841  "unknown, conserved protein",
2842  "unknown, hypothetical",
2843  "unknown-related protein",
2844  "unknown; predicted coding region",
2845  "unnamed",
2846  "unnamed protein product",
2847  "very hypothetical protein"
2848 };
2851 
2852 
// RefSeq-only check that flags missing, empty or uninformative protein names.
// NOTE(review): the signature (2853) and the opening line of each error report (2863,
// 2872, 2881) are missing from this listing.
2854 {
2855  if (!m_Imp.IsRefSeq()) {
2856  return;
2857  }
2858  const CProt_ref& prot = m_Feat.GetData().GetProt();
     // No name at all: reported unless the feature is a signal or transit peptide.
     // Either way there is nothing further to inspect.
2859  if (!prot.IsSetName()) {
2860  if (!prot.IsSetProcessed() ||
2861  (prot.GetProcessed() != CProt_ref::eProcessed_signal_peptide &&
2862  prot.GetProcessed() != CProt_ref::eProcessed_transit_peptide)) {
2864  "Protein name is not set");
2865  }
2866  return;
2867  }
     // Every name is checked, not just the first one.
2868  for (auto it : m_Feat.GetData().GetProt().GetName()) {
     // Lower-case copy used for the lookup in sc_BadProtName; the original spelling
     // (`it`) is what appears in the message.
2869  string search = it;
2870  search = NStr::ToLower(search);
2871  if (search.empty()) {
2873  "Protein name is empty");
     // Uninformative: an entry of sc_BadProtName, or a name containing '=' or '~',
     // or one of the substrings below. "uniprotkb" contains "uniprot", so that second
     // test cannot match anything the first did not already match.
2874  } else if (sc_BadProtName.find (search.c_str()) != sc_BadProtName.end()
2875  || NStr::Find(search, "=") != string::npos
2876  || NStr::Find(search, "~") != string::npos
2877  || NStr::FindNoCase(search, "uniprot") != string::npos
2878  || NStr::FindNoCase(search, "uniprotkb") != string::npos
2879  || NStr::FindNoCase(search, "pmid") != string::npos
2880  || NStr::FindNoCase(search, "dbxref") != string::npos) {
2882  "Uninformative protein name '" + it + "'");
2883  }
2884  }
2885 }
2886 
2887 
// Checks every EC number on the protein feature: blank values, malformed values, and
// values whose looked-up status calls for a report.
// NOTE(review): the signature (2888), the status lookup (2901-2902), the `case` labels
// of the switch and the opening line of most error reports are missing from this
// listing. Which status value leads to which message is therefore not visible here.
2889 {
2890  if (!m_Feat.GetData().GetProt().IsSetEc()) {
2891  return;
2892  }
2893  for (auto it : m_Feat.GetData().GetProt().GetEc()) {
2894  if (NStr::IsBlank (it)) {
2895  PostErr(eDiag_Warning, eErr_SEQ_FEAT_EcNumberEmpty, "EC number should not be empty");
2896  } else if (!CProt_ref::IsValidECNumberFormat(it)) {
2898  (it) + " is not in proper EC_number format");
2899  } else {
     // Well-formed value: the message now depends on `status`.
2900  const string& ec_number = it;
2903  switch (status) {
2906  "EC_number " + ec_number + " was deleted");
2907  break;
2911  "EC_number " + ec_number + " was transferred and is no longer valid");
2912  break;
2914  {
     // An 'n' followed by a digit selects the "preliminary value" wording. Indexing
     // through c_str() keeps pos + 1 readable even when 'n' is the last character,
     // because it then lands on the terminating NUL.
2915  size_t pos = NStr::Find (ec_number, "n");
2916  if (pos == string::npos || !isdigit (ec_number.c_str()[pos + 1])) {
2918  ec_number + " is not a legal value for qualifier EC_number");
2919  } else {
2921  ec_number + " is not a legal preliminary value for qualifier EC_number");
2922  }
2923  }
2924  break;
2925  default:
2926  break;
2927  }
2928  }
2929  }
2930 
2931 }
2932 
2933 
// Runs the name-specific checks on a single protein name: trailing bracket, mismatching
// "hypothetical protein XP_" accession, 'RefSeq' in the text, name duplicated in the
// feature comment, embedded PMID, nonstandard rubisco naming, bad characters and SGML.
// @param prot_name  the protein name to examine.
// NOTE(review): the opening line of most error reports (2948, 2959, 2968, 2973, 2986,
// 2996) is missing from this listing, so severities and error codes are not visible.
2934 void CProtValidator::x_ValidateProteinName(const string& prot_name)
2935 {
2936  if (NStr::EndsWith(prot_name, "]")) {
2937  bool report_name = true;
     // Find the last '[' in the name. The report is suppressed only when that
     // bracketed tail starts with "[NAD" (and is at least 5 characters long).
2938  size_t pos = NStr::Find(prot_name, "[", NStr::eNocase, NStr::eReverseSearch);
2939  if (pos == string::npos) {
2940  // no disqualifying text
2941  } else if (prot_name.length() - pos < 5) {
2942  // no disqualifying text
2943  } else if (NStr::EqualCase(prot_name, pos, 4, "[NAD")) {
2944  report_name = false;
2945  }
     // Nothing is reported when m_Imp says the record is EMBL or TPE.
2946  if (!m_Imp.IsEmbl() && !m_Imp.IsTPE()) {
2947  if (report_name) {
2949  "Protein name ends with bracket and may contain organism name");
2950  }
2951  }
2952  }
2953  if (NStr::StartsWith(prot_name, "hypothetical protein XP_") && m_LocationBioseq) {
2954  for (auto id_it : m_LocationBioseq.GetCompleteBioseq()->GetId()) {
     // "hypothetical protein " is 21 characters long, so substr(21) is the text
     // starting at "XP_"; it is compared case-insensitively with each accession of
     // an "other"-type ID on the Bioseq.
2955  if (id_it->IsOther()
2956  && id_it->GetOther().IsSetAccession()
2957  && !NStr::EqualNocase(id_it->GetOther().GetAccession(),
2958  prot_name.substr(21))) {
2960  "Hypothetical protein reference does not match accession");
2961  }
2962  }
2963  }
2964  if (!m_Imp.IsRefSeq() && NStr::FindNoCase(prot_name, "RefSeq") != string::npos) {
2965  PostErr(eDiag_Error, eErr_SEQ_FEAT_RefSeqInText, "Protein name contains 'RefSeq'");
2966  }
2967  if (m_Feat.IsSetComment() && NStr::EqualCase(m_Feat.GetComment(), prot_name)) {
2969  "Comment has same value as protein name");
2970  }
2971 
2972  if (s_StringHasPMID(prot_name)) {
2974  "Protein name has internal PMID");
2975  }
2976 
     // Rubisco naming: applies to names that contain "ribulose" and "bisphosphate"
     // but neither "methyltransferase" nor "activase" (case-sensitive search). Three
     // exact spellings are accepted (case-insensitive); any other is reported.
2977  if (m_Imp.DoRubiscoTest()) {
2978  if (NStr::FindCase(prot_name, "ribulose") != string::npos
2979  && NStr::FindCase(prot_name, "bisphosphate") != string::npos
2980  && NStr::FindCase(prot_name, "methyltransferase") == string::npos
2981  && NStr::FindCase(prot_name, "activase") == string::npos) {
2982  if (NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase")) {
2983  // allow standard name without large or small subunit designation - later need kingdom test
2984  } else if (!NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit")
2985  && !NStr::EqualNocase(prot_name, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit")) {
2987  "Nonstandard ribulose bisphosphate protein name");
2988  }
2989  }
2990  }
2991 
2992 
2993 
2994  ValidateCharactersInField(prot_name, "Protein name");
2995  if (ContainsSgml(prot_name)) {
2997  "protein name " + prot_name + " has SGML");
2998  }
2999 
3000 }
3001 
3002 
// Cross-checks the MolInfo completeness of the location Bioseq against the partial
// flags on a protein feature's location, for Bioseqs that are not a CDS product.
// NOTE(review): the signature (3003), the declarations of `prot` (3013) and `mi_i`
// (3016) and the opening line of the error report (3040) are missing from this listing;
// where `prot` and `mi_i` come from is not visible here.
3004 {
3005  if (!m_LocationBioseq) {
3006  return;
3007  }
3008  const CBioseq& pbioseq = *(m_LocationBioseq.GetCompleteBioseq());
3009  // if there is a coding region for this bioseq, this type of error
3010  // will be handled there
3011  const CSeq_feat* cds = m_Imp.GetCDSGivenProduct(pbioseq);
3012  if (cds) return;
3014  if (! prot) return;
3015 
3017  if (! mi_i) return;
3018  const CMolInfo& mi = mi_i->GetMolinfo();
3019  if (! mi.IsSetCompleteness()) return;
3020  int completeness = mi.GetCompleteness();
3021 
3022  const CSeq_loc& prot_loc = prot->GetLocation();
3023  bool prot_partial5 = prot_loc.IsPartialStart(eExtreme_Biological);
3024  bool prot_partial3 = prot_loc.IsPartialStop(eExtreme_Biological);
3025 
     // Expected combinations:
     //   partial  -> at least one end partial
     //   no_left  -> start partial, stop not partial
     //   no_right -> stop partial, start not partial
     //   no_ends  -> both ends partial
     //   any value outside the partial..no_ends numeric range -> neither end partial
3026  bool conflict = false;
3027  if (completeness == CMolInfo::eCompleteness_partial && ((! prot_partial5) && (! prot_partial3))) {
3028  conflict = true;
3029  } else if (completeness == CMolInfo::eCompleteness_no_left && ((! prot_partial5) || prot_partial3)) {
3030  conflict = true;
3031  } else if (completeness == CMolInfo::eCompleteness_no_right && (prot_partial5 || (! prot_partial3))) {
3032  conflict = true;
3033  } else if (completeness == CMolInfo::eCompleteness_no_ends && ((! prot_partial5) || (! prot_partial3))) {
3034  conflict = true;
3035  } else if ((completeness < CMolInfo::eCompleteness_partial || completeness > CMolInfo::eCompleteness_no_ends) && (prot_partial5 || prot_partial3)) {
3036  conflict = true;
3037  }
3038 
3039  if (conflict) {
3041  "Molinfo completeness and protein feature partials conflict");
3042  }
3043 }
3044 
// Validation of an RNA feature: rRNA name characters, pseudo status, product checks,
// required names for some RNA types, and rejection of the "unknown" RNA type.
// NOTE(review): the embedded numbering has many gaps here (3045, 3047, 3051, 3061,
// 3067-3068, 3073, 3080, 3092, 3100). The signature, the declaration of `rna_type`, the
// lookup that yields `gene`, the statement inside `if (!pseudo)` at 3079, and the
// opening line of every error report are missing from this listing.
3046 {
3048 
3049  const CRNA_ref& rna = m_Feat.GetData().GetRna();
3050 
3052  if (rna.IsSetType()) {
3053  rna_type = rna.GetType();
3054  }
3055 
     // rRNA names get the same character and SGML checks as other text fields.
3056  if (rna_type == CRNA_ref::eType_rRNA) {
3057  if (rna.CanGetExt() && rna.GetExt().IsName()) {
3058  const string& rna_name = rna.GetExt().GetName();
3059  ValidateCharactersInField (rna_name, "rRNA name");
3060  if (ContainsSgml(rna_name)) {
3062  "rRNA name " + rna_name + " has SGML");
3063  }
3064  }
3065  }
3066 
3069 
     // feat_pseudo: the feature itself is pseudo. pseudo: the feature or, failing
     // that, `gene` is pseudo. Both are passed on because the product check words
     // its message differently for the two cases.
3070  bool feat_pseudo = s_IsPseudo(m_Feat);
3071  bool pseudo = feat_pseudo;
3072  if (!pseudo) {
3074  if (gene) {
3075  pseudo = s_IsPseudo(*gene);
3076  }
3077  }
3078 
3079  if (!pseudo) {
3081  }
3082 
3083  x_ValidateRnaProduct(feat_pseudo, pseudo);
3084 
     // rRNA, snRNA, scRNA and snoRNA need a non-blank name unless pseudo.
3085  if (rna_type == CRNA_ref::eType_rRNA
3086  || rna_type == CRNA_ref::eType_snRNA
3087  || rna_type == CRNA_ref::eType_scRNA
3088  || rna_type == CRNA_ref::eType_snoRNA) {
3089  if (!rna.IsSetExt() || !rna.GetExt().IsName() || NStr::IsBlank(rna.GetExt().GetName())) {
3090  if (!pseudo) {
3091  string rna_typename = CRNA_ref::GetRnaTypeName(rna_type);
3093  rna_typename + " has no name");
3094  }
3095  }
3096  }
3097 
3098 
3099  if ( rna_type == CRNA_ref::eType_unknown ) {
3101  "RNA type 0 (unknown) not supported");
3102  }
3103 
3104 
3105 }
3106 
3107 
// Checks an RNA feature that has a product: a pseudo RNA, or one overlapped by a
// pseudogene, should not have one.
// @param feat_pseudo  true when the RNA feature itself is pseudo.
// @param pseudo       true when the feature or its gene is pseudo.
// NOTE(review): line 3114 (a statement between the early return and the pseudo check)
// and the opening lines of both error reports (3120, 3123) are missing from this
// listing.
3108 void CRNAValidator::x_ValidateRnaProduct(bool feat_pseudo, bool pseudo)
3109 {
3110  if (!m_Feat.IsSetProduct()) {
3111  return;
3112  }
3113 
3115 
     // The pseudo-with-product report is skipped for RefSeq records and when the
     // exception text mentions "transcribed pseudogene" (case-insensitive).
3116  if ((!m_Feat.IsSetExcept_text()
3117  || NStr::FindNoCase (m_Feat.GetExcept_text(), "transcribed pseudogene") == string::npos)
3118  && !m_Imp.IsRefSeq()) {
     // The feature's own pseudo flag takes precedence in choosing the message.
3119  if (feat_pseudo) {
3121  "A pseudo RNA should not have a product");
3122  } else if (pseudo) {
3124  "An RNA overlapped by a pseudogene should not have a product");
3125  }
3126  }
3127 
3128 }
3129 
3130 
// Compares the RNA type of the feature with the MolInfo biomol of the product Bioseq.
// Only mRNA, tRNA and rRNA features are checked; every other RNA type is accepted.
// NOTE(review): the signature (3131), the descriptor lookup that yields `di` (3136) and
// the opening line of the error report (3170) are missing from this listing.
3132 {
3133  if ( !m_Feat.GetData().GetRna().IsSetType() || !m_ProductBioseq ) {
3134  return;
3135  }
3137  if ( !di ) {
3138  return;
3139  }
3140  const CMolInfo& mol_info = di->GetMolinfo();
3141  if ( !mol_info.CanGetBiomol() ) {
3142  return;
3143  }
3144  int biomol = mol_info.GetBiomol();
3145 
     // Each case returns on a match. A `break` means mismatch and falls through to
     // the report after the switch.
3146  switch ( m_Feat.GetData().GetRna().GetType() ) {
3147 
3148  case CRNA_ref::eType_mRNA:
3149  if ( biomol == CMolInfo::eBiomol_mRNA ) {
3150  return;
3151  }
3152  break;
3153 
3154  case CRNA_ref::eType_tRNA:
3155  if ( biomol == CMolInfo::eBiomol_tRNA ) {
3156  return;
3157  }
3158  break;
3159 
3160  case CRNA_ref::eType_rRNA:
3161  if ( biomol == CMolInfo::eBiomol_rRNA ) {
3162  return;
3163  }
3164  break;
3165 
3166  default:
3167  return;
3168  }
3169 
3171  "Type of RNA does not match MolInfo of product Bioseq");
3172 }
3173 
3174 
// Returns true when the MolInfo descriptor reached through `sd` has a completeness of
// partial, no_ends, no_left or no_right. Returns false when there is no descriptor,
// when completeness is not set, or for any other completeness value.
// NOTE(review): the signature (3175) and the lookup that yields `sd` (3177) are missing
// from this listing, so the function name and the Bioseq being inspected are not
// visible here.
3176 {
3178  if ( !sd ) {
3179  return false;
3180  }
3181  const CMolInfo& molinfo = sd->GetMolinfo();
3182  if (!molinfo.IsSetCompleteness ()) {
3183  return false;
3184  }
3185  CMolInfo::TCompleteness completeness = molinfo.GetCompleteness();
3186  if (completeness == CMolInfo::eCompleteness_partial
3187  || completeness == CMolInfo::eCompleteness_no_ends
3188  || completeness == CMolInfo::eCompleteness_no_left
3189  || completeness == CMolInfo::eCompleteness_no_right) {
3190  return true;
3191  } else {
3192  return false;
3193  }
3194 }
3195 
3196 
// Validates the tRNA extension of an RNA feature: the extension must sit on a tRNA
// feature, and an anticodon (when present) must be 3 bases long and lie inside the
// feature location.
// NOTE(review): the signature (3197), the rest of the condition started at 3202
// (3203-3204), the opening line of each error report (3213, 3221), the last argument of
// the Compare() call (3219) and a trailing statement (3226) are missing from this
// listing.
3198 {
     // Nothing to do unless the RNA carries a tRNA extension.
3199  if (!m_Feat.GetData().GetRna().IsSetExt() || !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3200  return;
3201  }
3202  if ( !m_Feat.GetData().GetRna().IsSetType() ||
3205  "tRNA data structure on non-tRNA feature");
3206  }
3207 
3208  const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt ().GetTRNA ();
3209  if ( trna.CanGetAnticodon () ) {
3210  const CSeq_loc& anticodon = trna.GetAnticodon();
3211  size_t anticodon_len = GetLength(anticodon, &m_Scope);
3212  if ( anticodon_len != 3 ) {
3214  "Anticodon is not 3 bases in length");
3215  }
     // The anticodon must be contained in, or identical to, the feature location.
3216  ECompare comp = sequence::Compare(anticodon,
3217  m_Feat.GetLocation(),
3218  &m_Scope,
3220  if ( comp != eContained && comp != eSame ) {
3222  "Anticodon location not in tRNA");
3223  }
     // Interval-level checks (order, strand, adjacency) are delegated.
3224  x_ValidateAnticodon(anticodon);
3225  }
3227 
3228 }
3229 
3230 
// tRNA-specific checks: unparsed anticodon/product qualifiers, a missing or string-only
// extension, and (for non-pseudo features on a "Bacteria; " lineage) location intervals
// that lie within 100 positions of each other.
// NOTE(review): the embedded numbering has gaps here (3231, 3234, 3242, 3248, 3258,
// 3261, 3265, 3307, 3320, 3322, 3328). The signature, the rest of the first condition,
// the opening line of every error report and the lookups that yield `gene`, `bsh` and
// `sd` are missing from this listing.
3232 {
3233  if (!m_Feat.GetData().GetRna().IsSetType() ||
3235  return;
3236  }
3237  const CRNA_ref& rna = m_Feat.GetData().GetRna();
3238 
3239  // check for unparsed qualifiers
3240  for (auto& gbqual : m_Feat.GetQual()) {
3241  if ( NStr::CompareNocase(gbqual->GetQual (), "anticodon") == 0 ) {
3243  "Unparsed anticodon qualifier in tRNA");
3244  } else if (NStr::CompareNocase (gbqual->GetQual (), "product") == 0 ) {
     // tRNA-fMet, tRNA-iMet and tRNA-Ile2 are tolerated as product qualifiers.
3245  if (NStr::CompareNocase (gbqual->GetVal (), "tRNA-fMet") != 0 &&
3246  NStr::CompareNocase (gbqual->GetVal (), "tRNA-iMet") != 0 &&
3247  NStr::CompareNocase (gbqual->GetVal (), "tRNA-Ile2") != 0) {
3249  "Unparsed product qualifier in tRNA");
3250  }
3251  }
3252  }
3253 
3254 
3255  /* tRNA with string extension */
3256  if ( rna.IsSetExt() &&
3257  rna.GetExt().Which () == CRNA_ref::C_Ext::e_Name ) {
3259  "Unparsed product qualifier in tRNA");
3260  } else if (!rna.IsSetExt() || rna.GetExt().Which() == CRNA_ref::C_Ext::e_not_set ) {
3262  "Missing encoded amino acid qualifier in tRNA");
3263  }
3264 
3266 
     // Walk consecutive intervals of the location, remembering the previous one, and
     // look for a pair whose endpoints are less than 100 positions apart.
     // NOTE(review): `li` is dereferenced before the loop without a validity test --
     // presumably the location always has at least one interval here; confirm.
3267  bool isLessThan100 = false;
3268  const CSeq_loc& loc = m_Feat.GetLocation();
3269  CSeq_loc_CI li(loc);
3270 
3271  TSeqPos last_start = li.GetRange().GetFrom();
3272  TSeqPos last_stop = li.GetRange().GetTo();
3273  CRef<CSeq_id> last_id(new CSeq_id());
3274  last_id->Assign(li.GetSeq_id());
3275 
3276  ++li;
3277  while (li) {
3278  TSeqPos this_start = li.GetRange().GetFrom();
3279  TSeqPos this_stop = li.GetRange().GetTo();
3280  if (abs ((int)this_start - (int)last_stop) < 100 || abs ((int)this_stop - (int)last_start) < 100) {
3281  if (li.GetSeq_id().Equals(*last_id)) {
3282  // definitely same bioseq, definitely report
3283  isLessThan100 = true;
3284  break;
3285  } else {
3286  // only report if definitely on same bioseq
3287  CBioseq_Handle last_bsh = m_Scope.GetBioseqHandle(*last_id);
3288  if (last_bsh) {
3289  for (auto id_it : last_bsh.GetId()) {
3290  if (id_it.GetSeqId()->Equals(li.GetSeq_id())) {
3291  isLessThan100 = true;
     // This break leaves only the ID loop; the interval loop keeps going with the
     // flag already set.
3292  break;
3293  }
3294  }
3295  }
3296  }
3297  }
3298  last_start = this_start;
3299  last_stop = this_stop;
3300  last_id->Assign(li.GetSeq_id());
3301  ++li;
3302  }
     // Pseudo status: the feature's own flag first; otherwise, when there is no gene
     // xref, the `gene` feature's flag and then its gene-ref; finally the pseudo flag
     // of whichever gene-ref was selected.
3303  bool pseudo = m_Feat.IsSetPseudo() && m_Feat.GetPseudo() ;
3304  if ( !pseudo ) {
3305  const CGene_ref* grp = m_Feat.GetGeneXref();
3306  if ( grp == NULL ) {
3308  if (gene) {
3309  pseudo = gene->IsSetPseudo() && gene->GetPseudo();
3310  if ( !pseudo ) {
3311  grp = &(gene->GetData().GetGene());
3312  }
3313  }
3314  }
3315  if ( !pseudo && grp != NULL ) {
     // NOTE(review): read without an IsSetPseudo() test, unlike the uses above.
3316  pseudo = grp->GetPseudo();
3317  }
3318  }
     // The short-gap report applies only when the source lineage starts with
     // "Bacteria; ".
3319  if (isLessThan100 && ! pseudo) {
3321  if (bsh) {
3323  if (sd) {
3324  const CSeqdesc::TSource& source = sd->GetSource();
3325  if (source.IsSetLineage()) {
3326  string lineage = source.GetLineage();
3327  if (NStr::StartsWith(lineage, "Bacteria; ")) {
3329  "tRNA intron in bacteria is less than 100 bp");
3330  }
3331  }
3332  }
3333  }
3334  }
3335 }
3336 
3337 
// Interval-level validation of a tRNA anticodon location: out-of-range intervals,
// interval order, adjacent intervals, duplicate intervals, strand consistency within the
// anticodon, and strand agreement with the tRNA feature.
// @param anticodon  the anticodon location taken from the tRNA extension.
// NOTE(review): the opening line of every error report (3359, 3387, 3403, 3410, 3413,
// 3428, 3432, 3436) and the `if` that opens the block at 3420 (line 3419) are missing
// from this listing.
3338 void CRNAValidator::x_ValidateAnticodon(const CSeq_loc& anticodon)
3339 {
     // Findings are collected in flags during the walk and reported afterwards.
3340  bool ordered = true;
3341  bool adjacent = false;
3342  bool unmarked_strand = false;
3343  bool mixed_strand = false;
3344 
3345  CSeq_loc_CI prev;
3346  for (CSeq_loc_CI curr(anticodon); curr; ++curr) {
     // Only interval and point pieces are range-checked; any other piece is skipped
     // entirely and does not become `prev`.
3347  bool chk = true;
3348  if (curr.GetEmbeddingSeq_loc().IsInt()) {
3349  chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetInt(), &m_Scope);
3350  } else if (curr.GetEmbeddingSeq_loc().IsPnt()) {
3351  chk = sequence::IsValid(curr.GetEmbeddingSeq_loc().GetPnt(), &m_Scope);
3352  } else {
3353  continue;
3354  }
3355 
3356  if ( !chk ) {
3357  string lbl;
3358  curr.GetEmbeddingSeq_loc().GetLabel(&lbl);
3360  "Anticodon location [" + lbl + "] out of range");
3361  }
3362 
     // Pairwise checks apply only to consecutive pieces on the same Bioseq.
3363  if ( prev && curr &&
3364  IsSameBioseq(curr.GetSeq_id(), prev.GetSeq_id(), &m_Scope) ) {
3365  CSeq_loc_CI::TRange prev_range = prev.GetRange();
3366  CSeq_loc_CI::TRange curr_range = curr.GetRange();
     // Order and adjacency are evaluated only while no ordering problem has been
     // seen; the comparison direction flips for the minus strand.
3367  if ( ordered ) {
3368  if ( curr.GetStrand() == eNa_strand_minus ) {
3369  if (prev_range.GetTo() < curr_range.GetTo()) {
3370  ordered = false;
3371  }
3372  if (curr_range.GetTo() + 1 == prev_range.GetFrom()) {
3373  adjacent = true;
3374  }
3375  } else {
3376  if (prev_range.GetTo() > curr_range.GetTo()) {
3377  ordered = false;
3378  }
3379  if (prev_range.GetTo() + 1 == curr_range.GetFrom()) {
3380  adjacent = true;
3381  }
3382  }
3383  }
3384  ENa_strand curr_strand = curr.GetStrand();
3385  ENa_strand prev_strand = prev.GetStrand();
3386  if ( curr_range == prev_range && curr_strand == prev_strand ) {
3388  "Duplicate anticodon exons in location");
3389  }
     // A plus/unknown pair is recorded separately from a real strand mix.
3390  if ( curr_strand != prev_strand ) {
3391  if (curr_strand == eNa_strand_plus && prev_strand == eNa_strand_unknown) {
3392  unmarked_strand = true;
3393  } else if (curr_strand == eNa_strand_unknown && prev_strand == eNa_strand_plus) {
3394  unmarked_strand = true;
3395  } else {
3396  mixed_strand = true;
3397  }
3398  }
3399  }
3400  prev = curr;
3401  }
3402  if (adjacent) {
3404  "Adjacent intervals in Anticodon");
3405  }
3406 
     // The anticodon must be on the minus strand exactly when the tRNA is.
3407  ENa_strand loc_strand = m_Feat.GetLocation().GetStrand();
3408  ENa_strand ac_strand = anticodon.GetStrand();
3409  if (loc_strand == eNa_strand_minus && ac_strand != eNa_strand_minus) {
3411  "Anticodon strand and tRNA strand do not match.");
3412  } else if (loc_strand != eNa_strand_minus && ac_strand == eNa_strand_minus) {
3414  "Anticodon strand and tRNA strand do not match.");
3415  }
3416 
3417  // trans splicing exception turns off both mixed_strand and out_of_order messages
3418  bool trans_splice = false;
3420  if (NStr::FindNoCase(m_Feat.GetExcept_text(), "trans-splicing") != NPOS) {
3421  trans_splice = true;
3422  }
3423  }
3424  if (!trans_splice) {
3425  string loc_lbl;
3426  anticodon.GetLabel(&loc_lbl);
3427  if (mixed_strand) {
3429  "Mixed strands in Anticodon [" + loc_lbl + "]");
3430  }
3431  if (unmarked_strand) {
3433  "Mixed plus and unknown strands in Anticodon [" + loc_lbl + "]");
3434  }
3435  if (!ordered) {
3437  "Intervals out of order in Anticodon [" + loc_lbl + "]");
3438  }
3439  }
3440 }
3441 
3442 
// Character codes accepted as a tRNA amino acid: 42 is ASCII '*', and 65 through 90 are
// the ASCII upper-case letters 'A' through 'Z' (27 values in total).
// NOTE(review): the array is neither `static` nor `const`, so it is a writable global
// with external linkage. Left untouched because other translation units may refer to
// it; confirm before tightening.
3443 int s_LegalNcbieaaValues[] = { 42, 65, 66, 67, 68, 69, 70, 71, 72, 73,
3444  74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
3445  84, 85, 86, 87, 88, 89, 90 };
3446 
/// Display labels for amino acids, looked up by numeric index in GetAAName().
/// The table has 28 entries (indices 0..27); index 0 is the gap label "---".
/// The order presumably follows the NCBIstdaa alphabet -- confirm before
/// inserting or reordering entries.
///
/// Both the characters and the pointers are const so that no entry can be
/// re-pointed at run time.
static const char* const kAANames[] = {
    "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
    "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
    "Val", "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM", "Pyl", "Xle"
};
3452 
3453 
// Returns the display label for an amino acid.
// @param aa        amino-acid code; used as an index into kAANames for the final lookup.
// @param is_ascii  selects the extra step in the `if` below.
// @return the kAANames entry for `aa`, or "OTHER" when `aa` is past the end of the
//         table or when a CException / std::exception is caught.
// NOTE(review): the body of the `if (is_ascii)` branch (3458-3459) is missing from this
// listing; it presumably converts an ASCII residue letter into the table index and is
// the reason for the try/catch -- confirm against the repository copy.
3454 const char* GetAAName(unsigned char aa, bool is_ascii)
3455 {
3456  try {
3457  if (is_ascii) {
3460  }
     // Bounds-checked lookup: anything outside the table maps to "OTHER".
3461  return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
3462  } catch (const CException& ) {
3463  return "OTHER";
3464  } catch (const std::exception& ) {
3465  return "OTHER";
3466  }
3467 }
3468 
3469 
3470 static string GetGeneticCodeName (int gcode)
3471 {
3472  const CGenetic_code_table& code_table = CGen_code_table::GetCodeTable();
3473  const list<CRef<CGenetic_code> >& codes = code_table.Get();
3474 
3475  for ( list<CRef<CGenetic_code> >::const_iterator code_it = codes.begin(), code_it_end = codes.end(); code_it != code_it_end; ++code_it ) {
3476  if ((*code_it)->GetId() == gcode) {
3477  return (*code_it)->GetName();
3478  }
3479  }
3480  return "unknown";
3481 }
3482 
3483 
3485 {
3486  if (!m_Feat.IsSetData() || !m_Feat.GetData().IsRna() ||
3487  !m_Feat.GetData().GetRna().IsSetExt() ||
3488  !m_Feat.GetData().GetRna().GetExt().IsTRNA()) {
3489  return;
3490  }
3491  const CTrna_ext& trna = m_Feat.GetData().GetRna().GetExt().GetTRNA();
3492 
3493  if (!trna.IsSetAa()) {
3494  PostErr (eDiag_Error, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3495  return;
3496  }
3497 
3498  unsigned char aa = 0, orig_aa;
3499  vector<char> seqData;
3500  string str;
3501 
3502  switch (trna.GetAa().Which()) {
3504  str = trna.GetAa().GetIupacaa();
3506  aa = seqData[0];
3507  break;
3509  str = trna.GetAa().GetNcbi8aa();
3511  aa = seqData[0];
3512  break;
3514  str = trna.GetAa().GetNcbi8aa();
3516  aa = seqData[0];
3517  break;
3519  seqData.push_back(trna.GetAa().GetNcbieaa());
3520  aa = seqData[0];
3521  break;
3522  default:
3523  NCBI_THROW (CCoreException, eCore, "Unrecognized tRNA aa coding");
3524  break;
3525  }
3526 
3527  // make sure the amino acid is valid
3528  bool found = false;
3529  for ( unsigned int i = 0; i < sizeof (s_LegalNcbieaaValues) / sizeof (int); ++i ) {
3530  if ( aa == s_LegalNcbieaaValues[i] ) {
3531  found = true;
3532  break;
3533  }
3534  }
3535  orig_aa = aa;
3536  if ( !found ) {
3537  aa = ' ';
3538  }
3539 
3540  if (m_Feat.GetData().GetRna().IsSetType() &&
3542  bool mustbemethionine = false;
3543  for (auto gbqual : m_Feat.GetQual()) {
3544  if (NStr::CompareNocase(gbqual->GetQual(), "product") == 0 &&
3545  (NStr::CompareNocase(gbqual->GetVal(), "tRNA-fMet") == 0 ||
3546  NStr::CompareNocase(gbqual->GetVal(), "tRNA-iMet") == 0)) {
3547  mustbemethionine = true;
3548  break;
3549  }
3550  }
3551  if (mustbemethionine) {
3552  if (aa != 'M') {
3553  string aanm = GetAAName(aa, true);
3555  "Initiation tRNA claims to be tRNA-" + aanm +
3556  ", but should be tRNA-Met");
3557  }
3558  }
3559  }
3560 
3561  // Retrive the Genetic code id for the tRNA
3562  int gcode = 1;
3563  if ( m_LocationBioseq ) {
3564  // need only the closest biosoure.
3566  if ( diter ) {
3567  gcode = diter->GetSource().GetGenCode();
3568  }
3569  }
3570 
3571  const string& ncbieaa = CGen_code_table::GetNcbieaa(gcode);
3572  if ( ncbieaa.length() != 64 ) {
3573  return;
3574  }
3575 
3576  string codename = GetGeneticCodeName (gcode);
3577  char buf[2];
3578  buf[0] = aa;
3579  buf[1] = 0;
3580  string aaname = buf;
3581  aaname += "/";
3582  aaname += GetAAName (aa, true);
3583 
3584  EDiagSev sev = (aa == 'U' || aa == 'O') ? eDiag_Warning : eDiag_Error;
3585 
3586  bool modified_codon_recognition = false;
3587  bool rna_editing = false;
3588  if ( m_Feat.IsSetExcept_text() ) {
3589  string excpt_text = m_Feat.GetExcept_text();
3590  if ( NStr::FindNoCase(excpt_text, "modified codon recognition") != NPOS ) {
3591  modified_codon_recognition = true;
3592  }
3593  if ( NStr::FindNoCase(excpt_text, "RNA editing") != NPOS ) {
3594  rna_editing = true;
3595  }
3596  }
3597 
3598  vector<string> recognized_codon_values;
3599  vector<unsigned char> recognized_taa_values;
3600 
3601  ITERATE( CTrna_ext::TCodon, iter, trna.GetCodon() ) {
3602  if (*iter == 255) continue;
3603  // test that codon value is in range 0 - 63
3604  if ( *iter > 63 ) {
3606  "tRNA codon value " + NStr::IntToString(*iter) +
3607  " is greater than maximum 63");
3608  continue;
3609  } else if (*iter < 0) {
3611  "tRNA codon value " + NStr::IntToString(*iter) +
3612  " is less than 0");
3613  continue;
3614  }
3615 
3616  if ( !modified_codon_recognition && !rna_editing ) {
3617  unsigned char taa = ncbieaa[*iter];
3618  string codon = CGen_code_table::IndexToCodon(*iter);
3619  recognized_codon_values.push_back (codon);
3620  recognized_taa_values.push_back (taa);
3621 
3622  if ( taa != aa ) {
3623  if ( (aa == 'U') && (taa == '*') && (*iter == 14) ) {
3624  // selenocysteine normally uses TGA (14), so ignore without requiring exception in record
3625  // TAG (11) is used for pyrrolysine in archaebacteria
3626  // TAA (10) is not yet known to be used for an exceptional amino acid
3627  } else {
3628  NStr::ReplaceInPlace (codon, "T", "U");
3629 
3631  "Codon recognized by tRNA (" + codon + ") does not match amino acid ("
3632  + aaname + ") specified by genetic code ("
3633  + NStr::IntToString (gcode) + "/" + codename + ")");
3634  }
3635  }
3636  }
3637  }
3638 
3639  // see if anticodon is compatible with codons recognized and amino acid
3640  string anticodon = "?";
3641  vector<string> codon_values;
3642  vector<unsigned char> taa_values;
3643 
3644  if (trna.IsSetAnticodon() && GetLength (trna.GetAnticodon(), &m_Scope) == 3) {
3645  try {
3646  anticodon = GetSequenceStringFromLoc(trna.GetAnticodon(), m_Scope);
3647  // get reverse complement sequence for location
3648  CRef<CSeq_loc> codon_loc(SeqLocRevCmpl(trna.GetAnticodon(), &m_Scope));
3649  string codon = GetSequenceStringFromLoc(*codon_loc, m_Scope);
3650  if (codon.length() > 3) {
3651  codon = codon.substr (0, 3);
3652  }
3653 
3654  // expand wobble base to known binding partners
3655  string wobble;
3656 
3657  char ch = anticodon.c_str()[0];
3658  switch (ch) {
3659  case 'A' :
3660  wobble = "ACT";
3661  break;
3662  case 'C' :
3663  wobble = "G";
3664  break;
3665  case 'G' :
3666  wobble = "CT";
3667  break;
3668  case 'T' :
3669  wobble = "AG";
3670  break;
3671  default :
3672  break;
3673  }
3674  if (!NStr::IsBlank(wobble)) {
3675  string::iterator str_it = wobble.begin();
3676  while (str_it != wobble.end()) {
3677  codon[2] = *str_it;
3678  int index = CGen_code_table::CodonToIndex (codon);
3679  if (index < 64 && index > -1) {
3680  unsigned char taa = ncbieaa[index];
3681  taa_values.push_back(taa);
3682  codon_values.push_back(codon);
3683  }
3684  ++str_it;
3685  }
3686  }
3687  NStr::ReplaceInPlace (anticodon, "T", "U");
3688  if (anticodon.length() > 3) {
3689  anticodon = anticodon.substr(0, 3);
3690  }
3691  } catch (const CException& ) {
3692  } catch (const std::exception& ) {
3693  }
3694 
3695  if (codon_values.size() > 0) {
3696  bool ok = false;
3697  // check that codons predicted from anticodon can transfer indicated amino acid
3698  for (size_t i = 0; i < codon_values.size(); i++) {
3699  if (!NStr::IsBlank (codon_values[i]) && aa == taa_values[i]) {
3700  ok = true;
3701  }
3702  }
3703  if (!ok) {
3704  if (aa == 'U' && NStr::Equal (anticodon, "UCA")) {
3705  // ignore TGA codon for selenocysteine
3706  } else if (aa == 'O' && NStr::Equal (anticodon, "CUA")) {
3707  // ignore TAG codon for pyrrolysine
3708  } else if (aa == 'I' && NStr::Equal (anticodon, "CAU")) {
3709  // ignore ATG predicted codon for Ile2
3710  } else if (!m_Feat.IsSetExcept_text()
3711  || (NStr::FindNoCase(m_Feat.GetExcept_text(), "modified codon recognition") == string::npos
3712  &&NStr::FindNoCase(m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3714  "Codons predicted from anticodon (" + anticodon
3715  + ") cannot produce amino acid (" + aaname + ")");
3716  }
3717  }
3718 
3719  // check that codons recognized match codons predicted from anticodon
3720  if (recognized_codon_values.size() > 0) {
3721  ok = false;
3722  for (size_t i = 0; i < codon_values.size() && !ok; i++) {
3723  for (size_t j = 0; j < recognized_codon_values.size() && !ok; j++) {
3724  if (NStr::Equal (codon_values[i], recognized_codon_values[j])) {
3725  ok = true;
3726  } else if ( NStr::Equal (codon_values[i], "ATG") && aa == 'I') {
3727  // allow ATG recognized codon (pre-RNA-editing) for Ile2
3728  ok = true;
3729  }
3730  }
3731  }
3732  if (!ok
3733  && (!m_Feat.IsSetExcept_text()
3734  || NStr::FindNoCase (m_Feat.GetExcept_text(), "RNA editing") == string::npos)) {
3736  "Codon recognized cannot be produced from anticodon ("
3737  + anticodon + ")");
3738  }
3739  }
3740  }
3741  }
3742 
3743  if (!m_Feat.IsSetPseudo() || !m_Feat.GetPseudo()) {
3744  if (orig_aa == 0 || orig_aa == 255) {
3745  PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Missing tRNA amino acid");
3746  } else {
3747  // verify that legal amino acid is indicated
3748  unsigned int idx;
3749  if (aa != '*') {
3750  idx = aa - 64;
3751  } else {
3752  idx = 25;
3753  }
3754  if (idx == 0 || idx >= 28) {
3755  PostErr (sev, eErr_SEQ_FEAT_BadTrnaAA, "Invalid tRNA amino acid");
3756  }
3757  }
3758  }
3759 }
3760 
3761 
// Overlap checks for an RNA feature; the surviving message strings read
// "tRNA-rRNA overlap" and "tRNA overlaps CDS".
// NOTE(review): this listing has lost several lines of the function: the
// signature, the second half of the RNA-type guard, the call that fills
// `scores`, the error-posting calls and the definition of `cds`. The comments
// here describe only what is still visible.
3763 {
3764  if (!m_Feat.GetData().GetRna().IsSetType() ||
3766  return;
3767  }
3768  TFeatScores scores;
3773  scores, m_Scope);
// Walk the collected features; the first one whose intersection with this
// feature's location has a length of 5 or more marks the overlap as bad and
// ends the scan.
// NOTE(review): `auto it` takes each element by value.
3774  bool found_bad = false;
3775  for (auto it : scores) {
3776  CRef<CSeq_loc> intersection = it.second->GetLocation().Intersect(m_Feat.GetLocation(),
3777  0 /* flags*/,
3778  nullptr /* synonym mapper */);
3779  if (intersection) {
3780  TSeqPos length = sequence::GetLength(*intersection, &m_Scope);
3781  if (length >= 5) {
3782  found_bad = true;
3783  break;
3784  }
3785  }
3786  }
3787  if (found_bad) {
3789  "tRNA-rRNA overlap");
3790  }
// `cds` is defined on a line missing from this listing; a set value leads to
// the "tRNA overlaps CDS" message.
3794  if (cds) {
3796  "tRNA overlaps CDS");
3797  }
3798 }
3799 
3800 
// Obtains the mRNA translation problem flags and the mismatch count for m_Feat
// from GetMRNATranslationProblems, then passes both to
// x_ReportRNATranslationProblems.
// NOTE(review): the signature and two argument lines of the call are missing
// from this listing.
3802 {
3803  size_t mismatches = 0;
3804  size_t problems = GetMRNATranslationProblems
3805  (m_Feat, mismatches, m_Imp.IgnoreExceptions(),
3808  m_Imp.IsGenomic(), &m_Scope);
3809  x_ReportRNATranslationProblems(problems, mismatches);
3810 }
3811 
3812 
// Turns a bit mask of eMRNAProblem_* flags into validator messages.
//
// problems   - bit mask tested against the eMRNAProblem_* constants below.
// mismatches - mismatch count quoted in the mismatch and
//              erroneous-exception messages.
//
// NOTE(review): the opening line of every error-posting call, the loop header
// over the bioseq ids, part of the partial-feature condition and the
// declaration of `feat_len` are missing from this listing.
3813 void CRNAValidator::x_ReportRNATranslationProblems(size_t problems, size_t mismatches)
3814 {
3815  if (problems & eMRNAProblem_TransFail) {
3817  "Unable to transcribe mRNA");
3818  }
3819  if (problems & eMRNAProblem_UnableToFetch) {
3820  const CSeq_id& product_id = GetId(m_Feat.GetProduct(), &m_Scope);
3821  string label = product_id.AsFastaString();
3823  "Unable to fetch mRNA transcript '" + label + "'");
3824  }
3825 
// RefSeq handling applies when the validator is configured for RefSeq
// conventions, or when an id visited in the loop below reports IsOther().
3826  bool is_refseq = m_Imp.IsRefSeqConventions();
3827  if (m_LocationBioseq) {
3829  if ((*it)->IsOther()) {
3830  is_refseq = true;
3831  break;
3832  }
3833  }
3834  }
3835 
3837 
// `farstr` is spliced into the length/mismatch messages; `sev` starts at Error
// and is lowered to Warning in the far-product cases below.
3838  string farstr;
3839  EDiagSev sev = eDiag_Error;
3840 
3841  // if not local bioseq product, lower severity (with the exception of Refseq)
3842  if (m_ProductIsFar && !is_refseq) {
3843  sev = eDiag_Warning;
3844  }
3845  if (m_ProductIsFar) {
3846  farstr = "(far) ";
3847  if (m_Feat.IsSetPartial()
3850  sev = eDiag_Warning;
3851  }
3852  }
3853 
3854  if (problems & eMRNAProblem_TranscriptLenLess) {
3856  "Transcript length [" + NStr::SizetToString(feat_len) +
3857  "] less than " + farstr + "product length [" +
3858  NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], and tail < 95% polyA");
3859  }
3860 
3861  if (problems & eMRNAProblem_PolyATail100) {
3863  "Transcript length [" + NStr::SizetToString(feat_len)
3864  + "] less than " + farstr + "product length ["
3865  + NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail is 100% polyA");
3866  }
3867  if (problems & eMRNAProblem_PolyATail95) {
3869  "Transcript length [" + NStr::SizetToString(feat_len) +
3870  "] less than " + farstr + "product length [" +
3871  NStr::SizetToString(m_ProductBioseq.GetInst_Length()) + "], but tail >= 95% polyA");
3872  }
// NOTE(review): this message formats feat_len with IntToString while every
// other message here uses SizetToString; feat_len's declaration is not visible
// in this listing, so confirm which conversion matches its type.
3873  if (problems & eMRNAProblem_TranscriptLenMore) {
3875  "Transcript length [" + NStr::IntToString(feat_len) + "] " +
3876  "greater than " + farstr + "product length [" +
3878  }
// The mismatch message is produced only for a non-zero mismatch count.
3879  if ((problems & eMRNAProblem_Mismatch) && mismatches > 0) {
3881  "There are " + NStr::SizetToString(mismatches) +
3882  " mismatches out of " + NStr::SizetToString(feat_len) +
3883  " bases between the transcript and " + farstr + "product sequence");
3884  }
3885  if (problems & eMRNAProblem_UnnecessaryException) {
3887  "mRNA has exception but passes transcription test");
3888  }
// The base count quoted here is the smaller of the feature length and the
// product bioseq length.
3889  if (problems & eMRNAProblem_ErroneousException) {
3890  size_t total = min(feat_len, m_ProductBioseq.GetInst_Length());
3892  "mRNA has unclassified exception but only difference is " + NStr::SizetToString(mismatches)
3893  + " mismatches out of " + NStr::SizetToString(total) + " bases");
3894  }
3895  if (problems & eMRNAProblem_ProductReplaced) {
3897  "mRNA has transcribed product replaced exception");
3898  }
3899 }
3900 
3901 
// Constructor fragment: the initializer forwards feat, scope and imp to
// CRNAValidator. In the visible body, m_GeneIsPseudo is set to false when
// m_Gene is not set.
// NOTE(review): the signature line, the statement executed when m_Gene is set,
// and the statements before and after the if/else are missing from this listing.
3903 CRNAValidator(feat, scope, imp)
3904 {
3906  if (m_Gene) {
3908  } else {
3909  m_GeneIsPseudo = false;
3910  }
3912 }
3913 
3914 
// Validation entry point fragment: runs x_ValidateMrna(), then a further step
// that executes only when neither m_GeneIsPseudo nor m_FeatIsPseudo is true.
// NOTE(review): the signature and every other call in this body are missing
// from this listing.
3916 {
3918 
3921 
3922  x_ValidateMrna();
3923 
3924  if (!m_GeneIsPseudo && !m_FeatIsPseudo) {
3926  }
3928 }
3929 
3930 
// mRNA-specific checks: splice validation, disallowed GenBank qualifiers, and
// sanity checks on the RNA name.
// NOTE(review): the signature (presumably x_ValidateMrna, which is called
// elsewhere in this file) and the opening lines of the error-posting calls are
// missing from this listing.
3932 {
// Pseudo status comes from the gene first and falls back to the feature itself.
3933  bool pseudo = m_GeneIsPseudo;
3934  if (!pseudo) {
3935  pseudo = s_IsPseudo(m_Feat);
3936  }
3937  ValidateSplice(pseudo, false);
3938 
3939  const CRNA_ref& rna = m_Feat.GetData().GetRna();
3940 
// protein_id and transcript_id are matched case-insensitively among the
// feature's qualifiers; each has its own message.
3941  if (m_Feat.IsSetQual()) {
3942  for (auto it : m_Feat.GetQual()) {
3943  const CGb_qual& qual = *it;
3944  if (qual.CanGetQual()) {
3945  const string& key = qual.GetQual();
3946  if (NStr::EqualNocase(key, "protein_id")) {
3948  "protein_id should not be a gbqual on an mRNA feature");
3949  }
3950  else if (NStr::EqualNocase(key, "transcript_id")) {
3952  "transcript_id should not be a gbqual on an mRNA feature");
3953  }
3954  }
3955  }
3956  }
3957 
// A name starting with "transfer RNA " suggests a mislabelled tRNA, unless it
// is one of the two enzyme names excluded below. The name is also checked for
// invalid characters and for SGML.
3958  if (rna.IsSetExt() && rna.GetExt().IsName()) {
3959  const string& rna_name = rna.GetExt().GetName();
3960  if (NStr::StartsWith(rna_name, "transfer RNA ") &&
3961  (!NStr::EqualNocase(rna_name, "transfer RNA nucleotidyltransferase")) &&
3962  (!NStr::EqualNocase(rna_name, "transfer RNA methyltransferase"))) {
3964  "mRNA feature product indicates it should be a tRNA feature");
3965  }
3966  ValidateCharactersInField(rna_name, "mRNA name");
3967  if (ContainsSgml(rna_name)) {
3969  "mRNA name " + rna_name + " has SGML");
3970  }
3971  }
3972 }
3973 
3974 
// Checks the product of an mRNA feature. Does nothing when the feature has no
// product.
// NOTE(review): the signature, the definitions of `seh` and `mrna`, part of
// the set-class condition and the error-posting call openers are missing from
// this listing.
3976 {
3977  if (!m_Feat.IsSetProduct()) {
3978  return;
3979  }
// Product bioseq not available: when the location bioseq is set and the
// entry `seh` is a set of one of the tested classes, the "not packaged in the
// record" message applies.
3980  if ( !m_ProductBioseq) {
3981  if (m_LocationBioseq) {
3983  if (seh.IsSet() && seh.GetSet().IsSetClass()
3985  || seh.GetSet().GetClass() == CBioseq_set::eClass_other)) {
3987  "Product Bioseq of mRNA feature is not "
3988  "packaged in the record");
3989  }
3990  }
3991  } else {
3992 
// Product bioseq available: `mrna` being set and different from this feature
// leads to the duplicate transcript ID message.
3993  //CConstRef<CSeq_feat> mrna = m_Imp.GetmRNAGivenProduct (*(m_ProductBioseq.GetCompleteBioseq()));
3995  if (mrna && mrna.GetPointer() != &m_Feat) {
3997  "Identical transcript IDs found on multiple mRNAs");
3998  }
3999  }
4000 }
4001 
4002 
4003 static string s_GetGeneRefFields(const CGene_ref& gene, int field)
4004 {
4005  if (field == 1 && gene.CanGetLocus()) {
4006  return gene.GetLocus();
4007  }
4008  if (field == 2 && gene.CanGetAllele()) {
4009  return gene.GetAllele();
4010  }
4011  if (field == 3 && gene.CanGetDesc()) {
4012  return gene.GetLocus();
4013  }
4014  if (field == 4 && gene.CanGetLocus_tag()) {
4015  return gene.GetLocus_tag();
4016  }
4017  return "";
4018 }
4019 
4020 
// NOTE(review): the signature, the definition of `mrna_gene` and the
// error-posting call openers are missing from this listing.
4021 // check that there is no conflict between the gene on the genomic
4022 // and the gene on the mrna.
4024 {
4025  if (!m_ProductBioseq) {
4026  return;
4027  }
// The genomic-side gene reference comes from m_Gene when set, otherwise from
// the feature's gene xref; with neither there is nothing to compare.
4028  const CGene_ref* genomicgrp = nullptr;
4029  if (m_Gene) {
4030  genomicgrp = &(m_Gene->GetData().GetGene());
4031  } else {
4032  genomicgrp = m_Feat.GetGeneXref();
4033  }
4034  if (!genomicgrp) {
4035  return;
4036  }
4038  if ( mrna_gene ) {
4039  const CGene_ref& mrnagrp = mrna_gene->GetData().GetGene();
4040  bool found_match = false;
4041  bool found_mismatch = false;
// Compare fields 1..4 (as returned by s_GetGeneRefFields) pairwise; only
// pairs where both sides are non-empty count as a match or a mismatch.
4042  for (int i = 1; i <= 4; i++) {
4043  string gen = s_GetGeneRefFields(*genomicgrp, i);
4044  string rna = s_GetGeneRefFields(mrnagrp, i);
4045  if (gen != "" && rna != "") {
4046  if (gen == rna) {
4047  found_match = true;
4048  } else {
4049  found_mismatch = true;
4050  }
4051  }
4052  }
// Two distinct messages: a mix of matching and mismatching fields, or
// mismatches only. Both are attached to the mRNA-side gene feature.
4053  if (found_match) {
4054  if (found_mismatch) {
4056  "Found match and mismatch between gene on mRNA bioseq and gene on genomic bioseq",
4057  mrna_gene->GetOriginalFeature());
4058  }
4059  } else if (found_mismatch) {
4061  "Gene on mRNA bioseq does not match gene on genomic bioseq",
4062  mrna_gene->GetOriginalFeature());
4063  }
4064  }
4065 }
4066 
4067 
4069 {
4072 }
4073 
4074 
// Checks a BioSource feature: the focus flag on the feature itself, and
// taxname agreement with the source referenced by `dbsrc_i`.
// NOTE(review): the signature, the definition of `dbsrc_i` and the
// error-posting call openers are missing from this listing.
4076 {
4078  const CBioSource& bsrc = m_Feat.GetData().GetBiosrc();
4079  if ( bsrc.IsSetIs_focus() ) {
4081  "Focus must be on BioSource descriptor, not BioSource feature.");
4082  }
4083 
4085 
// Nothing more to check when `dbsrc_i` yields no source.
4087  if ( !dbsrc_i ) {
4088  return;
4089  }
4090 
4091  const COrg_ref& org = bsrc.GetOrg();
4092  const CBioSource& dbsrc = dbsrc_i->GetSource();
4093  const COrg_ref& dorg = dbsrc.GetOrg();
4094 
// Taxnames are compared case-insensitively; a difference is acceptable only
// when the other source has the focus flag set or is transgenic.
4095  if ( org.CanGetTaxname() && !org.GetTaxname().empty() &&
4096  dorg.CanGetTaxname() ) {
4097  string taxname = org.GetTaxname();
4098  string dtaxname = dorg.GetTaxname();
4099  if ( NStr::CompareNocase(taxname, dtaxname) != 0 ) {
4100  if ( !dbsrc.IsSetIs_focus() && !m_Imp.IsTransgenic(dbsrc) ) {
4102  "BioSource descriptor must have focus or transgenic "
4103  "when BioSource feature with different taxname is "
4104  "present.");
4105  }
4106  }
4107  }
4108 }
4109 
4110 
// PolyA_site check: the feature should be a single point, i.e. `range` must
// start and end at the same position. Severity is Warning, raised to Error
// when m_Imp.IsRefSeq() is true.
// NOTE(review): the signature, the definition of `range` and the
// error-posting call opener are missing from this listing.
4112 {
4115  if ( range.GetFrom() != range.GetTo() ) {
4116  EDiagSev sev = eDiag_Warning;
4117  if (m_Imp.IsRefSeq()) {
4118  sev = eDiag_Error;
4119  }
4121  "PolyA_site should be a single point");
4122  }
4123 
4124 }
4125 
4126 
// PolyA_signal check: the feature should span a range, so a `range` whose
// start equals its end is reported. Severity is Warning, raised to Error when
// m_Imp.IsRefSeq() is true.
// NOTE(review): the signature and the definition of `range` are missing from
// this listing.
4128 {
4130  if ( range.GetFrom() == range.GetTo() ) {
4131  EDiagSev sev = eDiag_Warning;
4132  if (m_Imp.IsRefSeq()) {
4133  sev = eDiag_Error;
4134  }
4135  PostErr (sev, eErr_SEQ_FEAT_PolyAsignalNotRange, "PolyA_signal should be a range");
4136  }
4137 }
4138 
4139 
// Constructor fragment: the initializer forwards feat, scope and imp to
// CSingleFeatValidator.
// NOTE(review): the signature line and the single body statement are missing
// from this listing.
4141  CSingleFeatValidator(feat, scope, imp)
4142 {
4144 }
4145 
4146 
// Validation entry point fragment for peptide processing features. One of two
// messages is selected: the first when the record is EMBL or DDBJ
// (m_Imp.IsEmbl() || m_Imp.IsDdbj()), the second otherwise.
// NOTE(review): the signature, the first and last statements of the body and
// the error-posting call openers are missing from this listing.
4148 {
4150 
4151  if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
4153  "sig/mat/transit_peptide feature cannot be associated with a "
4154  "protein product of a coding region feature");
4155  } else {
4157  "Peptide processing feature should be converted to the "
4158  "appropriate protein feature subtype");
4159  }
4161 }
4162 
4163 
// Reports peptide features whose start and/or stop are out of frame with the
// codons of the coding region m_CDS. Does nothing when m_CDS is not set.
// NOTE(review): the signature, the definition of `in_frame`, every `case`
// label of the switch and the error-posting call openers are missing from this
// listing, so which message belongs to which frame state cannot be read here.
4165 {
4166  if (!m_CDS) {
4167  return;
4168  }
4169 
4170  const string& key = m_Feat.GetData().GetImp().GetKey();
4171 
// A sig_peptide whose frame state is eLocationInFrame_NotIn is accepted
// silently.
4173  if (NStr::Equal(key, "sig_peptide") && in_frame == feature::eLocationInFrame_NotIn) {
4174  return;
4175  }
4176  switch (in_frame) {
4178  if (NStr::Equal(key, "sig_peptide")) {
4179  // ignore
4180  } else {
4182  "Start and stop of " + key + " are out of frame with CDS codons");
4183  }
4184  break;
4187  "Start and stop of " + key + " are out of frame with CDS codons");
4188  break;
4191  "Start of " + key + " is out of frame with CDS codons");
4192  break;
4195  "Stop of " + key + " is out of frame with CDS codons");
4196  break;
4198  break;
4199  }
4200 }
4201 
4202 
// Validation entry point fragment: determines whether the feature, or failing
// that `gene`, is pseudo, then runs ValidateSplice(pseudo, true) when
// m_Imp.IsValidateExons() is true.
// NOTE(review): the signature, the first body statement and the definition of
// `gene` are missing from this listing.
4204 {
4206  bool feat_pseudo = s_IsPseudo(m_Feat);
4207  bool pseudo = feat_pseudo;
4208  if (!pseudo) {
4210  if (gene) {
4211  pseudo = s_IsPseudo(*gene);
4212  }
4213  }
4214  if (m_Imp.IsValidateExons()) {
4215  ValidateSplice(pseudo, true);
4216  }
4217 }
4218 
4219 
// Intron validation: reports short introns, then checks the two bases at each
// end of the intron against the splice donor (GT) and acceptor (AG) consensus.
// NOTE(review): the signature, the definitions of `gene`, `vec` and `seq_len`,
// the feature lookups that fill `scores`, the organelle test and the
// error-posting call openers are missing from this listing.
4221 {
// Pseudo status: the feature itself first, then `gene` when one is set.
4223  bool feat_pseudo = s_IsPseudo(m_Feat);
4224  bool pseudo = feat_pseudo;
4225  if (!pseudo) {
4227  if (gene) {
4228  pseudo = s_IsPseudo(*gene);
4229  }
4230  }
4231 
4232  if (x_IsIntronShort(pseudo)) {
4234  "Introns should be at least 10 nt long");
4235  }
4236 
// An exception text containing "nonconsensus splice site" (case-insensitive)
// skips the splice checks.
4238  && NStr::FindNoCase (m_Feat.GetExcept_text(), "nonconsensus splice site") != string::npos) {
4239  return;
4240  }
4241 
4242  const CSeq_loc& loc = m_Feat.GetLocation();
4243 
// With both ends partial there is no end left to check.
4244  bool partial5 = loc.IsPartialStart(eExtreme_Biological);
4245  bool partial3 = loc.IsPartialStop(eExtreme_Biological);
4246  if (partial5 && partial3) {
4247  return;
4248  }
4249 
4250  // suppress if contained by rRNA - different consensus splice site
4251  TFeatScores scores;
4256  scores, m_Scope);
4257  if (scores.size() > 0) {
4258  return;
4259  }
4260 
4261  // suppress if contained by tRNA - different consensus splice site
4262  scores.clear();
4267  scores, m_Scope);
4268  if (scores.size() > 0) {
4269  return;
4270  }
4271 
4272  // skip if more than one bioseq
4273  if (!IsOneBioseq(loc, &m_Scope)) {
4274  return;
4275  }
4276 
4277  // skip if organelle
4279  return;
4280  }
4281 
// `label` is the label of the first id of the location bioseq; it is quoted
// in the messages below.
4283  string label;
4284  m_LocationBioseq.GetId().front().GetSeqId()->GetLabel(&label);
4286 
4287  ENa_strand strand = loc.GetStrand();
4288 
// Any strand other than minus or plus is treated as plus.
4289  if (eNa_strand_minus != strand && eNa_strand_plus != strand) {
4290  strand = eNa_strand_plus;
4291  }
4292 
4293  bool donor_in_gap = false;
4294  bool acceptor_in_gap = false;
4295 
// end5 / end3 are the biological start and stop of the intron; an end lying
// in a gap has no bases to inspect.
4296  TSeqPos end5 = loc.GetStart (eExtreme_Biological);
4297  if (vec.IsInGap(end5)) {
4298  donor_in_gap = true;
4299  }
4300 
4301  TSeqPos end3 = loc.GetStop (eExtreme_Biological);
4302  if (vec.IsInGap(end3)) {
4303  acceptor_in_gap = true;
4304  }
4305 
4306  if (!partial5 && !partial3) {
4307  if (donor_in_gap && acceptor_in_gap) {
4308  return;
4309  }
4310  }
4311 
4312  Char donor[2]; // donor site signature
4313  Char acceptor[2]; // acceptor site signature
4314  bool donor_good = false; // flag == "true" indicates that donor signature is in @donor
4315  bool acceptor_good = false; // flag == "true" indicates that acceptor signature is in @acceptor
4316 
// On the minus strand the donor pair is read at positions end5-1 and end5; on
// the plus strand at end5 and end5+1. The bounds tests keep both indices
// inside the sequence.
// NOTE(review): `seq_len - 1` relies on seq_len being non-zero; its
// declaration is not visible in this listing.
4317  // Read donor signature into @donor
4318  if (!partial5 && !donor_in_gap) {
4319  if (eNa_strand_minus == strand) {
4320  if (end5 > 0 && IsResidue (vec[end5 - 1]) && IsResidue (vec[end5])) {
4321  donor[0] = vec[end5 - 1];
4322  donor[1] = vec[end5];
4323  donor_good = true;
4324  }
4325  }
4326  else {
4327  if( end5 < seq_len - 1 && IsResidue (vec[end5]) && IsResidue (vec[end5 + 1])) {
4328  donor[0] = vec[end5];
4329  donor[1] = vec[end5 + 1];
4330  donor_good = true;
4331  }
4332  }
4333  }
4334 
// Mirror image of the donor read: end3 and end3+1 on the minus strand, end3-1
// and end3 on the plus strand.
4335  // Read acceptor signature into @acceptor
4336  if (!partial3 && !acceptor_in_gap) {
4337  if (eNa_strand_minus == strand) {
4338  if (end3 < seq_len - 1 && IsResidue (vec[end3]) && IsResidue (vec[end3 + 1])) {
4339  acceptor[0] = vec[end3];
4340  acceptor[1] = vec[end3 + 1];
4341  acceptor_good = true;
4342  }
4343  }
4344  else {
4345  if (end3 > 0 && IsResidue (vec[end3 - 1]) && IsResidue (vec[end3])) {
4346  acceptor[0] = vec[end3 - 1];
4347  acceptor[1] = vec[end3];
4348  acceptor_good = true;
4349  }
4350  }
4351  }
4352 
// When both signatures were read and CheckIntronSpliceSites accepts the pair,
// the individual end checks below are skipped.
4353  // Check intron's both ends.
4354  if (!partial5 && !partial3) {
4355  if (donor_good && acceptor_good) {
4356  if (CheckIntronSpliceSites(strand, donor, acceptor)) {
4357  return;
4358  }
4359  }
4360  }
4361 
4362  // Check 5'-most
4363  if (!partial5) {
4364  if (!donor_in_gap) {
4365  bool not_found = true;
4366 
4367  if (donor_good) {
4368  if (CheckIntronDonor(strand, donor)) {
4369  not_found = false;
4370  }
4371  }
4372  //
// The "terminal intron" wording is used when the donor end sits on the first
// position of the sequence (plus strand) or the last one (minus strand).
// Positions in the messages are 1-based (end5 + 1).
4373  if (not_found) {
4374  if ((strand == eNa_strand_minus && end5 == seq_len - 1) ||
4375  (strand == eNa_strand_plus && end5 == 0)) {
4376 
4378  "Splice donor consensus (GT) not found at start of terminal intron, position "
4379  + NStr::IntToString (end5 + 1) + " of " + label);
4380  }
4381  else {
4383  "Splice donor consensus (GT) not found at start of intron, position "
4384  + NStr::IntToString (end5 + 1) + " of " + label);
4385  }
4386  }
4387  }
4388  }
4389 
4390  // Check 3'-most
4391  if (!partial3) {
4392  if (!acceptor_in_gap) {
4393  bool not_found = true;
4394 
4395  if (acceptor_good) {
4396  if (CheckIntronAcceptor(strand, acceptor)) {
4397  not_found = false;
4398  }
4399  }
4400 
// Same terminal test for the acceptor end: position 0 on the minus strand, the
// last position on the plus strand.
4401  if (not_found) {
4402  if ((strand == eNa_strand_minus && end3 == 0) ||
4403  (strand == eNa_strand_plus && end3 == seq_len - 1)) {
4405  "Splice acceptor consensus (AG) not found at end of terminal intron, position "
4406  + NStr::IntToString (end3 + 1) + " of " + label + ", but at end of sequence");
4407  }
4408  else {
4410  "Splice acceptor consensus (AG) not found at end of intron, position "
4411  + NStr::IntToString (end3 + 1) + " of " + label);
4412  }
4413  }
4414  }
4415  }
4416 
4417 }
4418 
4419 
// Returns true when the feature's location is shorter than 11 and is not
// excused by being partial at the very start or end of the sequence. Returns
// false when the feature has no data or location, when `pseudo` is true, or
// (outside indexer mode) when the location bioseq is unset or an organelle.
// NOTE(review): the signature, one line of the opening guard and the
// expression the stop position is compared with are missing from this listing.
4421 {
4422  if (!m_Feat.IsSetData()
4424  || !m_Feat.IsSetLocation()
4425  || pseudo) {
4426  return false;
4427  }
4428 
4429  const CSeq_loc& loc = m_Feat.GetLocation();
4430  bool is_short = false;
4431 
4432  if (! m_Imp.IsIndexerVersion()) {
4433  if (!m_LocationBioseq || IsOrganelle(m_LocationBioseq)) return is_short;
4434  }
4435 
4436  if (GetLength(loc, &m_Scope) < 11) {
4437  bool partial_left = loc.IsPartialStart(eExtreme_Positional);
4438  bool partial_right = loc.IsPartialStop(eExtreme_Positional);
4439 
// NOTE(review): `bsh` is not used in any line visible in this listing.
4440  CBioseq_Handle bsh;
4441  if (partial_left && loc.GetStart(eExtreme_Positional) == 0) {
4442  // partial at beginning of sequence, ok
4443  } else if (partial_right &&
4444  (m_LocationBioseq) &&
4445  loc.GetStop(eExtreme_Positional) == (
4447  {
4448  // partial at end of sequence
4449  } else {
4450  is_short = true;
4451  }
4452  }
4453  return is_short;
4454 }
4455 
4456 
// misc_feature checks. The first message applies when (among conditions not
// visible here) the feature has neither qualifiers nor dbxrefs. The second
// applies when the comment contains the word "cspA" and `cds` has the content
// label "cold-shock protein".
// NOTE(review): the signature, the start of the first condition, the guard
// around the comment test, the definition of `cds` and the error-posting call
// openers are missing from this listing.
4458 {
4461  && (!m_Feat.IsSetQual() || m_Feat.GetQual().empty())
4462  && (!m_Feat.IsSetDbxref() || m_Feat.GetDbxref().empty())) {
4464  "A note or other qualifier is required for a misc_feature");
4465  }
4467  if (NStr::FindWord(m_Feat.GetComment(), "cspA") != NPOS) {
4469  if (cds) {
4470  string content_label;
4471  feature::GetLabel(*cds, &content_label, feature::fFGL_Content, &m_Scope);
4472  if (NStr::Equal(content_label, "cold-shock protein")) {
4474  "cspA misc_feature overlapped by cold-shock protein CDS");
4475  }
4476  }
4477  }
4478  }
4479 
4480 }
4481 
4482 
// assembly_gap checks: the feature should sit on a delta sequence made only of
// location segments, should not be flanked by N, and should cover only N.
// NOTE(review): the signature, the guard and the definition of `repr`, the
// definition of `vec` and the error-posting call openers are missing from this
// listing.
4484 {
4486 
// is_far_delta ends up true only for a delta representation in which every
// non-null segment is a location (CDelta_seq::e_Loc).
4487  bool is_far_delta = false;
4490  if ( repr == CSeq_inst::eRepr_delta ) {
4491  is_far_delta = true;
4492  const CBioseq& seq = *(m_LocationBioseq.GetCompleteBioseq());
4493  const CSeq_inst& inst = seq.GetInst();
4494  ITERATE(CDelta_ext::Tdata, sg, inst.GetExt().GetDelta().Get()) {
4495  if ( !(*sg) ) continue;
4496  if (( (**sg).Which() == CDelta_seq::e_Loc )) continue;
4497  is_far_delta = false;
4498  }
4499  }
4500  }
4501  if (! is_far_delta) {
4503  "An assembly_gap feature should only be on a contig record");
4504  }
// The flank and content checks run only for a simple interval location.
4505  if (m_Feat.GetLocation().IsInt()) {
4506  TSeqPos from = m_Feat.GetLocation().GetInt().GetFrom();
4507  TSeqPos to = m_Feat.GetLocation().GetInt().GetTo();
4509  string sequence;
4510  bool is5 = false;
4511  bool is3 = false;
4512  long int count = 0;
// Read the single residue just before and just after the interval.
// NOTE(review): when `from` is 0, `from - 1` would wrap if TSeqPos is an
// unsigned type, and `to + 2` may lie past the end of the sequence; confirm
// how GetSeqData treats such ranges.
4513  vec.GetSeqData(from - 1, from, sequence);
4514  if (NStr::Equal (sequence, "N")) {
4515  is5 = true;
4516  }
4517  vec.GetSeqData(to + 1, to + 2, sequence);
4518  if (NStr::Equal (sequence, "N")) {
4519  is3 = true;
4520  }
4521  EDiagSev sv = eDiag_Warning;
4522  if (m_Imp.IsGenomeSubmission()) {
4523  sv = eDiag_Error;
4524  }
4525  if (is5 && is3) {
4527  "Assembly_gap flanked by Ns on 5' and 3' sides");
4528  } else if (is5) {
4530  "Assembly_gap flanked by Ns on 5' side");
4531  } else if (is3) {
4533  "Assembly_gap flanked by Ns on 3' side");
4534  }
// Count the characters inside the interval that are not 'N'; any such
// character means the gap extends into real sequence.
4535  vec.GetSeqData(from, to + 1, sequence);
4536  for (size_t i = 0; i < sequence.size(); i++) {
4537  if (sequence[i] != 'N') {
4538  count++;
4539  }
4540  }
4541  if (count > 0) {
4542  PostErr(eDiag_Error, eErr_SEQ_FEAT_AssemblyGapCoversSequence, "Assembly_gap extends into sequence");
4543  }
4544  }
4545 }
4546 
4547 
// Gap feature checks: the estimated_length qualifier must agree with the
// feature length, and the feature must cover gap characters only.
// NOTE(review): the signature, the loop header over the qualifiers, the
// definition of `vec` and the error-posting call openers are missing from this
// listing.
4549 {
4551  int loc_len = GetLength (m_Feat.GetLocation(), &m_Scope);
4552  // look for estimated length qualifier
// An estimated_length whose value is "unknown" (case-insensitive) is skipped.
// A numeric value differing from the feature length is reported and ends the
// function; a value that fails to parse is ignored by the empty catch blocks.
4554  if ((*it)->IsSetQual() && NStr::EqualNocase ((*it)->GetQual(), "estimated_length")
4555  && (*it)->IsSetVal() && !NStr::EqualNocase ((*it)->GetVal(), "unknown")) {
4556  try {
4557  int estimated_length = NStr::StringToInt ((*it)->GetVal());
4558  if (estimated_length != loc_len) {
4560  "Gap feature estimated_length " + NStr::IntToString (estimated_length)
4561  + " does not match " + NStr::IntToString (loc_len) + " feature length");
4562  return;
4563  }
4564  } catch (const CException& ) {
4565  } catch (const std::exception& ) {
4566  }
4567  }
4568  }
4569  try {
// NOTE(review): `s_data` is not used in any line visible in this listing.
4570  string s_data = GetSequenceStringFromLoc(m_Feat.GetLocation(), m_Scope);
4572  if ( !vec.empty() ) {
4573  string vec_data;
4574  vec.GetSeqData(0, vec.size(), vec_data);
// Classify every character: an 'N' inside a gap counts as a gap position, any
// other 'N' as an N, and anything else except '-' as a real base.
4575  int num_n = 0;
4576  int num_real = 0;
4577  unsigned int num_gap = 0;
4578  int pos = 0;
4579  string::iterator it = vec_data.begin();
4580  while (it != vec_data.end()) {
4581  if (*it == 'N') {
4582  if (vec.IsInGap(pos)) {
4583  // gap not N
4584  num_gap++;
4585  } else {
4586  num_n++;
4587  }
4588  } else if (*it != '-') {
4589  num_real++;
4590  }
4591  ++it;
4592  ++pos;
4593  }
// At most one message is produced, in this order of precedence: real bases
// together with Ns, real bases only, Ns only, and finally a gap count that
// differs from the feature length.
4594  if (num_real > 0 && num_n > 0) {
4596  "Gap feature over " + NStr::IntToString (num_real)
4597  + " real bases and " + NStr::IntToString (num_n)
4598  + " Ns");
4599  } else if (num_real > 0) {
4601  "Gap feature over " + NStr::IntToString (num_real)
4602  + " real bases");
4603  } else if (num_n > 0) {
4605  "Gap feature over " + NStr::IntToString (num_n)
4606  + " Ns");
4607  } else if (num_gap != GetLength (m_Feat.GetLocation(), &m_Scope)) {
4609  "Gap feature estimated_length " + NStr::IntToString (loc_len)
4610  + " does not match " + NStr::IntToString (num_gap)
4611  + " gap characters");
4612  }
4613  }
4614 
// Failures while reading the sequence are deliberately ignored.
4615  } catch (const CException& ) {
4616  } catch (const std::exception& ) {
4617  }
4618 }
4619 
4620 
4622 {
4625 
4626  const string& key = m_Feat.GetData().GetImp().GetKey();
4627  if (NStr::IsBlank(key)) {
4629  "NULL feature key");
4630  return;
4631  }
4632 
4633  if (subtype == CSeqFeatData::eSubtype_imp || subtype == CSeqFeatData::eSubtype_bad) {
4634  if (NStr::Equal(key, "mRNA")) {
4635  subtype = CSeqFeatData::eSubtype_mRNA;
4636  } else if (NStr::Equal(key, "tRNA")) {
4637  subtype = CSeqFeatData::eSubtype_tRNA;
4638  } else if (NStr::Equal(key, "tRNA")) {
4639  subtype = CSeqFeatData::eSubtype_tRNA;
4640  } else if (NStr::Equal(key, "rRNA")) {
4641  subtype = CSeqFeatData::eSubtype_rRNA;
4642  } else if (NStr::Equal(key, "snRNA")) {
4643  subtype = CSeqFeatData::eSubtype_snRNA;
4644  } else if (NStr::Equal(key, "scRNA")) {
4645  subtype = CSeqFeatData::eSubtype_scRNA;
4646  } else if (NStr::Equal(key, "snoRNA")) {
4648  } else if (NStr::Equal(key, "misc_RNA")) {
4650  } else if (NStr::Equal(key, "precursor_RNA")) {
4652  } else if (NStr::EqualNocase (key, "mat_peptide")) {
4654  } else if (NStr::EqualNocase (key, "propeptide")) {
4656  } else if (NStr::EqualNocase (key, "sig_peptide")) {
4658  } else if (NStr::EqualNocase (key, "transit_peptide")) {
4660  } else if (NStr::EqualNocase (key, "preprotein")
4661  || NStr::EqualNocase(key, "proprotein")) {
4663  } else if (NStr::Eq