NCBI C++ ToolKit
cdregion_validator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cdregion_validator.cpp 101247 2023-11-20 15:20:49Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  * validation of Seq_feat
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
40 
41 #include <serial/serialbase.hpp>
42 
43 #include <objmgr/bioseq_handle.hpp>
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objmgr/seq_vector.hpp>
47 #include <objmgr/scope.hpp>
48 #include <objmgr/util/sequence.hpp>
49 #include <objmgr/util/feature.hpp>
50 
59 
60 #include <objects/seq/MolInfo.hpp>
61 #include <objects/seq/Bioseq.hpp>
63 
65 
66 #include <string>
67 
68 
71 BEGIN_SCOPE(validator)
72 using namespace sequence;
73 
74 
76 CSingleFeatValidator(feat, scope, imp) {
78  if (m_Gene) {
80  } else {
81  m_GeneIsPseudo = false;
82  }
83 }
84 
85 
87 {
88  if (!m_Feat.IsSetComment()) {
89  return;
90  }
92  const string& comment = m_Feat.GetComment();
93  if (NStr::Find(comment, "ambiguity in stop codon") != NPOS
96  if (stop_codon_loc) {
97  TSeqPos len = sequence::GetLength(*stop_codon_loc, &m_Scope);
98  CSeqVector vec(*stop_codon_loc, m_Scope, CBioseq_Handle::eCoding_Iupac);
99  string seq_string;
100  vec.GetSeqData(0, len - 1, seq_string);
101  bool found_ambig = false;
102  string::iterator it = seq_string.begin();
103  while (it != seq_string.end() && !found_ambig) {
104  if (*it != 'A' && *it != 'T' && *it != 'C' && *it != 'G' && *it != 'U') {
105  found_ambig = true;
106  }
107  ++it;
108  }
109  if (!found_ambig) {
111  "Feature comment indicates ambiguity in stop codon "
112  "but no ambiguities are present in stop codon.", m_Feat);
113  }
114  }
115  }
116 
117  // look for EC number in comment
119  // suppress if protein has EC numbers
120  bool suppress = false;
121  if (m_ProductBioseq) {
123  if (prot_feat && prot_feat->GetData().GetProt().IsSetEc()) {
124  suppress = true;
125  }
126  }
127  if (!suppress) {
129  "Apparent EC number in CDS comment");
130  }
131  }
132 
133 }
134 
135 
137 {
140  NStr::FindNoCase(text, "RNA editing") != NPOS) {
142  "CDS has both RNA editing /exception and /transl_except qualifiers");
143  }
144 }
145 
146 
// Loop helper: expands to an ITERATE over the CBioseq_Handle::TId container
// returned by Var.GetId(), using Itr as the name of the loop iterator.
// Var is expected to be an expression on which .GetId() can be called.
147 #define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var) \
148 ITERATE (CBioseq_Handle::TId, Itr, Var.GetId())
149 
// Classifies the Seq-ids of a bioseq handle into accession-type flags.
//
// All four output flags are reset to false first. When the handle is valid,
// each Seq-id of type Embl, Ddbj, Other or Genbank is examined; other id
// types are ignored. Flags are OR-ed in, so a flag ends up true if any id
// on the bioseq matches.
//
// Visible assignments:
//   is_nt - set when the accession info equals CSeq_id::eAcc_refseq_contig
//   is_ng - set when the accession info equals CSeq_id::eAcc_refseq_genomic
// NOTE(review): the loop header, the definition of `info`, and the lines that
// presumably set is_nw / is_nc are not visible in this listing - confirm
// against the real source before relying on this description.
150 static
151 void s_LocIdType(CBioseq_Handle bsh, bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
152 {
153  is_nt = is_ng = is_nw = is_nc = false;
154  if (bsh) {
156  CSeq_id_Handle sid = *it;
157  switch (sid.Which()) {
158  case NCBI_SEQID(Embl):
159  case NCBI_SEQID(Ddbj):
160  case NCBI_SEQID(Other):
161  case NCBI_SEQID(Genbank):
162  {
// `|=` accumulates: once a flag is set by one id it stays set.
164  is_nt |= (info == CSeq_id::eAcc_refseq_contig);
165  is_ng |= (info == CSeq_id::eAcc_refseq_genomic);
168  break;
169  }
170  default:
// Any other Seq-id type contributes nothing to the flags.
171  break;
172  }
173  }
174  }
175 }
176 
177 static
178 void s_LocIdType(const CSeq_loc& loc, CScope& scope, const CSeq_entry& tse,
179  bool& is_nt, bool& is_ng, bool& is_nw, bool& is_nc)
180 {
181  is_nt = is_ng = is_nw = is_nc = false;
182  if (!IsOneBioseq(loc, &scope)) {
183  return;
184  }
185  const CSeq_id& id = GetId(loc, &scope);
186  try {
187  CBioseq_Handle bsh = scope.GetBioseqHandleFromTSE(id, tse);
188  if (bsh) {
189  s_LocIdType(bsh, is_nt, is_ng, is_nw, is_nc);
190  }
191  } catch (CException&) {
192  }
193 }
194 
195 
197 {
198  CCDSTranslationProblems problems;
199  bool is_nt, is_ng, is_nw, is_nc;
200  s_LocIdType(m_LocationBioseq, is_nt, is_ng, is_nw, is_nc);
201 
203  m_Feat,
209  m_Imp.IsStandaloneAnnot() ? false : m_Imp.GetTSE().IsSeq(),
210  m_Imp.IsGpipe(),
211  m_Imp.IsGenomic(),
212  m_Imp.IsRefSeq(),
213  (is_nt || is_ng || is_nw),
214  is_nc,
215  (m_Imp.IsRefSeq() || m_Imp.IsGED() || m_Imp.IsTPE()),
216  &m_Scope);
217  if (!problems.UnableToTranslate() && !problems.HasException()) {
219  }
220 
224  }
225  }
226 
227  x_ReportTranslationProblems(problems);
228 }
229 
230 
// Looks up a genetic code id by name.
//
// Returns the GetId() value of the first iterated entry whose GetName()
// equals code_name, compared case-insensitively via NStr::EqualNocase.
// Returns 255 when no entry matches.
// NOTE(review): the lines that obtain the collection and open the loop are
// not visible in this listing - presumably the toolkit's genetic code
// table; confirm against the real source.
231 int GetGcodeForName(const string& code_name)
232 {
235  if (NStr::EqualNocase((*it)->GetName(), code_name)) {
236  return (*it)->GetId();
237  }
238  }
// Fallback value used when the name is not found.
239  return 255;
240 }
241 
242 
244 {
245  int gc = 0;
246  if (cdr.IsSetCode()) {
248  if ((*it)->IsId()) {
249  gc = (*it)->GetId();
250  } else if ((*it)->IsName()) {
251  gc = GetGcodeForName((*it)->GetName());
252  }
253  if (gc != 0) break;
254  }
255  }
256  return gc;
257 }
258 
259 
// Builds the text of the "internal stops" validation message.
//
// internal_stop_count - number of internal stop codons to report
// bad_start           - when true, the message also mentions the start codon
// transl_start        - first character of the translation; '-' is described
//                       as an "illegal" start codon, anything else as
//                       "ambiguous" (only consulted when bad_start is true)
//
// Returns "<N> internal stops" (optionally with the start codon remark)
// followed by ". Genetic code [<gc>]".
// NOTE(review): the line that computes `gc` is not visible in this listing;
// presumably it is derived from `feat` - confirm against the real source.
260 string GetInternalStopErrorMessage(const CSeq_feat& feat, size_t internal_stop_count, bool bad_start, char transl_start)
261 {
263  string gccode = NStr::IntToString(gc);
264 
265  string error_message;
266  if (bad_start) {
267  bool got_dash = transl_start == '-';
268  string codon_desc = got_dash ? "illegal" : "ambiguous";
269  error_message = NStr::SizetToString(internal_stop_count) +
270  " internal stops (and " + codon_desc + " start codon). Genetic code [" + gccode + "]";
271  } else {
272  error_message = NStr::SizetToString(internal_stop_count) +
273  " internal stops. Genetic code [" + gccode + "]";
274  }
275  return error_message;
276 }
277 
278 
// Builds the text of the "internal stops" validation message directly from
// a translated protein string.
//
// The number of internal stops comes from CountInternalStopCodons(transl_prot).
// When HasBadStartCodon(feat.GetLocation(), transl_prot) is true, the message
// also mentions the start codon: "illegal" if the first translated character
// is '-', otherwise "ambiguous".
//
// Returns "<N> internal stops" (optionally with the start codon remark)
// followed by ". Genetic code [<gc>]".
// NOTE(review): the line that computes `gc` is not visible in this listing;
// presumably it is derived from `feat` - confirm against the real source.
279 string GetInternalStopErrorMessage(const CSeq_feat& feat, const string& transl_prot)
280 {
281  size_t internal_stop_count = CountInternalStopCodons(transl_prot);
282 
284  string gccode = NStr::IntToString(gc);
285 
286  string error_message;
287  if (HasBadStartCodon(feat.GetLocation(), transl_prot)) {
288  bool got_dash = transl_prot[0] == '-';
289  string codon_desc = got_dash ? "illegal" : "ambiguous";
290  error_message = NStr::SizetToString(internal_stop_count) +
291  " internal stops (and " + codon_desc + " start codon). Genetic code [" + gccode + "]";
292  } else {
293  error_message = NStr::SizetToString(internal_stop_count) +
294  " internal stops. Genetic code [" + gccode + "]";
295  }
296  return error_message;
297 }
298 
299 
// Builds the text of the "bad start codon" validation message.
//
// first_char          - first character of the translation; '-' selects the
//                       wording "Illegal" / "Probably", any other value
//                       selects "Ambiguous" / "Possibly"
// internal_stop_count - when greater than zero, the count is included and the
//                       message suggests a wrong genetic code; when zero, the
//                       message suggests a wrong genetic code or that the
//                       protein should be partial
//
// The genetic code number is embedded in square brackets in both variants.
// NOTE(review): the line that computes `gc` is not visible in this listing;
// presumably it is derived from `feat` - confirm against the real source.
300 string GetStartCodonErrorMessage(const CSeq_feat& feat, const char first_char, size_t internal_stop_count)
301 {
302  bool got_dash = first_char == '-';
303  string codon_desc = got_dash ? "Illegal" : "Ambiguous";
304  string p_word = got_dash ? "Probably" : "Possibly";
305 
307  string gccode = NStr::IntToString(gc);
308 
309  string error_message;
310 
311  if (internal_stop_count > 0) {
312  error_message = codon_desc + " start codon (and " +
313  NStr::SizetToString(internal_stop_count) +
314  " internal stops). " + p_word + " wrong genetic code [" +
315  gccode + "]";
316  } else {
317  error_message = codon_desc + " start codon used. Wrong genetic code [" +
318  gccode + "] or protein should be partial";
319  }
320  return error_message;
321 }
322 
323 
324 string GetStartCodonErrorMessage(const CSeq_feat& feat, const string& transl_prot)
325 {
326  size_t internal_stop_count = CountInternalStopCodons(transl_prot);
327 
328  return GetStartCodonErrorMessage(feat, transl_prot[0], internal_stop_count);
329 }
330 
331 
333 {
334  size_t problem_flags = problems.GetTranslationProblemFlags();
336  string label;
337  const CSeq_id* protid = &GetId(m_Feat.GetProduct(), &m_Scope);
338  protid->GetLabel(&label);
339  EDiagSev sev = eDiag_Error;
340  if (protid->IsGeneral() && protid->GetGeneral().IsSetDb() &&
341  (NStr::EqualNocase(protid->GetGeneral().GetDb(), "ti") ||
342  NStr::EqualNocase(protid->GetGeneral().GetDb(), "SRA"))) {
343  sev = eDiag_Warning;
344  }
346  "Unable to fetch CDS product '" + label + "'");
347  }
348 
349  if (!problems.HasException() && (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_NoProtein)) {
350  bool is_nt, is_ng, is_nw, is_nc;
351  s_LocIdType(m_Feat.GetLocation(), m_Scope, m_Imp.GetTSE(), is_nt, is_ng, is_nw, is_nc);
352  EDiagSev sev = eDiag_Error;
354  sev = eDiag_Warning;
355  }
356  if (is_nc) {
357  sev = eDiag_Warning;
358  }
360  "No protein Bioseq given");
361  }
362 
363  bool unclassified_except = false;
364  if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "unclassified translation discrepancy") != NPOS) {
365  unclassified_except = true;
366  }
367 
369 
370  if (!problems.HasException() && problems.HasUnparsedTranslExcept()) {
371  if (problems.GetInternalStopCodons() == 0 && problems.GetTranslationMismatches().size() == 0) {
373  "Unparsed transl_except qual (but protein is okay). Skipped");
374  } else {
376  "Unparsed transl_except qual. Skipped");
377  }
378  }
379 
380 
381  for (size_t i = 0; i < problems.GetNumNonsenseIntrons(); i++) {
382  EDiagSev sev = eDiag_Critical;
383  if (m_Imp.IsEmbl() || m_Imp.IsDdbj()) {
384  sev = eDiag_Error;
385  }
386  PostErr(sev, eErr_SEQ_FEAT_IntronIsStopCodon, "Triplet intron encodes stop codon");
387  }
388 
390  PostErr(eDiag_Info, eErr_SEQ_FEAT_CDShasTooManyXs, "CDS translation consists of more than 50% X residues");
391  }
392 
393  if (problems.UnableToTranslate()) {
394  if (!problems.HasException()) {
396  "Unable to translate");
397  }
398  }
399 
400  if (!problems.UnableToTranslate() && !problems.AltStart() &&
402  NStr::Find(m_Feat.GetExcept_text(), "alternative start codon") != string::npos &&
404 
406  "Unnecessary alternative start codon exception");
407  }
408 
409  if ((!problems.HasException() || unclassified_except) && problems.GetInternalStopCodons() > 0) {
410  if (unclassified_except && m_Imp.IsGpipe()) {
411  // suppress if gpipe genomic
412  } else {
413  EDiagSev stop_sev = unclassified_except ? eDiag_Warning : eDiag_Error;
414  if (!m_Imp.IsRefSeq() && m_Imp.IsGI() && m_Imp.IsGED()) {
415  stop_sev = eDiag_Critical;
416  }
417 
421  problems.GetTranslStartCharacter()));
422  }
423  }
424 
425  if (!problems.HasException()) {
426 
427  if (!unclassified_except && (problem_flags & CCDSTranslationProblems::eCDSTranslationProblem_BadStart)) {
428  string start_err_msg = GetStartCodonErrorMessage(m_Feat, problems.GetTranslStartCharacter(), problems.GetInternalStopCodons());
430  start_err_msg);
431  }
432 
435  "Suspicious CDS location - reading frame > 1 but not 5' partial");
436  }
437 
439  EDiagSev sev = eDiag_Warning;
441  {
442  sev = eDiag_Error;
443  }
445  "Suspicious CDS location - reading frame > 1 and not at consensus splice site");
446  }
447 
450  "Missing stop codon");
451  }
454  "Got stop codon, but 3'end is labeled partial");
455  }
458  "Start of location should probably be partial");
459  }
460  if (problems.GetRaggedLength() > 0) {
462  "Coding region extends " + NStr::IntToString(problems.GetRaggedLength()) +
463  " base(s) past stop codon");
464  }
465  }
466 
467  if (!problems.UnableToTranslate() && problems.GetProtLen() > 1.2 * problems.GetTransLen()) {
468  if ((!m_Feat.IsSetExcept_text()) || NStr::Find(m_Feat.GetExcept_text(), "annotated by transcript or proteomic data") == string::npos) {
469  string msg = "Protein product length [" + NStr::SizetToString(problems.GetProtLen()) +
470  "] is more than 120% of the ";
471  if (m_ProductIsFar) {
472  msg += "(far) ";
473  }
474  msg += "translation length [" + NStr::SizetToString(problems.GetTransLen()) + "]";
476  }
477  }
478 
479 
480  bool rna_editing = false;
481  if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "RNA editing") != NPOS) {
482  rna_editing = true;
483  }
484  if (problems.GetProtLen() != problems.GetTransLen() &&
485  (!problems.HasException() ||
486  (rna_editing &&
487  (problems.GetProtLen() < problems.GetTransLen() - 1 || problems.GetProtLen() > problems.GetTransLen())))) {
488  string msg = "Given protein length [" + NStr::SizetToString(problems.GetProtLen()) +
489  "] does not match ";
490  if (m_ProductIsFar) {
491  msg += "(far) ";
492  }
493  msg += "translation length [" +
494  NStr::SizetToString(problems.GetTransLen()) + "]";
495 
496  if (rna_editing) {
497  msg += " (RNA editing present)";
498  }
499  PostErr(rna_editing ? eDiag_Warning : eDiag_Error,
501  }
502 
503  bool mismatch_except = false;
504  if (m_Feat.IsSetExcept_text() && NStr::FindNoCase(m_Feat.GetExcept_text(), "mismatches in translation") != NPOS) {
505  mismatch_except = true;
506  }
507 
508  if (!problems.HasException() && !mismatch_except) {
510  }
511 
512  if (problems.GetTranslTerminalX() != problems.GetProdTerminalX()) {
514  "Terminal X count for CDS translation (" + NStr::SizetToString(problems.GetTranslTerminalX())
515  + ") and protein product sequence (" + NStr::SizetToString(problems.GetProdTerminalX())
516  + ") are not equal");
517  }
518 
521  "End of location should probably be partial");
522  }
525  "This SeqFeat should not be partial");
526  }
527 
530  "CDS has exception but passes translation test");
531  }
532 
535  "CDS has unclassified exception but only difference is "
536  + NStr::SizetToString(problems.GetTranslationMismatches().size()) + " mismatches out of "
537  + NStr::SizetToString(problems.GetProtLen()) + " residues");
538  }
539 
542  "CDS has unnecessary translated product replaced exception");
543  }
544 
545 }
546 
547 
549 {
550  string result;
551 
552  CSeq_point pnt;
553  pnt.SetPoint(pos);
555 
556  try {
557  pnt.SetId().Assign(GetId(m_Feat.GetProduct(), &m_Scope));
558  } catch (const CObjmgrUtilException&) {}
559 
560  CSeq_loc tmp;
561  tmp.SetPnt(pnt);
563 
565 
566  return result;
567 }
568 
569 
571 {
572  string nuclocstr;
573 
574  size_t num_mismatches = mismatches.size();
575 
576  if (num_mismatches > 10) {
577  // report total number of mismatches and the details of the
578  // first and last.
579  nuclocstr = MapToNTCoords(mismatches.front().pos);
580  string msg =
581  NStr::SizetToString(mismatches.size()) + " mismatches found. " +
582  "First mismatch at " + NStr::IntToString(mismatches.front().pos + 1) +
583  ", residue in protein [";
584  msg += mismatches.front().prot_res;
585  msg += "] != translation [";
586  msg += mismatches.front().transl_res;
587  msg += "]";
588  if (!nuclocstr.empty()) {
589  msg += " at " + nuclocstr;
590  }
591  nuclocstr = MapToNTCoords(mismatches.back().pos);
592  msg +=
593  ". Last mismatch at " + NStr::IntToString(mismatches.back().pos + 1) +
594  ", residue in protein [";
595  msg += mismatches.back().prot_res;
596  msg += "] != translation [";
597  msg += mismatches.back().transl_res;
598  msg += "]";
599  if (!nuclocstr.empty()) {
600  msg += " at " + nuclocstr;
601  }
602  int gc = 0;
604  // We assume that the id is set for all Genetic_code
605  gc = m_Feat.GetData().GetCdregion().GetCode().GetId();
606  }
607  string gccode = NStr::IntToString(gc);
608 
609  msg += ". Genetic code [" + gccode + "]";
611  } else {
612  // report individual mismatches
613  for (size_t i = 0; i < mismatches.size(); ++i) {
614  nuclocstr = MapToNTCoords(mismatches[i].pos);
615  if (mismatches[i].pos == 0 && mismatches[i].transl_res == '-') {
616  // skip - dash is expected to differ
617  num_mismatches--;
618  } else {
619  EDiagSev sev = eDiag_Error;
620  if (mismatches[i].prot_res == 'X' &&
621  (mismatches[i].transl_res == 'B' || mismatches[i].transl_res == 'Z' || mismatches[i].transl_res == 'J')) {
622  sev = eDiag_Warning;
623  }
624  string msg;
625  if (m_ProductIsFar) {
626  msg += "(far) ";
627  }
628  msg += "Residue " + NStr::IntToString(mismatches[i].pos + 1) +
629  " in protein [";
630  msg += mismatches[i].prot_res;
631  msg += "] != translation [";
632  msg += mismatches[i].transl_res;
633  msg += "]";
634  if (!nuclocstr.empty()) {
635  msg += " at " + nuclocstr;
636  }
638  }
639  }
640  }
641 }
642 
643 
645 {
646  for (auto it = problems.begin(); it != problems.end(); it++) {
647  string msg;
648  switch (it->problem) {
650  if (!has_exception) {
652  "transl_except qual out of frame.");
653  }
654  break;
656  msg = "Suspicious transl_except ";
657  msg += it->ex;
658  msg += " at first codon of complete CDS";
660  break;
662  msg = "Unnecessary transl_except ";
663  msg += it->ex;
664  msg += " at position ";
665  msg += NStr::SizetToString(it->prot_pos + 1);
667  msg);
668  break;
670  msg = "Unexpected transl_except ";
671  msg += it->ex;
672  msg += +" at position " + NStr::SizetToString(it->prot_pos + 1)
673  + " just past end of protein";
674 
676  msg);
677  break;
678  }
679  }
680 }
681 
682 
684 {
685  const CCdregion& cds = m_Feat.GetData().GetCdregion();
686  const CSeq_loc& feat_loc = m_Feat.GetLocation();
687  const CCode_break* prev_cbr = nullptr;
688 
690  const CCode_break& cbr = **it;
691  const CSeq_loc& cbr_loc = cbr.GetLoc();
692  ECompare comp = Compare(cbr_loc, feat_loc, &m_Scope, fCompareOverlapping);
693  if ( ((comp != eContained) && (comp != eSame)) || cbr_loc.IsNull() || cbr_loc.IsEmpty()) {
695  "Code-break location not in coding region");
696  } else if (m_Feat.IsSetProduct()) {
697  if (cbr_loc.GetStop(eExtreme_Biological) == feat_loc.GetStop(eExtreme_Biological)) {
698  // terminal exception - don't bother checking, can't be mapped
699  } else {
700  if (SeqLocCheck(cbr_loc, &m_Scope) == eSeqLocCheck_error) {
701  string lbl = GetValidatorLocationLabel(cbr_loc, m_Scope);
703  "Code-break: SeqLoc [" + lbl + "] out of range");
704  } else {
705  int frame = 0;
706  CRef<CSeq_loc> p_loc = SourceToProduct(m_Feat, cbr_loc, fS2P_AllowTer, &m_Scope, &frame);
707  if (!p_loc || p_loc->IsNull() || frame != 1) {
709  "Code-break location not in coding region - may be frame problem");
710  }
711  }
712  }
713  }
714  if (cbr_loc.IsPartialStart(eExtreme_Biological) ||
715  cbr_loc.IsPartialStop(eExtreme_Biological)) {
717  "Translation exception locations should not be partial");
718  }
719  if ( prev_cbr ) {
720  if ( Compare(cbr_loc, prev_cbr->GetLoc(), &m_Scope, fCompareOverlapping) == eSame ) {
721  string msg = "Multiple code-breaks at same location ";
722  string str = GetValidatorLocationLabel (cbr_loc, m_Scope);
723  if ( !str.empty() ) {
724  msg += "[" + str + "]";
725  }
727  msg);
728  }
729  }
730  prev_cbr = &cbr;
731  }
732 }
733 
734 
736 {
738 
739  bool feat_is_pseudo = s_IsPseudo(m_Feat);
740  bool pseudo = feat_is_pseudo || m_GeneIsPseudo;
741 
742  x_ValidateQuals();
744 
745  const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
746  if (cdregion.IsSetOrf() && cdregion.GetOrf() &&
747  m_Feat.IsSetProduct()) {
749  "An ORF coding region should not have a product");
750  }
751 
752  if (pseudo) {
753  if (m_Feat.IsSetProduct()) {
754  if (feat_is_pseudo) {
756  "A pseudo coding region should not have a product");
757  } else if (m_GeneIsPseudo) {
759  "A coding region overlapped by a pseudogene should not have a product");
760  } else {
762  "A pseudo coding region should not have a product");
763  }
764  }
765  } else {
769  }
770 
775 
776  if (x_IsProductMisplaced()) {
777  if (m_Imp.IsSmallGenomeSet()) {
779  "Protein product not packaged in nuc-prot set with nucleotide in small genome set");
780  } else {
782  "Protein product not packaged in nuc-prot set with nucleotide");
783  }
784  }
785 
786  bool conflict = cdregion.IsSetConflict() && cdregion.GetConflict();
787  if ( !pseudo && !conflict ) {
788  x_ValidateTrans();
789  ValidateSplice(false, false);
790  }
791 
792  if (conflict) {
794  }
795 
798 
801 }
802 
803 
805 {
807  const CGb_qual& qual = **it;
808  if (qual.CanGetQual()) {
809  const string& key = qual.GetQual();
810  if (NStr::EqualNocase(key, "exception")) {
811  if (!m_Feat.IsSetExcept()) {
813  "Exception flag should be set in coding region");
814  }
815  } else if (NStr::EqualNocase(key, "codon")) {
817  "Use the proper genetic code, if available, "
818  "or set transl_excepts on specific codons");
819  } else if (NStr::EqualNocase(key, "protein_id")) {
821  "protein_id should not be a gbqual on a CDS feature");
822  } else if (NStr::EqualNocase(key, "gene_synonym")) {
824  "gene_synonym should not be a gbqual on a CDS feature");
825  } else if (NStr::EqualNocase(key, "transcript_id")) {
827  "transcript_id should not be a gbqual on a CDS feature");
828  } else if (NStr::EqualNocase(key, "codon_start")) {
829  const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
830  if (cdregion.IsSetFrame() && cdregion.GetFrame() != CCdregion::eFrame_not_set) {
832  "conflicting codon_start values");
833  } else {
835  "codon_start value should be 1, 2, or 3");
836  }
837  }
838  }
839  }
840 }
841 
842 
844 {
845  if (!m_GeneIsPseudo && !s_IsPseudo(m_Feat)) {
846  return true;
847  } else {
848  return false;
849  }
850 }
851 
852 
// Display names for plastid-type genomes, used when composing the genetic
// code conflict message. The table is indexed by the numeric BioSource
// genome value; slots that do not name a plastid hold an empty string.
// NOTE(review): the index-to-genome correspondence is presumed to follow
// the CBioSource genome enumeration - confirm if that enumeration changes.
const string s_PlastidTxt[20] = {
    /*  0 */ "",            /*  1 */ "",
    /*  2 */ "chloroplast", /*  3 */ "chromoplast",
    /*  4 */ "",            /*  5 */ "",
    /*  6 */ "plastid",     /*  7 */ "",
    /*  8 */ "",            /*  9 */ "",
    /* 10 */ "",            /* 11 */ "",
    /* 12 */ "cyanelle",    /* 13 */ "",
    /* 14 */ "",            /* 15 */ "",
    /* 16 */ "apicoplast",  /* 17 */ "leucoplast",
    /* 18 */ "proplastid",  /* 19 */ "",
};
875 
876 
878 {
879  if ( genome == CBioSource::eGenome_chloroplast ||
881  genome == CBioSource::eGenome_plastid ||
882  genome == CBioSource::eGenome_cyanelle ||
883  genome == CBioSource::eGenome_apicoplast ||
884  genome == CBioSource::eGenome_leucoplast ||
885  genome == CBioSource::eGenome_proplastid ||
887  return true;
888  }
889 
890  return false;
891 }
892 
893 
894 static bool IsGeneticCodeValid(int gcode)
895 {
896  bool ret = false;
897  if (gcode > 0) {
898 
899  try {
900  const CTrans_table& tbl = CGen_code_table::GetTransTable(gcode);
901  (void)tbl; // suppress unused-variable warning
902  ret = true;
903  }
904  catch (CException&) {
905  }
906  }
907 
908  return ret;
909 }
910 
911 
// Determines which genetic code applies to a BioSource.
//
// Returns 0 when the source has no Org / OrgName set, when the selected
// code field is not set, or when an exception is thrown while reading the
// source (both CException and std::exception are swallowed).
//
// Otherwise the result depends on `genome`:
//   - one group of genome values uses the OrgName Mgcode, if set;
//   - another group uses the OrgName Pgcode if set and non-zero, and
//     falls back to code 11 otherwise;
//   - every other genome value uses the OrgName Gcode, if set.
// NOTE(review): the definition of `genome` and the `case` labels of both
// groups are not visible in this listing - presumably `genome` is read from
// the BioSource and the groups are the mitochondrial-type and plastid-type
// genomes respectively; confirm against the real source.
912 static int s_GetStrictGenCode(const CBioSource& src)
913 {
914  int gencode = 0;
915 
916  try {
918 
919  if ( src.IsSetOrg() && src.GetOrg().IsSetOrgname() ) {
920  const COrgName& orn = src.GetOrg().GetOrgname();
921 
922  switch ( genome ) {
926  // bacteria and plant organelle code
927  if (orn.IsSetMgcode()) {
928  gencode = orn.GetMgcode();
929  }
930  break;
938  if (orn.IsSetPgcode() && orn.GetPgcode() != 0) {
939  gencode = orn.GetPgcode();
940  } else {
941  // bacteria and plant plastids are code 11.
942  gencode = 11;
943  }
944  break;
945  default:
946  if (orn.IsSetGcode()) {
947  gencode = orn.GetGcode();
948  }
949  break;
950  }
951  }
// Any failure while reading the source leaves gencode at its current value.
952  } catch (const CException& ) {
953  } catch (const std::exception& ) {
954  }
955  return gencode;
956 }
957 
958 
960 {
961  if (!m_LocationBioseq) {
962  return;
963  }
964  int cdsgencode = 0;
965 
966  const CCdregion& cdregion = m_Feat.GetData().GetCdregion();
967 
968  if (cdregion.CanGetCode()) {
969  cdsgencode = cdregion.GetCode().GetId();
970 
971  if (!IsGeneticCodeValid(cdsgencode)) {
973  "A coding region contains invalid genetic code [" + NStr::IntToString(cdsgencode) + "]");
974  }
975  }
976 
978  if (diter) {
979  const CBioSource& src = diter->GetSource();
980  int biopgencode = s_GetStrictGenCode(src);
981 
982  if (biopgencode != cdsgencode
983  && (!m_Feat.IsSetExcept()
985  || NStr::Find(m_Feat.GetExcept_text(), "genetic code exception") == string::npos)) {
986  int genome = 0;
987 
988  if (src.CanGetGenome()) {
989  genome = src.GetGenome();
990  }
991 
992  if (IsPlastid(genome)) {
994  "Genetic code conflict between CDS (code " +
995  NStr::IntToString(cdsgencode) +
996  ") and BioSource.genome biological context (" +
997  s_PlastidTxt[genome] + ") (uses code 11)");
998  } else {
1000  "Genetic code conflict between CDS (code " +
1001  NStr::IntToString(cdsgencode) +
1002  ") and BioSource (code " +
1003  NStr::IntToString(biopgencode) + ")");
1004  }
1005  }
1006  }
1007 }
1008 
1009 
1011 {
1013 
1014  // for coding regions, internal exons should not be 15 or less bp long
1015  int num_short_exons = 0;
1016  string message;
1017  CSeq_loc_CI it(m_Feat.GetLocation());
1018  if (it) {
1019  // note - do not want to warn for first or last exon
1020  ++it;
1021  size_t prev_len = 16;
1022  size_t prev_start = 0;
1023  size_t prev_stop = 0;
1024  while (it) {
1025  if (prev_len <= 15) {
1026  num_short_exons++;
1027  if (!message.empty()) {
1028  message += ", ";
1029  }
1030  message += NStr::NumericToString(prev_start + 1)
1031  + "-" + NStr::NumericToString(prev_stop + 1);
1032  }
1033  prev_len = it.GetRange().GetLength();
1034  prev_start = it.GetRange().GetFrom();
1035  prev_stop = it.GetRange().GetTo();
1036  ++it;
1037  }
1038  }
1039  if (num_short_exons > 1) {
1041  "Coding region has multiple internal exons that are too short at positions " + message);
1042  } else if (num_short_exons == 1) {
1044  "Internal coding region exon is too short at position " + message);
1045  }
1046 }
1047 
1048 
1050 {
1051  if (x_HasGoodParent()) {
1052  return;
1053  }
1054 
1055  const CSeq_loc& loc = m_Feat.GetLocation();
1056 
1058  loc,
1061  m_Scope);
1062  if (!mrna) {
1063  return;
1064  }
1065 
1066  mrna = GetBestOverlappingFeat(
1067  loc,
1070  m_Scope);
1071  if (mrna) {
1072  return;
1073  }
1074 
1075  mrna = GetBestOverlappingFeat(
1076  loc,
1079  m_Scope);
1080  if (!mrna) {
1081  return;
1082  }
1083 
1084  bool pseudo = s_IsPseudo(m_Feat) || m_GeneIsPseudo;
1085 
1087  if (pseudo) {
1089  }
1090 
1091  mrna = GetBestOverlappingFeat(
1092  loc,
1095  m_Scope);
1096 
1097  EDiagSev sev = eDiag_Warning;
1098  if (pseudo) {
1099  sev = eDiag_Info;
1100  }
1101  if (mrna) {
1102  // ribosomal slippage exception suppresses CDSmRNArange warning
1103  bool supress = false;
1104 
1105  if (m_Feat.CanGetExcept_text()) {
1107  if (NStr::FindNoCase(text, "ribosomal slippage") != NPOS
1108  || NStr::FindNoCase(text, "trans-splicing") != NPOS) {
1109  supress = true;
1110  }
1111  }
1112  if (!supress) {
1113  PostErr(sev, err_type,
1114  "mRNA contains CDS but internal intron-exon boundaries "
1115  "do not match");
1116  }
1117  } else {
1118  PostErr(sev, err_type,
1119  "mRNA overlaps or contains CDS but does not completely "
1120  "contain intervals");
1121  }
1122 }
1123 
1124 
1126 {
1127 
1128  CSeq_feat_Handle fh;
1129  try {
1130  // will fail if location is bad
1132  } catch (CException&) {
1133  return false;
1134  }
1135 
1136  static const list<CSeqFeatData::ESubtype> parent_types = {
1141  };
1142 
1143  CRef<feature::CFeatTree> feat_tree;
1144  if (m_Imp.IsHugeFileMode()) {
1145  feat_tree = Ref(new feature::CFeatTree());
1146  CMappedFeat mappedFeat(fh);
1147  for (auto parent_type : parent_types) {
1148  feat_tree->AddFeaturesFor(mappedFeat, parent_type);
1149  }
1150  }
1151  else feat_tree = m_Imp.GetGeneCache().GetFeatTreeFromCache(m_Feat, m_Scope);
1152  if (!feat_tree) {
1153  return false;
1154  }
1155 
1156  for (auto parent_type : parent_types) {
1157  CMappedFeat parent = feat_tree->GetParent(fh, parent_type);
1158  if (parent) {
1160  parent.GetLocation(),
1161  &m_Scope,
1164  return true;
1165  }
1166  }
1167  }
1168  return false;
1169 }
1170 
1171 
1172 // VR-619
1173 // for an mRNA / CDS pair where both have far products
1174 // (which is only true for genomic RefSeqs with instantiated mRNA products),
1175 // please check that the pair found by CFeatTree corresponds to the nuc-prot pair in ID
1176 // (i.e. the CDS product is annotated on the mRNA product).
1178 {
1179  // if coding region doesn't have a far product, nothing to check
1180  if (!m_ProductIsFar) {
1181  return;
1182  }
1183  // no point if not far-fetching
1184  if (!m_Imp.IsRemoteFetch()) {
1185  return;
1186  }
1187  if (!m_Feat.GetData().IsCdregion() || !m_Feat.IsSetProduct()) {
1188  return;
1189  }
1190  if (!m_Imp.IsRefSeq()) {
1191  return;
1192  }
1193  const CSeq_id * cds_sid = m_Feat.GetProduct().GetId();
1194  if (!cds_sid) {
1195  return;
1196  }
1198  if (!feat_tree) {
1199  return;
1200  }
1202  if (!fh) {
1203  return;
1204  }
1205  CMappedFeat mrna = feat_tree->GetParent(fh, CSeqFeatData::eSubtype_mRNA);
1206  if (!mrna || !mrna.IsSetProduct()) {
1207  // no mRNA or no mRNA product
1208  return;
1209  }
1210  const CSeq_id * mrna_sid = mrna.GetProduct().GetId();
1211  if (!mrna_sid) {
1212  return;
1213  }
1214 
1215  if (!m_Imp.IsFarSequence(*mrna_sid)) {
1216  // mRNA product is not far
1217  return;
1218  }
1219  auto mrna_prod = m_Scope.GetBioseqHandle(*mrna_sid);
1220  if (!mrna_prod) {
1221  // can't be fetched, will be reported elsewhere
1222  return;
1223  }
1224  CSeq_entry_Handle far_mrna_nps =
1225  mrna_prod.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1226  if (!far_mrna_nps) {
1227  PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "no Far mRNA nuc-prot-set");
1228  } else {
1229  CBioseq_Handle cds_prod = m_Scope.GetBioseqHandleFromTSE(*cds_sid, far_mrna_nps);
1230  if (!cds_prod) {
1231  PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "Far CDS product and far mRNA product are not packaged together");
1232  m_Imp.PostErr(eDiag_Error, eErr_SEQ_FEAT_CDSmRNAmismatch, "Far CDS product and far mRNA product are not packaged together", *(mrna.GetSeq_feat()));
1233  }
1234  }
1235 }
1236 
1237 
1239 {
1240  try {
1241  if (!m_Feat.GetData().IsCdregion() || !m_Feat.CanGetProduct()) {
1242  return;
1243  }
1244 
1246  if (!prot) {
1247  return;
1248  }
1250  if (!nuc) {
1251  return;
1252  }
1253  // check for self-referential CDS feature
1254  if (nuc == prot) {
1255  return;
1256  }
1257 
1258  const CGene_ref* cds_ref = nullptr;
1259 
1260  // map from cds product to nucleotide
1261  const string prev = GetDiagFilter(eDiagFilter_Post);
1262  SetDiagFilter(eDiagFilter_All, "!(1305.28,31)");
1265 
1266  for (CFeat_CI it(prot, CSeqFeatData::e_Prot); it; ++it) {
1267  CSeq_feat_Handle curr = it->GetSeq_feat_Handle();
1268  CSeqFeatData::ESubtype subtype = curr.GetFeatSubtype();
1269 
1270  if (subtype != CSeqFeatData::eSubtype_preprotein &&
1275  continue;
1276  }
1277 
1278  // see if already has gene xref
1279  if (curr.GetGeneXref()) {
1280  continue;
1281  }
1282 
1283  if (! cds_ref) {
1284  // wait until first mat_peptide found to avoid expensive computation on CDS /gene qualifier
1286  if (cgene && cgene->CanGetData() && cgene->GetData().IsGene()) {
1287  const CGene_ref& cgref = cgene->GetData().GetGene();
1288  cds_ref = &cgref;
1289  } else {
1290  // if CDS does not have overlapping gene, bail out of function
1291  return;
1292  }
1293  }
1294 
1295  const CSeq_loc& loc = curr.GetLocation();
1296  // map prot location to nuc location
1297  CRef<CSeq_loc> nloc(prot_to_cds.Map(loc));
1298  if (! nloc) {
1299  continue;
1300  }
1301 
1302  const CGene_ref* pep_ref = nullptr;
1304  if (pgene && pgene->CanGetData() && pgene->GetData().IsGene()) {
1305  const CGene_ref& pgref = pgene->GetData().GetGene();
1306  pep_ref = &pgref;
1307  }
1308 
1309  if (! cds_ref || ! pep_ref) {
1310  continue;
1311  }
1312  if (cds_ref->IsSetLocus_tag() && pep_ref->IsSetLocus_tag()) {
1313  if (cds_ref->GetLocus_tag() == pep_ref->GetLocus_tag()) {
1314  continue;
1315  }
1316  } else if (cds_ref->IsSetLocus() && pep_ref->IsSetLocus()) {
1317  if (cds_ref->GetLocus() == pep_ref->GetLocus()) {
1318  continue;
1319  }
1320  }
1321 
1322  if (pgene) {
1323 
1324  const CSeq_loc& gloc = pgene->GetLocation();
1325 
1326  if (sequence::Compare(*nloc, gloc, nullptr /* scope */, sequence::fCompareOverlapping) == sequence::eSame) {
1327 
1329  "Peptide under CDS matches small Gene");
1330  }
1331  }
1332  }
1333  } catch (const CException& ) {
1334  }
1335 }
1336 
1337 
1339 {
1341  return;
1342  }
1343 
1345  if (!sd) {
1346  return;
1347  }
1348  const CMolInfo& molinfo = sd->GetMolinfo();
1349 
1350  const CSeq_loc& loc = m_Feat.GetLocation();
1351  bool partial5 = loc.IsPartialStart(eExtreme_Biological);
1352  bool partial3 = loc.IsPartialStop(eExtreme_Biological);
1353 
1354  if (molinfo.CanGetCompleteness()) {
1355  switch (molinfo.GetCompleteness()) {
1357  break;
1358 
1360  if (partial5 || partial3) {
1362  "CDS is partial but protein is complete");
1363  }
1364  break;
1365 
1367  break;
1368 
1370  if (!partial5) {
1372  "CDS is 5' complete but protein is NH2 partial");
1373  }
1374  if (partial3) {
1375  EDiagSev sev = eDiag_Error;
1376  if (x_CDS3primePartialTest())
1377  {
1378  sev = eDiag_Warning;
1379  }
1381  "CDS is 3' partial but protein is NH2 partial");
1382  }
1383  break;
1384 
1386  if (!partial3) {
1388  "CDS is 3' complete but protein is CO2 partial");
1389  }
1390  if (partial5) {
1391  EDiagSev sev = eDiag_Error;
1392  if (x_CDS5primePartialTest())
1393  {
1394  sev = eDiag_Warning;
1395  }
1397  "CDS is 5' partial but protein is CO2 partial");
1398  }
1399  break;
1400 
1402  if (partial5 && partial3) {
1403  } else if (partial5) {
1404  EDiagSev sev = eDiag_Error;
1405  if (x_CDS5primePartialTest())
1406  {
1407  sev = eDiag_Warning;
1408  }
1410  "CDS is 5' partial but protein has neither end");
1411  } else if (partial3) {
1412  EDiagSev sev = eDiag_Error;
1413  if (x_CDS3primePartialTest()) {
1414  sev = eDiag_Warning;
1415  }
1416 
1418  "CDS is 3' partial but protein has neither end");
1419  } else {
1421  "CDS is complete but protein has neither end");
1422  }
1423  break;
1424 
1426  break;
1427 
1429  break;
1430 
1432  break;
1433 
1434  default:
1435  break;
1436  }
1437  }
1438 }
1439 
1440 
// Exception-text phrases recognised by the bypass test that follows: the
// sc_BypassCdsPartialCheck set is built from this table, and a feature whose
// except_text contains any entry (case-insensitive substring match) makes
// that test return true.
// NOTE(review): the entries are in ascending case-sensitive order ("RNA ..."
// sorts before the lowercase phrases); the static array set built from this
// table presumably depends on that ordering -- confirm before adding a phrase.
1441 static const char* const sc_BypassCdsPartialCheckText[] = {
1442  "RNA editing",
1443  "annotated by transcript or proteomic data",
1444  "artificial frameshift",
1445  "mismatches in translation",
1446  "rearrangement required for product",
1447  "reasons given in citation",
1448  "translated product replaced",
1449  "unclassified translation discrepancy"
1450 };
1453 
// Returns true when the feature's exception text contains one of the phrases
// in sc_BypassCdsPartialCheck (case-insensitive substring search), false
// otherwise.
// NOTE(review): the signature and the opening `if` that guards the lookup are
// not visible in this listing; presumably the guard checks that an exception
// text is set before it is read -- confirm against the full source.
1455 {
1457  const string& except_text = m_Feat.GetExcept_text();
1458  ITERATE(TBypassCdsPartialCheckSet, it, sc_BypassCdsPartialCheck) {
1459  if (NStr::FindNoCase(except_text, *it) != NPOS) {
1460  return true; // biological exception
1461  }
1462  }
1463  }
1464  return false;
1465 }
1466 
1467 
// Tests whether the last interval of the feature location reaches the edge of
// the sequence.  Returns true when that interval is on the minus strand and
// starts at position 0, or is on any other strand and ends at the last
// position of m_LocationBioseq (Inst length - 1).  Returns false when the
// location has no intervals, when the edge is not reached, or when the
// non-minus case cannot be evaluated because m_LocationBioseq is unset.
// NOTE(review): the signature is not visible in this listing; the body matches
// the 3' variant of the two partial tests -- confirm against the full source.
1469 {
1470  CSeq_loc_CI last;
     // Walk the whole location so that `last` ends up on the final interval.
1471  for (CSeq_loc_CI sl_iter(m_Feat.GetLocation()); sl_iter; ++sl_iter) {
1472  last = sl_iter;
1473  }
1474 
     // `last` stays invalid when the location had no intervals at all.
1475  if (last) {
1476  if (last.GetStrand() == eNa_strand_minus) {
1477  if (last.GetRange().GetFrom() == 0) {
1478  return true;
1479  }
1480  } else {
     // The sequence length is needed here, so a missing Bioseq means "no".
1481  if (!m_LocationBioseq) {
1482  return false;
1483  }
1484  if (last.GetRange().GetTo() == m_LocationBioseq.GetInst_Length() - 1) {
1485  return true;
1486  }
1487  }
1488  }
1489  return false;
1490 }
1491 
1492 
// Tests whether the first interval of the feature location reaches the edge of
// the sequence.  Returns true when that interval is on the minus strand and
// ends at the last position of m_LocationBioseq (Inst length - 1), or is on
// any other strand and starts at position 0.  Returns false when the location
// has no intervals, when the edge is not reached, or when the minus-strand
// case cannot be evaluated because m_LocationBioseq is unset.
// NOTE(review): the signature is not visible in this listing; the body matches
// the 5' variant of the two partial tests -- confirm against the full source.
1494 {
1495  CSeq_loc_CI first(m_Feat.GetLocation());
1496 
     // The iterator is invalid straight away when the location is empty.
1497  if (first) {
1498  if (first.GetStrand() == eNa_strand_minus) {
     // The sequence length is needed here, so a missing Bioseq means "no".
1499  if (!m_LocationBioseq) {
1500  return false;
1501  }
1502  if (first.GetRange().GetTo() == m_LocationBioseq.GetInst_Length() - 1) {
1503  return true;
1504  }
1505  } else {
1506  if (first.GetRange().GetFrom() == 0) {
1507  return true;
1508  }
1509  }
1510  }
1511  return false;
1512 }
1513 
1514 
1516 {
1517  // don't calculate if no product or if ORF flag is set
1518  if (!m_Feat.IsSetProduct() ||
1520  return false;
1521  }
1522  // don't calculate if feature is pseudo
1523  if (s_IsPseudo(m_Feat) || m_GeneIsPseudo) {
1524  return false;
1525  }
1526  if (!m_ProductBioseq) {
1527  return false;
1528  } else if (m_ProductIsFar) {
1530  return true;
1531  } else {
1532  return false;
1533  }
1534  }
1535 
1536  bool found_match = false;
1537 
1538  CSeq_entry_Handle prod_nps =
1540  if (!prod_nps) {
1541  return true;
1542  }
1543 
1544  for (CSeq_loc_CI loc_i(m_Feat.GetLocation()); loc_i; ++loc_i) {
1545  const CSeq_id& sid = loc_i.GetSeq_id();
1546  if (sid.IsOther() && sid.GetOther().IsSetAccession() && NStr::StartsWith(sid.GetOther().GetAccession(), "NT_")) {
1547  return false;
1548  }
1549  CBioseq_Handle nuc = m_Scope.GetBioseqHandle(loc_i.GetSeq_id());
1550  if (nuc) {
1552  // we don't report this for NT records
1553  return false;
1554  }
1555  CSeq_entry_Handle wgs = nuc.GetExactComplexityLevel(CBioseq_set::eClass_gen_prod_set);
1556  if (wgs) {
1557  // we don't report this for gen-prod-sets
1558  return false;
1559  }
1560 
1561  CSeq_entry_Handle nuc_nps =
1562  nuc.GetExactComplexityLevel(CBioseq_set::eClass_nuc_prot);
1563 
1564  if (prod_nps == nuc_nps) {
1565  found_match = true;
1566  break;
1567  }
1568  }
1569  }
1570  return !found_match;
1571 }
1572 
1573 
1574 void CCdregionValidator::x_AddToIntronList(vector<CCdregionValidator::TShortIntron>& shortlist, TSeqPos last_start, TSeqPos last_stop, TSeqPos this_start, TSeqPos this_stop)
1575 {
1576  if (abs ((int)this_start - (int)last_stop) < 11) {
1577  shortlist.push_back(TShortIntron(last_stop, this_start));
1578  } else if (abs ((int)this_stop - (int)last_start) < 11) {
1579  shortlist.push_back(TShortIntron(last_start, this_stop));
1580  }
1581 }
1582 
1583 
1584 vector<CCdregionValidator::TShortIntron> CCdregionValidator::x_GetShortIntrons(const CSeq_loc& loc, CScope* scope)
1585 {
1586  vector<CCdregionValidator::TShortIntron> shortlist;
1587 
1588  CSeq_loc_CI li(loc);
1589 
1590  TSeqPos last_start = li.GetRange().GetFrom();
1591  TSeqPos last_stop = li.GetRange().GetTo();
1592  CRef<CSeq_id> last_id(new CSeq_id());
1593  last_id->Assign(li.GetSeq_id());
1594 
1595  ++li;
1596  while (li) {
1597  TSeqPos this_start = li.GetRange().GetFrom();
1598  TSeqPos this_stop = li.GetRange().GetTo();
1599  if (abs ((int)this_start - (int)last_stop) < 11 || abs ((int)this_stop - (int)last_start) < 11) {
1600  if (li.GetSeq_id().Equals(*last_id)) {
1601  // definitely same bioseq, definitely report
1602  x_AddToIntronList(shortlist, last_start, last_stop, this_start, this_stop);
1603  } else if (scope) {
1604  // only report if definitely on same bioseq
1605  CBioseq_Handle last_bsh = scope->GetBioseqHandle(*last_id);
1606  if (last_bsh) {
1607  for (auto id_it : last_bsh.GetId()) {
1608  if (id_it.GetSeqId()->Equals(li.GetSeq_id())) {
1609  x_AddToIntronList(shortlist, last_start, last_stop, this_start, this_stop);
1610  break;
1611  }
1612  }
1613  }
1614  }
1615  }
1616  last_start = this_start;
1617  last_stop = this_stop;
1618  last_id->Assign(li.GetSeq_id());
1619  ++li;
1620  }
1621  return shortlist;
1622 }
1623 
1624 
// Renders an interval as "<first+1>-<second+1>": each endpoint is incremented
// by one before being converted to text (presumably turning 0-based sequence
// positions into 1-based positions for display -- confirm).
1626 {
1627  return NStr::NumericToString(interval.first + 1) + "-"
1628  + NStr::NumericToString(interval.second + 1);
1629 }
1630 
1631 
// Builds a message listing the short introns of the feature location.
// Returns without building anything when the feature has its exception flag
// set, when x_GetShortIntrons finds nothing, or when any nonsense introns are
// detected for the feature.
// NOTE(review): the signature and the first line of the call that receives
// the final message are not visible in this listing.
1633 {
1634  if (m_Feat.IsSetExcept()) {
1635  return;
1636  }
1637 
1638  string message;
1639 
1640  vector<TShortIntron> shortlist = x_GetShortIntrons(m_Feat.GetLocation(), &m_Scope);
1641  if (shortlist.size() == 0) {
1642  return;
1643  }
1644 
1645  // only report if no nonsense introns
1646  vector<CRef<CSeq_loc> > nonsense_introns = CCDSTranslationProblems::GetNonsenseIntrons(m_Feat, m_Scope);
1647  if (nonsense_introns.size() > 0) {
1648  return;
1649  }
1650 
     // One interval: "a".  Two intervals: "a and b".
1651  if (shortlist.size() == 1) {
1652  message = x_FormatIntronInterval(shortlist.front());
1653  } else if (shortlist.size() == 2) {
1654  message = x_FormatIntronInterval(shortlist.front())
1655  + " and " +
1656  x_FormatIntronInterval(shortlist.back());
1657  } else {
     // NOTE(review): for three or more intervals the loop stops at index
     // size-3, so the entry at index size-2 is never added to the message;
     // the text also ends up as "a, " followed by " and z", i.e. with two
     // spaces before "and".  Looks unintended -- confirm before changing,
     // since the message text is user-visible output.
1658  for (size_t i = 0; i < shortlist.size() - 2; i++) {
1659  message += x_FormatIntronInterval(shortlist[i]) + ", ";
1660  }
1661  message += " and " + x_FormatIntronInterval(shortlist.back());
1662  }
1664  "Introns at positions " + message + " should be at least 10 nt long");
1665 }
1666 
1667 
1668 // non-pseudo CDS must have product
// Emits "Expected CDS product absent" unless one of the early exits applies:
// the feature has a product, the location is a 5'-partial / 3'-complete span
// of at most 5 positions, or the exception text matches a phrase that
// suppresses the message.
// NOTE(review): the signature, part of the exception-text condition and the
// first line of the reporting call are not visible in this listing.
1670 {
1671  // bail if product exists
1672  if ( m_Feat.IsSetProduct() ) {
1673  return;
1674  }
1675  // bail if location has just stop
1676  if ( m_Feat.IsSetLocation() ) {
1677  const CSeq_loc& loc = m_Feat.GetLocation();
1678  if ( loc.IsPartialStart(eExtreme_Biological) && !loc.IsPartialStop(eExtreme_Biological) ) {
     // Length is measured through the scope; 5 or fewer positions is
     // treated as "just the stop".
1679  if ( GetLength(loc, &m_Scope) <= 5 ) {
1680  return;
1681  }
1682  }
1683  }
1684  // suppress in case of the appropriate exception
1685  if ( m_Feat.IsSetExcept() && m_Feat.IsSetExcept_text() &&
1688  "rearrangement required for product") != NPOS ) {
1689  return;
1690  }
1691  }
1692 
1693  // non-pseudo CDS must have /product
1695  "Expected CDS product absent");
1696 }
1697 
1698 
// Compares the translation of the coding region with the product protein
// sequence and reports one of two messages about the conflict flag.  Does
// nothing when there is no product Bioseq.
// NOTE(review): the signature, the head of the translation call, the
// declaration of prot_vec and the first lines of both reporting calls are not
// visible in this listing.
1700 {
1701  if (!m_ProductBioseq) {
1702  return;
1703  }
1704  // translate the coding region
1705  string transl_prot;
1706  try {
1708  false, // do not include stop codons
1709  false); // do not remove trailing X/B/Z
1710 
     // A runtime_error from the translation is swallowed; transl_prot then
     // stays empty and the "should not be set" branch below is taken.
1711  } catch ( const runtime_error& ) {
1712  }
1713 
1715  prot_vec.SetCoding(CSeq_data::e_Ncbieaa);
1716 
1717  string prot_seq;
1718  prot_vec.GetSeqData(0, prot_vec.size(), prot_seq);
1719 
     // Either sequence being empty is treated the same as an exact match.
1720  if ( transl_prot.empty() || prot_seq.empty() || NStr::Equal(transl_prot, prot_seq) ) {
1722  "Coding region conflict flag should not be set");
1723  } else {
1725  "Coding region conflict flag is set");
1726  }
1727 }
1728 
1729 
// Checks the product Bioseq of the CDS: reports when it cannot be found
// locally (if the validator requires a local product for that id) and when
// the CDS recorded for the product is a different feature from this one.
// Does nothing when the feature has no product.
// NOTE(review): the signature, the declarations of `sfp` and `sfh`, and the
// first lines of both reporting calls are not visible in this listing.
1731 {
1732  if ( !m_Feat.IsSetProduct() ) {
1733  return;
1734  }
1735 
1736  const CCdregion& cdr = m_Feat.GetData().GetCdregion();
     // NOTE(review): this exits whenever CanGetOrf() is true, without
     // reading the Orf value itself -- confirm that is the intent.
1737  if ( cdr.CanGetOrf() ) {
1738  return;
1739  }
1740 
1741  if ( !m_ProductBioseq || m_ProductIsFar ) {
1742  const CSeq_id* sid = nullptr;
1743  try {
1744  sid = &(GetId(m_Feat.GetProduct(), &m_Scope));
     // If the id lookup throws, sid stays null and is passed on as such.
1745  } catch (const CObjmgrUtilException&) {}
1746  if (m_Imp.RequireLocalProduct(sid)) {
1748  "Unable to find product Bioseq from CDS feature");
1749  }
1750  return;
1751  }
1752 
1754  if ( !sfp ) {
1755  return;
1756  }
1757 
     // Pointer comparison: a different feature object claims the product.
1758  if ( &m_Feat != sfp ) {
1759  // if genomic product set, with one cds on contig and one on cdna,
1760  // do not report.
1761  if ( m_Imp.IsGPS() ) {
1762  // feature packaging test will do final contig vs. cdna check
1764  if ( m_LocationBioseq != sfh ) {
1765  return;
1766  }
1767  }
1769  "Same product Bioseq from multiple CDS features");
1770  }
1771 }
1772 
1773 
// Reports "Coding region and protein feature partials conflict" when
// PartialsSame() says the partial ends of the CDS location and of the protein
// feature location disagree.  Does nothing unless both the product Bioseq and
// the location Bioseq are available and a protein feature was found.
// NOTE(review): the signature, one early-exit condition, the lookup that
// produces `prot` and the first line of the reporting call are not visible in
// this listing.
1775 {
1776  if (!m_ProductBioseq || !m_LocationBioseq) {
1777  return;
1778  }
1779 
1781  return;
1782  }
1784  if (!prot) {
1785  return;
1786  }
1787  if (!PartialsSame(m_Feat.GetLocation(), prot->GetLocation())) {
1789  "Coding region and protein feature partials conflict");
1790  }
1791 }
1792 
1793 
// Returns true when position `pos` of the sequence vector `vec` is reported
// as lying in a gap or holds the character 'N'; returns false otherwise.
// The gap test is evaluated first, so vec[pos] is only read for non-gap
// positions.
1795 {
1796  if (vec.IsInGap(pos) || vec[pos] == 'N') {
1797  return true;
1798  } else {
1799  return false;
1800  }
1801 }
1802 
1803 
// Reports a partialness mismatch between this coding region and a parent
// feature: the CDS is partial at an end where the parent location is
// complete.  Each end (5' and 3') is checked separately.  When gap checking
// is enabled, a mismatch is not reported if the position just outside that
// end of the CDS is an N or a gap.  The severity is eDiag_Warning, lowered to
// eDiag_Info when the gene locus equals "orf1ab" (case-insensitive).
//
// parent_loc   location of the parent feature being compared
// parent_name  label placed at the start of the posted message
//
// NOTE(review): the condition that enables check_gaps, the declarations of
// start/stop/pos/seq_vec and the gene lookups are not visible in this listing.
1804 void CCdregionValidator::x_ValidateParentPartialness(const CSeq_loc& parent_loc, const string& parent_name)
1805 {
1806  if (!m_LocationBioseq) {
1807  return;
1808  }
1809 
1810  bool check_gaps = false;
1813  check_gaps = true;
1814  }
1815 
     // NOTE(review): this flag is shared by the 5' and 3' sections and is not
     // reset between them.  If the 5' section sets it and the 3' position
     // fails the length test below, the stale value suppresses the 3'
     // message -- confirm whether that is intended.
1816  bool has_abutting_gap = false;
1817  bool is_minus_strand = m_Feat.GetLocation().IsSetStrand() && m_Feat.GetLocation().GetStrand() == eNa_strand_minus;
1818 
     // 5' end: CDS is partial but the parent is complete.
1819  if (m_Feat.GetLocation().IsPartialStart(eExtreme_Biological) && !parent_loc.IsPartialStart(eExtreme_Biological)) {
1820 
1821  if (check_gaps) {
     // Step one position outward from the biological start.
1824  pos = is_minus_strand ? start + 1 : start - 1;
1825 
     // Only positions inside the sequence are inspected.
1826  if (pos < m_LocationBioseq.GetBioseqLength()) {
1827  has_abutting_gap = x_CheckPosNOrGap(pos, seq_vec);
1828  }
1829  }
1830 
1831  if (!has_abutting_gap) {
1832  EDiagSev sev = eDiag_Warning;
1834  if (gene && gene->GetData().GetGene().IsSetLocus()) {
1835  string locus = gene->GetData().GetGene().GetLocus();
1836  if ( NStr::EqualNocase (locus, "orf1ab") ) {
1837  sev = eDiag_Info;
1838  }
1839  }
1840  PostErr(sev, eErr_SEQ_FEAT_PartialProblemMismatch5Prime, parent_name + " should not be 5' complete if coding region is 5' partial");
1841  }
1842  }
     // 3' end: CDS is partial but the parent is complete.
1843  if (m_Feat.GetLocation().IsPartialStop(eExtreme_Biological) && !parent_loc.IsPartialStop(eExtreme_Biological)) {
1844 
1845  if (check_gaps) {
1846 
     // Step one position outward from the biological stop.
1849  pos = is_minus_strand ? stop - 1 : stop + 1;
1850 
1851  if (pos < m_LocationBioseq.GetBioseqLength()) {
1852  has_abutting_gap = x_CheckPosNOrGap(pos, seq_vec);
1853  }
1854  }
1855 
1856  if (!has_abutting_gap) {
1857  EDiagSev sev = eDiag_Warning;
1859  if (gene && gene->GetData().GetGene().IsSetLocus()) {
1860  string locus = gene->GetData().GetGene().GetLocus();
1861  if ( NStr::EqualNocase (locus, "orf1ab") ) {
1862  sev = eDiag_Info;
1863  }
1864  }
1865  PostErr(sev, eErr_SEQ_FEAT_PartialProblemMismatch3Prime, parent_name + " should not be 3' complete if coding region is 3' partial");
1866  }
1867  }
1868 }
1869 
1870 
// Runs the parent-partialness check against the mRNA of this coding region,
// using "mRNA" as the parent label.  Does nothing when m_Gene is unset, when
// no mRNA was found, or when the number of collected mRNA candidates is not
// exactly one.
// NOTE(review): the signature, the lookup that produces `mrna` and the call
// that fills `contained_mrna` are not visible in this listing.
1872 {
1873  if (!m_Gene) {
1874  return;
1875  }
1877 
1879  if (mrna) {
1880  TFeatScores contained_mrna;
1883  if (contained_mrna.size() == 1) {
1884  // messy for alternate splicing, so only check if there is only one
1885  x_ValidateParentPartialness(mrna->GetLocation(), "mRNA");
1886  }
1887  }
1888 }
1889 
1890 
1891 END_SCOPE(validator)
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
EErrType
@ eErr_SEQ_FEAT_PartialProblemMismatch5Prime
@ eErr_SEQ_FEAT_CDSmRNAmismatch
@ eErr_SEQ_FEAT_CDShasTooManyXs
@ eErr_SEQ_FEAT_TranslExceptPhase
@ eErr_SEQ_FEAT_InvalidCodonStart
@ eErr_SEQ_FEAT_SuspiciousFrame
@ eErr_SEQ_FEAT_UnnecessaryTranslExcept
@ eErr_SEQ_FEAT_PartialsInconsistentCDSProtein
@ eErr_SEQ_FEAT_MissingCDSproduct
@ eErr_SEQ_FEAT_TranslExceptAndRnaEditing
@ eErr_SEQ_FEAT_UnnecessaryException
@ eErr_SEQ_FEAT_PartialProblemMismatch3Prime
@ eErr_SEQ_FEAT_TransLen
@ eErr_SEQ_FEAT_InternalStop
@ eErr_SEQ_FEAT_MisMatchAA
@ eErr_SEQ_FEAT_NoStop
@ eErr_SEQ_FEAT_WrongQualOnFeature
@ eErr_SEQ_FEAT_ProductFetchFailure
@ eErr_SEQ_FEAT_GenCodeMismatch
@ eErr_SEQ_FEAT_PseudoCdsHasProduct
@ eErr_SEQ_FEAT_ProductLength
@ eErr_SEQ_FEAT_CdTransFail
@ eErr_SEQ_FEAT_BadCDScomment
@ eErr_SEQ_FEAT_NoProtein
@ eErr_SEQ_FEAT_MultipleCDSproducts
@ eErr_SEQ_FEAT_EcNumberInCDSComment
@ eErr_SEQ_FEAT_ShortExon
@ eErr_SEQ_FEAT_StartCodon
@ eErr_SEQ_FEAT_OrfCdsHasProduct
@ eErr_SEQ_FEAT_ErroneousException
@ eErr_SEQ_FEAT_GeneOnNucPositionOfPeptide
@ eErr_SEQ_FEAT_DuplicateTranslExcept
@ eErr_SEQ_FEAT_PartialProblemHasStop
@ eErr_SEQ_FEAT_PseudoCdsViaGeneHasProduct
@ eErr_SEQ_FEAT_CodonQualifierUsed
@ eErr_SEQ_FEAT_ShortIntron
@ eErr_SEQ_FEAT_AltStartCodonException
@ eErr_SEQ_FEAT_WrongQualOnCDS
@ eErr_SEQ_FEAT_TranslExceptIsPartial
@ eErr_SEQ_FEAT_ConflictFlagSet
@ eErr_SEQ_FEAT_CDSproductPackagingProblem
@ eErr_SEQ_FEAT_CDSrange
@ eErr_SEQ_FEAT_BadConflictFlag
@ eErr_SEQ_FEAT_CDSmRNArange
@ eErr_SEQ_FEAT_TranslExcept
@ eErr_SEQ_FEAT_PartialProblem
@ eErr_SEQ_FEAT_TerminalXDiscrepancy
@ eErr_SEQ_FEAT_GenCodeInvalid
@ eErr_SEQ_FEAT_PseudoCDSmRNArange
@ eErr_SEQ_FEAT_MissingExceptionFlag
@ eErr_SEQ_FEAT_IntronIsStopCodon
static const char *const sc_BypassCdsPartialCheckText[]
const string s_PlastidTxt[20]
static void s_LocIdType(CBioseq_Handle bsh, bool &is_nt, bool &is_ng, bool &is_nw, bool &is_nc)
#define FOR_EACH_SEQID_ON_BIOSEQ_HANDLE(Itr, Var)
int GetGcodeForInternalStopErrors(const CCdregion &cdr)
string GetStartCodonErrorMessage(const CSeq_feat &feat, const char first_char, size_t internal_stop_count)
string GetInternalStopErrorMessage(const CSeq_feat &feat, size_t internal_stop_count, bool bad_start, char transl_start)
CStaticArraySet< const char *, PCase_CStr > TBypassCdsPartialCheckSet
static bool IsGeneticCodeValid(int gcode)
DEFINE_STATIC_ARRAY_MAP(TBypassCdsPartialCheckSet, sc_BypassCdsPartialCheck, sc_BypassCdsPartialCheckText)
int GetGcodeForName(const string &code_name)
static int s_GetStrictGenCode(const CBioSource &src)
CRef< CSeq_loc > GetLastCodonLoc(const CSeq_feat &cds, CScope &scope)
Definition: cds_fix.cpp:138
bool DoesCodingRegionHaveTerminalCodeBreak(const objects::CCdregion &cdr)
CBioseq_Handle –.
vector< STranslExceptProblem > TTranslExceptProblems
void CalculateTranslationProblems(const CSeq_feat &feat, CBioseq_Handle loc_handle, CBioseq_Handle prot_handle, bool ignore_exceptions, bool far_fetch_cds, bool standalone_annot, bool single_seq, bool is_gpipe, bool is_genomic, bool is_refseq, bool is_nt_or_ng_or_nw, bool is_nc, bool has_accession, CScope *scope)
static vector< CRef< CSeq_loc > > GetNonsenseIntrons(const CSeq_feat &feat, CScope &scope)
vector< STranslationMismatch > TTranslationMismatches
const TTranslationMismatches & GetTranslationMismatches() const
size_t GetTranslationProblemFlags() const
const TTranslExceptProblems & GetTranslExceptProblems() const
void x_ReportTranslationMismatches(const CCDSTranslationProblems::TTranslationMismatches &mismatches)
bool x_CDS5primePartialTest() const
void x_ValidateSeqFeatLoc() override
void Validate() override
bool x_CDS3primePartialTest() const
static string x_FormatIntronInterval(const TShortIntron &interval)
static bool IsPlastid(int genome)
CConstRef< CSeq_feat > m_Gene
bool x_IsProductMisplaced() const
static vector< TShortIntron > x_GetShortIntrons(const CSeq_loc &loc, CScope *scope)
pair< TSeqPos, TSeqPos > TShortIntron
CCdregionValidator(const CSeq_feat &feat, CScope &scope, CValidError_imp &imp)
bool x_BypassCDSPartialTest() const
void x_ReportTranslationProblems(const CCDSTranslationProblems &problems)
bool x_CheckPosNOrGap(TSeqPos pos, const CSeqVector &vec)
string MapToNTCoords(TSeqPos pos)
static void x_AddToIntronList(vector< TShortIntron > &shortlist, TSeqPos last_start, TSeqPos last_stop, TSeqPos this_start, TSeqPos this_stop)
void x_ValidateFeatComment() override
void x_ValidateExceptText(const string &text) override
bool x_ReportOrigProteinId() override
void x_ReportTranslExceptProblems(const CCDSTranslationProblems::TTranslExceptProblems &problems, bool has_exception)
CCdregion –.
Definition: Cdregion.hpp:66
CCode_break –.
Definition: Code_break.hpp:66
CFeat_CI –.
Definition: feat_ci.hpp:64
@Gb_qual.hpp User-defined methods of the data storage class.
Definition: Gb_qual.hpp:61
static const CTrans_table & GetTransTable(int id)
static const CGenetic_code_table & GetCodeTable(void)
CRef< feature::CFeatTree > GetFeatTreeFromCache(const CSeq_loc &loc, CScope &scope)
Definition: gene_cache.cpp:79
CConstRef< CSeq_feat > GetGeneFromCache(const CSeq_feat *feat, CScope &scope)
Definition: gene_cache.cpp:106
int GetId(void) const
CMappedFeat –.
Definition: mapped_feat.hpp:59
Exceptions for objmgr/util library.
CScope –.
Definition: scope.hpp:92
@ eSubtype_transit_peptide_aa
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeq_loc_Mapper –.
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
void PostErr(EDiagSev sv, EErrType et, const string &msg)
static bool s_IsPseudo(const CSeq_feat &feat)
virtual void x_ValidateFeatComment()
void x_ValidateLocusTagGeneralMatch(CConstRef< CSeq_feat > gene)
void x_ReportPseudogeneConflict(CConstRef< CSeq_feat > gene)
static bool x_BioseqHasNmAccession(CBioseq_Handle bsh)
void ValidateSplice(bool gene_pseudo, bool check_all)
virtual void x_ValidateExceptText(const string &text)
static bool s_BioseqHasRefSeqThatStartsWithPrefix(CBioseq_Handle bsh, string prefix)
bool IsGED() const
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:372
bool IsRemoteFetch() const
bool IsStandaloneAnnot() const
bool IsGenomic() const
bool IsFarSequence(const CSeq_id &id)
Definition: validatorp.cpp:239
bool IsHugeFileMode() const
Definition: validatorp.cpp:216
bool IsRefSeq() const
bool IsGPS() const
bool x_IsFarFetchFailure(const CSeq_loc &loc)
bool RequireLocalProduct(const CSeq_id *sid) const
bool IsGI() const
bool IsGpipe() const
bool IsFarFetchCDSproducts() const
bool IgnoreExceptions() const
CGeneCache & GetGeneCache()
bool IsSmallGenomeSet() const
bool IsEmbl() const
bool IsTPE() const
const CSeq_entry & GetTSE() const
bool IsDdbj() const
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
Public API for finding the gene(s) on a given feature using the same criteria as the flatfile generat...
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void SetDiagFilter(EDiagFilter what, const char *filter_str)
Set diagnostic filter.
Definition: ncbidiag.cpp:7670
string GetDiagFilter(EDiagFilter what)
Get current diagnostic filter.
Definition: ncbidiag.cpp:7681
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
@ eDiagFilter_All
for all non-FATAL
Definition: ncbidiag.hpp:2531
@ eDiagFilter_Post
for all non-TRACE, non-FATAL
Definition: ncbidiag.hpp:2530
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
CSeq_id::EAccessionInfo IdentifyAccession(void) const
CSeq_id::E_Choice Which(void) const
@ eAcc_refseq_contig
Definition: Seq_id.hpp:420
@ eAcc_refseq_chromosome
Definition: Seq_id.hpp:429
@ eAcc_refseq_genomic
Definition: Seq_id.hpp:430
@ eAcc_refseq_wgs_intermed
Definition: Seq_id.hpp:431
bool IsPartialStart(ESeqLocExtremes ext) const
check start or stop of location for e_Lim fuzz
Definition: Seq_loc.cpp:3222
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
bool IsSetStrand(EIsSetStrand flag=eIsSetStrand_Any) const
Check if strand is set for any/all part(s) of the seq-loc depending on the flag.
Definition: Seq_loc.cpp:858
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
CMappedFeat GetBestOverlappingFeat(const CMappedFeat &feat, CSeqFeatData::ESubtype need_subtype, sequence::EOverlapType overlap_type, CFeatTree *feat_tree=0, const SAnnotSelector *base_sel=0)
Definition: feature.cpp:3653
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
bool IsOneBioseq(const CSeq_loc &loc, CScope *scope)
Returns true if all embedded CSeq_ids represent the same CBioseq, else false.
ECompare
ESeqLocCheck SeqLocCheck(const CSeq_loc &loc, CScope *scope)
Checks that a CSeq_loc is all on one strand on one CBioseq.
@ fCompareOverlapping
Check if seq-locs are overlapping.
@ eSeqLocCheck_error
@ eOverlap_SubsetRev
1st is a subset of 2nd ranges
@ eOverlap_Contains
2nd contains 1st extremes
@ eOverlap_CheckIntRev
1st is a subset of 2nd with matching boundaries
@ eOverlap_Simple
any overlap of extremes
@ eOverlap_Interval
at least one pair of intervals must overlap
@ eSame
CSeq_locs contain each other.
@ eContained
First CSeq_loc contained by second.
CRef< CSeq_loc > ProductToSource(const CSeq_feat &feat, const CSeq_loc &prod_loc, TP2SFlags flags=0, CScope *scope=0)
Definition: sequence.cpp:841
const CSeq_feat * GetCDSForProduct(const CBioseq &product, CScope *scope)
Get the encoding CDS feature of a given protein sequence.
Definition: sequence.cpp:2549
CRef< CSeq_loc > SourceToProduct(const CSeq_feat &feat, const CSeq_loc &source_loc, TS2PFlags flags=0, CScope *scope=0, int *frame=0)
Definition: sequence.cpp:790
CConstRef< CSeq_feat > GetmRNAforCDS(const CSeq_feat &cds, CScope &scope)
GetmRNAforCDS A function to find a CSeq_feat representing the appropriate mRNA for a given CDS.
Definition: sequence.cpp:1261
vector< TFeatScore > TFeatScores
Definition: sequence.hpp:353
void GetOverlappingFeatures(const CSeq_loc &loc, CSeqFeatData::E_Choice feat_type, CSeqFeatData::ESubtype feat_subtype, EOverlapType overlap_type, TFeatScores &feats, CScope &scope, const TBestFeatOpts opts=0, CGetOverlappingFeaturesPlugin *plugin=NULL)
Find all features overlapping the location.
Definition: sequence.cpp:945
static void Translate(const string &seq, string &prot, const CGenetic_code *code, bool include_stop=true, bool remove_trailing_X=false, bool *alt_start=NULL, bool is_5prime_complete=true, bool is_3prime_complete=true)
Translate a string using a specified genetic code.
Definition: sequence.cpp:4095
@ fS2P_AllowTer
map the termination codon as a legal location
Definition: sequence.hpp:304
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id, const CTSE_Handle &tse)
Get bioseq handle for sequence withing one TSE.
Definition: scope.cpp:253
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeq_feat_Handle GetSeq_featHandle(const CSeq_feat &feat, EMissing action=eMissing_Default)
Definition: scope.cpp:200
@ eProductToLocation
Map from the feature's product to location.
const CSeqFeatData & GetData(void) const
TSeqPos GetBioseqLength(void) const
bool IsSetProduct(void) const
virtual const CSeq_loc & GetLocation(void) const
TInst_Length GetInst_Length(void) const
bool IsSetInst(void) const
CSeq_entry_Handle GetExactComplexityLevel(CBioseq_set::EClass cls) const
Return level with exact complexity, or empty handle if not found.
CSeqFeatData::ESubtype GetFeatSubtype(void) const
const CGene_ref * GetGeneXref(void) const
get gene (if present) from Seq-feat.xref list
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
const TId & GetId(void) const
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
const TInst & GetInst(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
const CSeq_loc & GetLocation(void) const
const CSeq_loc & GetProduct(void) const
CConstRef< CSeq_feat > GetSeq_feat(void) const
Get current seq-feat.
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetCoding(TCoding coding)
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
static const char label[]
TGenome GetGenome(void) const
Get the Genome member data.
Definition: BioSource_.hpp:422
bool IsSetOrg(void) const
Check if a value has been assigned to Org data member.
Definition: BioSource_.hpp:497
bool CanGetGenome(void) const
Check if it is safe to call GetGenome method.
Definition: BioSource_.hpp:403
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
bool IsSetGenome(void) const
Check if a value has been assigned to Genome data member.
Definition: BioSource_.hpp:397
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
bool IsSetDb(void) const
name of database or system Check if a value has been assigned to Db data member.
Definition: Dbtag_.hpp:208
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
bool IsSetPgcode(void) const
plastid genetic code Check if a value has been assigned to Pgcode data member.
Definition: OrgName_.hpp:1040
TMgcode GetMgcode(void) const
Get the Mgcode member data.
Definition: OrgName_.hpp:965
TGcode GetGcode(void) const
Get the Gcode member data.
Definition: OrgName_.hpp:918
bool IsSetMgcode(void) const
Mitochondrial genetic code. Check if a value has been assigned to Mgcode data member.
Definition: OrgName_.hpp:946
bool IsSetOrgname(void) const
Check if a value has been assigned to Orgname data member.
Definition: Org_ref_.hpp:529
bool IsSetGcode(void) const
Genetic code (see CdRegion). Check if a value has been assigned to Gcode data member.
Definition: OrgName_.hpp:899
TPgcode GetPgcode(void) const
Get the Pgcode member data.
Definition: OrgName_.hpp:1059
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
Definition: Org_ref_.hpp:541
bool IsSetEc(void) const
E.C. number(s). Check if a value has been assigned to Ec data member.
Definition: Prot_ref_.hpp:438
bool IsSetOrf(void) const
Just an ORF? Check if a value has been assigned to Orf data member.
Definition: Cdregion_.hpp:462
bool IsSetComment(void) const
Check if a value has been assigned to Comment data member.
Definition: Seq_feat_.hpp:1037
bool IsSetCode(void) const
Genetic code used. Check if a value has been assigned to Code data member.
Definition: Cdregion_.hpp:700
bool IsCdregion(void) const
Check if variant Cdregion is selected.
TOrf GetOrf(void) const
Get the Orf member data.
Definition: Cdregion_.hpp:481
const TLoc & GetLoc(void) const
Get the Loc member data.
list< CRef< CGenetic_code > > Tdata
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
bool IsSetConflict(void) const
Conflict. Check if a value has been assigned to Conflict data member.
Definition: Cdregion_.hpp:559
bool IsGene(void) const
Check if variant Gene is selected.
TFrame GetFrame(void) const
Get the Frame member data.
Definition: Cdregion_.hpp:534
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
bool CanGetData(void) const
Check if it is safe to call GetData method.
Definition: Seq_feat_.hpp:919
bool IsSetExcept(void) const
something funny about this? Check if a value has been assigned to Except data member.
Definition: Seq_feat_.hpp:990
const TExcept_text & GetExcept_text(void) const
Get the Except_text member data.
Definition: Seq_feat_.hpp:1405
bool IsSetExcept_text(void) const
Explain if except=TRUE. Check if a value has been assigned to Except_text data member.
Definition: Seq_feat_.hpp:1393
const TCode & GetCode(void) const
Get the Code member data.
Definition: Cdregion_.hpp:712
list< CRef< C_E > > Tdata
const TCdregion & GetCdregion(void) const
Get the variant data.
bool CanGetExcept_text(void) const
Check if it is safe to call GetExcept_text method.
Definition: Seq_feat_.hpp:1399
const TProduct & GetProduct(void) const
Get the Product member data.
Definition: Seq_feat_.hpp:1096
const Tdata & Get(void) const
Get the member data.
bool CanGetExcept(void) const
Check if it is safe to call GetExcept method.
Definition: Seq_feat_.hpp:996
const TComment & GetComment(void) const
Get the Comment member data.
Definition: Seq_feat_.hpp:1049
bool CanGetOrf(void) const
Check if it is safe to call GetOrf method.
Definition: Cdregion_.hpp:468
const TGene & GetGene(void) const
Get the variant data.
const Tdata & Get(void) const
Get the member data.
const TProt & GetProt(void) const
Get the variant data.
bool CanGetCode(void) const
Check if it is safe to call GetCode method.
Definition: Cdregion_.hpp:706
TExcept GetExcept(void) const
Get the Except member data.
Definition: Seq_feat_.hpp:1009
bool CanGetProduct(void) const
Check if it is safe to call GetProduct method.
Definition: Seq_feat_.hpp:1090
const TQual & GetQual(void) const
Get the Qual member data.
Definition: Gb_qual_.hpp:212
string TExcept_text
Definition: Seq_feat_.hpp:125
bool IsSetProduct(void) const
Product of process. Check if a value has been assigned to Product data member.
Definition: Seq_feat_.hpp:1084
bool CanGetQual(void) const
Check if it is safe to call GetQual method.
Definition: Gb_qual_.hpp:206
TConflict GetConflict(void) const
Get the Conflict member data.
Definition: Cdregion_.hpp:578
bool IsSetCode_break(void) const
Individual exceptions. Check if a value has been assigned to Code_break data member.
Definition: Cdregion_.hpp:721
bool IsSetFrame(void) const
Check if a value has been assigned to Frame data member.
Definition: Cdregion_.hpp:509
bool IsSetLocation(void) const
Feature made from. Check if a value has been assigned to Location data member.
Definition: Seq_feat_.hpp:1105
@ eFrame_not_set
not set, code uses one
Definition: Cdregion_.hpp:95
void SetPoint(TPoint value)
Assign a value to Point data member.
Definition: Seq_point_.hpp:312
void SetId(TId &value)
Assign a value to Id data member.
Definition: Seq_point_.cpp:61
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
bool IsOther(void) const
Check if variant Other is selected.
Definition: Seq_id_.hpp:871
void SetStrand(TStrand value)
Assign a value to Strand data member.
Definition: Seq_point_.hpp:359
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
const TOther & GetOther(void) const
Get the variant data.
Definition: Seq_id_.cpp:347
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
const TAccession & GetAccession(void) const
Get the Accession member data.
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
bool IsSetRepr(void) const
Check if a value has been assigned to Repr data member.
Definition: Seq_inst_.hpp:546
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
TCompleteness GetCompleteness(void) const
Get the Completeness member data.
Definition: MolInfo_.hpp:594
bool CanGetCompleteness(void) const
Check if it is safe to call GetCompleteness method.
Definition: MolInfo_.hpp:575
const TMolinfo & GetMolinfo(void) const
Get the variant data.
Definition: Seqdesc_.cpp:588
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eCompleteness_unknown
Definition: MolInfo_.hpp:155
@ eCompleteness_has_left
5' or NH3 end present
Definition: MolInfo_.hpp:161
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eCompleteness_has_right
3' or COOH end present
Definition: MolInfo_.hpp:162
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_right
missing 3' or COOH end
Definition: MolInfo_.hpp:159
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
int i
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
static MDB_envinfo info
Definition: mdb_load.c:37
const struct ncbi::grid::netcache::search::fields::KEY key
#define abs(a)
Definition: ncbi_heapmgr.c:130
bool HasECnumberPattern(const string &str)
Definition: utilities.cpp:1088
bool PartialsSame(const CSeq_loc &loc1, const CSeq_loc &loc2)
Definition: utilities.cpp:1463
string GetValidatorLocationLabel(const CSeq_loc &loc, CScope &scope)
Definition: utilities.cpp:958
bool IsDeltaOrFarSeg(const CSeq_loc &loc, CScope *scope)
Definition: utilities.cpp:90
size_t CountInternalStopCodons(const string &transl_prot)
Definition: utilities.cpp:2393
bool HasBadStartCodon(const CSeq_feat &feat, CScope &scope, bool ignore_exceptions)
Definition: utilities.cpp:2362
#define FOR_EACH_CODEBREAK_ON_CDREGION(Itr, Var)
FOR_EACH_CODEBREAK_ON_CDREGION / EDIT_EACH_CODEBREAK_ON_CDREGION.
#define FOR_EACH_GBQUAL_ON_FEATURE
#define NCBI_SEQID(Type)
Convenience macros for NCBI objects.
SAnnotSelector –.
else result
Definition: token2.c:20
Modified on Mon May 27 04:38:55 2024 by modify_doxy.py rev. 669887