NCBI C++ ToolKit
validerror_bioseq.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_bioseq.hpp 101299 2023-11-28 18:18:38Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  * Private classes and definitions for the validator
30  * .......
31  *
32  */
33 
34 #ifndef VALIDATOR___VALIDERROR_BIOSEQ__HPP
35 #define VALIDATOR___VALIDERROR_BIOSEQ__HPP
36 
37 #include <corelib/ncbistd.hpp>
39 
40 #include <objmgr/scope.hpp>
41 #include <objmgr/feat_ci.hpp> // for CMappedFeat
43 
51 
52 
55 
56 //class CSeq_entry;
57 //class CCit_sub;
58 //class CCit_art;
59 //class CCit_gen;
60 class CSeq_feat;
61 class CBioseq;
62 class CSeqdesc;
63 class CSeq_annot;
64 class CTrna_ext;
65 class CProt_ref;
66 class CSeq_loc;
67 class CAuth_list;
68 class CTitle;
69 class CMolInfo;
70 class CUser_object;
71 class CSeqdesc_CI;
72 class CBioSource;
73 class COrg_ref;
74 class CGene_ref;
75 class CCdregion;
76 class CRNA_ref;
77 class CImp_feat;
78 class CSeq_literal;
79 class CBioseq_Handle;
80 class CSeq_feat_Handle;
81 
82 BEGIN_SCOPE(validator)
83 
85 class CGeneCache;
86 class CValidError_base;
87 class CValidError_annot;
88 
89 // =============================================================================
90 // Caching classes
91 // =============================================================================
92 
93 // for convenience
95 
96 
97 // =============================================================================
98 // Validation classes
99 // =============================================================================
100 class CValidError_desc;
101 class CValidError_descr;
102 
103 
104 
105 // ============================= Validate Bioseq =============================
106 
107 // internal structures
108 class CCdsMatchInfo;
109 class CMrnaMatchInfo;
110 
112 
114 {
115 public:
117  ~CValidError_bioseq() override;
118 
119  void ValidateBioseq(const CBioseq& seq);
120  void ValidateSeqIds(const CBioseq& seq);
121  void ValidateSeqId(const CSeq_id& id, const CBioseq& ctx, bool longer_general = false);
122  void ValidateInst(const CBioseq& seq);
123  void ValidateBioseqContext(const CBioseq& seq);
124  void ValidateHistory(const CBioseq& seq);
125 
126  bool GetTSANStretchErrors(const CBioseq& seq);
127  bool GetTSAConflictingBiomolTechErrors(const CBioseq& seq);
128 
129  static bool IsSelfReferential(const CBioseq& seq);
130  static bool IsAllNs(const CSeqVector& vec);
131  static int PctNs(CBioseq_Handle bsh);
132 
133  static bool IsMaster(const CBioseq& seq);
134  static bool IsWGSMaster(const CBioseq& seq, CScope& scope);
135  static bool IsWGSMaster(const CSeq_entry& entry);
136  static bool IsWGS(const CBioseq& seq);
137  static bool IsWGS(CBioseq_Handle bsh);
138  static bool IsWGS(const CSeq_entry& entry);
139  static bool IsWGSAccession(const CSeq_id& id);
140  static bool IsWGSAccession(const CBioseq& seq);
141  static bool IsTSAAccession(const CSeq_id& id);
142  static bool IsTSAAccession(const CBioseq& seq);
143  static bool IsWp(CBioseq_Handle bsh);
144  static bool IsEmblOrDdbj(const CBioseq& seq);
145  static bool IsGenbank(const CBioseq& seq);
146  static bool IsRefSeq(const CBioseq& seq);
147  static bool IsPdb(const CBioseq& seq);
148  static bool IsPartial(const CBioseq& seq, CScope& scope);
149 
150  // DBLink user object counters
159 
160 private:
162  typedef vector<CMappedFeat> TMappedFeatVec;
163 
164  void x_SetupCommonFlags (CBioseq_Handle bsh);
165 
166  void ValidateSeqLen(const CBioseq& seq);
167  void ValidateSegRef(const CBioseq& seq);
168  void ValidateDelta(const CBioseq& seq);
169  static bool x_IgnoreEndGap(CBioseq_Handle bsh, CSeq_gap::TType gap_type);
170  void ValidateSeqGap(const CSeq_gap& gap, const CBioseq& seq);
171  void ValidateDeltaLoc(const CSeq_loc& loc, const CBioseq& seq, TSeqPos& len);
172  bool ValidateRepr(const CSeq_inst& inst, const CBioseq& seq);
173  void ValidateSeqParts(const CBioseq& seq);
174  void x_ValidateTitle(const CBioseq& seq);
175  void x_ValidateBarcode(const CBioseq& seq);
176  void ValidateRawConst(const CBioseq& seq);
177  void x_CalculateNsStretchAndTotal(const CSeqVector& seqvec, TSeqPos& num_ns, TSeqPos& max_stretch, bool& n5, bool& n3);
178  void ValidateNsAndGaps(const CBioseq& seq);
179  void GapByGapInst (const CBioseq& seq);
180  void ReportBadAssemblyGap (const CBioseq& seq);
181  static bool HasBadWGSGap(const CBioseq& seq);
182  void ReportBadWGSGap(const CBioseq& seq);
183  void ReportBadTSAGap(const CBioseq& seq);
184  void ReportBadGenomeGap(const CBioseq& seq);
185  void ValidateWGSMaster(CBioseq_Handle bsh);
186 
187  void ValidateMultipleGeneOverlap (const CBioseq_Handle& bsh);
188  void ValidateBadGeneOverlap(const CSeq_feat& feat);
189  void x_ReportGeneOverlapError(const CSeq_feat& feat, const string& gene_label);
190  void x_ReportImproperPartial(const CSeq_feat& feat);
191  void x_ReportInternalPartial(const CSeq_feat& feat);
192  bool x_PartialAdjacentToIntron(const CSeq_loc& loc);
193  void ValidateFeatPartialInContext (const CMappedFeat& feat, bool is_complete);
194  void x_ReportStartStopPartialProblem(int partial_type, bool at_splice_or_gap, bool abuts_n, const CSeq_feat& feat);
195 
196  bool x_IsPartialAtSpliceSiteOrGap (const CSeq_loc& loc, unsigned int tag, bool& bad_seq, bool& is_gap, bool& abuts_n);
197  bool x_MatchesOverlappingFeaturePartial (const CMappedFeat& feat, unsigned int partial_type);
198  bool x_IsSameAsCDS(const CMappedFeat& feat);
199  void ValidateSeqFeatContext(const CBioseq& seq, bool is_complete);
200  static bool x_HasPGAPStructuredComment(CBioseq_Handle bsh);
201  EDiagSev x_DupFeatSeverity (const CSeq_feat& curr, const CSeq_feat& prev, bool viral, bool htgs, bool same_annot, bool same_label);
202  bool x_ReportDupOverlapFeaturePair (const CSeq_feat_Handle & f1, const CSeq_feat_Handle & f2, bool fruit_fly, bool viral, bool htgs);
203  bool x_SuppressDicistronic(const CSeq_feat_Handle & f1, const CSeq_feat_Handle & f2, bool fruit_fly);
204  void x_ReportOverlappingPeptidePair (CSeq_feat_Handle f1, CSeq_feat_Handle f2, const CBioseq& bioseq, bool& reported_last_peptide);
205  void ValidateDupOrOverlapFeats(const CBioseq& seq);
206  void ValidateTwintrons(const CBioseq& seq);
207  void ValidateCollidingGenes(const CBioseq& seq);
208  void ValidateCompleteGenome(const CBioseq& seq);
209  void x_CompareStrings(const TStrFeatMap& str_feat_map, const string& type);
210  void x_ValidateCompletness(const CBioseq& seq, const CMolInfo& mi);
211  void x_ReportSuspiciousUseOfComplete(const CBioseq& seq, EDiagSev sev);
212  void x_ValidateAbuttingUTR(const CBioseq_Handle& seq);
213  bool x_IsRangeGap (const CBioseq_Handle& seq, int start, int stop);
214  void x_ValidateAbuttingRNA(const CBioseq_Handle& seq);
215  void x_ValidateGeneCDSmRNACounts();
216  void x_ValidateCDSmRNAmatch(const CBioseq_Handle& seq);
217  void x_ValidateCDSVDJCmatch(const CBioseq_Handle& seq);
218  void x_ValidateCDSagainstVDJC(const CBioseq_Handle& seq);
219  void x_CheckForMultiplemRNAs(CCdsMatchInfo& cds_match, const TmRNAList& unmatched_mrnas);
220  void x_CheckMrnaProteinLink(const CCdsMatchInfo& cds_match);
221  void x_CheckOrigProteinAndTranscriptIds(const CCdsMatchInfo& cds_match);
222  void x_TranscriptIDsMatch(const string& protein_id, const CSeq_feat& cds);
223  unsigned int x_IdXrefsNotReciprocal (const CSeq_feat &cds, const CSeq_feat &mrna);
224  bool x_IdXrefsAreReciprocal (const CSeq_feat &cds, const CSeq_feat &mrna);
225 
226  void ValidateSeqDescContext(const CBioseq& seq);
227  void CheckForMissingChromosome(CBioseq_Handle bsh);
228  void CheckForMultipleStructuredComments(const CBioseq& seq);
229  void x_CheckForMultipleComments(CBioseq_Handle bsh);
230  void ValidateGBBlock (const CGB_block& gbblock, const CBioseq& seq, const CSeqdesc& desc);
231  void ValidateMolInfoContext(const CMolInfo& minfo, int& seq_biomol, int& tech, int& completeness,
232  const CBioseq& seq, const CSeqdesc& desc);
233  void x_ValidateMolInfoForBioSource(
234  const CBioSource& src,
235  const CMolInfo& minfo,
236  const CSeqdesc& desc
237  );
238  void x_CheckSingleStrandedRNAViruses(
239  const CBioSource& source,
240  const string& lineage,
241  const string& stranded_mol,
242  const CMolInfo::TBiomol biomol,
243  const CBioseq_Handle& bsh,
244  const CSerialObject& obj,
245  const CSeq_entry *ctx
246  );
247 
248  // for conflicts between lineage and molecule type
249  typedef enum { // molecule-type categories; 0 is "unknown", the other enumerators are distinct powers of two (1, 2, 4, 8)
250  eStrandedMoltype_unknown = 0,
251  eStrandedMoltype_ssRNA = 1,
252  eStrandedMoltype_dsRNA = 2,
253  eStrandedMoltype_ssDNA = 4,
254  eStrandedMoltype_dsDNA = 8
255  } EStrandedMoltype; // NOTE(review): power-of-two values suggest these may be combined as a bitmask - confirm against callers
256  static string s_GetStrandedMolStringFromLineage(const string& lineage);
257 
258  void x_ReportLineageConflictWithMol(
259  const string& lineage,
260  const string& stranded_mol,
261  const CMolInfo::TBiomol biomol,
262  CSeq_inst::EMol mol,
263  const CSerialObject& obj,
264  const CSeq_entry *ctx
265  );
266  void ValidateMolTypeContext(const EGIBB_mol& gibb, EGIBB_mol& seq_biomol,
267  const CBioseq& seq, const CSeqdesc& desc);
268  void ValidateUpdateDateContext(const CDate& update,const CDate& create,
269  const CBioseq& seq, const CSeqdesc& desc);
270  void ValidateOrgContext(const COrg_ref& this_org,
271  const COrg_ref& org, const CBioseq& seq, const CSeqdesc& desc);
272  void ReportModifInconsistentError (int new_mod, int& old_mod, const CSeqdesc& desc, const CSeq_entry& ctx);
273  void ValidateModifDescriptors (const CBioseq& seq);
274  void ValidateMoltypeDescriptors (const CBioseq& seq);
275 
276  void ValidateSecondaryAccConflict(const string& primary_acc,
277  const CBioseq& seq, int choice);
278  void ValidateIDSetAgainstDb(const CBioseq& seq);
279  void x_ValidateSourceFeatures(const CBioseq_Handle& bsh);
280  void x_ValidateOverlappingRNAFeatures(const CBioseq_Handle& bsh);
281  void x_ValidatePubFeatures(const CBioseq_Handle& bsh);
282  void x_ReportDuplicatePubLabels (const CBioseq& seq, const vector<CTempString>& labels);
283  void x_ValidateMultiplePubs(
284  const CBioseq_Handle& bsh);
285 
286  void CheckForPubOnBioseq(const CBioseq& seq);
287  void CheckSourceDescriptor(const CBioseq_Handle& bsh);
288  static bool x_ParentAndComponentLocationsDiffer(CBioseq_Handle bsh, CBioSource::TGenome parent_location);
289  static size_t x_BadMetazoanMitochondrialLength(const CBioSource& src, const CSeq_inst& inst);
290  void CheckForMolinfoOnBioseq(const CBioseq& seq);
291  void CheckTpaHistory(const CBioseq& seq);
292 
293  size_t GetDataLen(const CSeq_inst& inst);
294  bool CdError(const CBioseq_Handle& bsh);
295  bool IsMrna(const CBioseq_Handle& bsh);
296  size_t NumOfIntervals(const CSeq_loc& loc);
297  //bool NotPeptideException(const CFeat_CI& curr, const CFeat_CI& prev);
298  //bool IsSameSeqAnnot(const CFeat_CI& fi1, const CFeat_CI& fi2);
299  bool IsIdIn(const CSeq_id& id, const CBioseq& seq);
300  bool SuppressTrailingXMsg(const CBioseq& seq);
301  CRef<CSeq_loc> GetLocFromSeq(const CBioseq& seq);
302  bool IsHistAssemblyMissing(const CBioseq& seq);
303  bool IsFlybaseDbxrefs(const TDbtags& dbxrefs);
304  bool GraphsOnBioseq() const;
305  bool IsSynthetic() const;
306  bool x_IsArtificial(const CBioseq& seq) const;
307  bool x_IsActiveFin() const;
308  bool x_IsMicroRNA() const;
309  bool x_IsDeltaLitOnly(const CSeq_inst& inst) const;
310  bool x_ShowBioProjectWarning(const CBioseq& seq);
311  bool x_HasCitSub(CBioseq_Handle bsh) const;
312  static bool x_HasCitSub(const CPub_equiv& pub);
313  static bool x_HasCitSub(const CPub& pub);
314 
315  void ValidateCDSUTR();
316  bool x_ReportUTRPair(const CSeq_feat& utr5, const CSeq_feat& utr3);
317 
318  size_t x_CountAdjacentNs(const CSeq_literal& lit);
319  void x_CheckGeneralIDs(const CBioseq& seq);
320 
321  static bool x_HasGap(const CBioseq& seq);
322 
323  //internal validators
327 
328  // BioseqHandle for bioseq currently being validated - to cut down on overhead
330 
331  // feature iterator for genes on bioseq - to cut down on overhead
332  // (This class does *not* own this)
334 
335  // feature iterator for all features on bioseq (again, trying to cut down on overhead)
336  // (This class does *not* own this)
338 
346 };
347 
348 
349 
350 END_SCOPE(validator)
353 
354 #endif /* VALIDATOR___VALIDERROR_BIOSEQ__HPP */
@Auth_list.hpp User-defined methods of the data storage class.
Definition: Auth_list.hpp:57
CBioseq_Handle –.
CCdregion –.
Definition: Cdregion.hpp:66
Definition: Date.hpp:53
@Imp_feat.hpp User-defined methods of the data storage class.
Definition: Imp_feat.hpp:54
CMappedFeat –.
Definition: mapped_feat.hpp:59
Definition: Pub.hpp:56
@RNA_ref.hpp User-defined methods of the data storage class.
Definition: RNA_ref.hpp:54
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
Definition: Seq_entry.hpp:56
CSeq_feat_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
Base class for all serializable objects.
Definition: serialbase.hpp:150
Definition: Title.hpp:51
CBioseq_Handle m_CurrentHandle
void ValidateMolTypeContext(const EGIBB_mol &gibb, EGIBB_mol &seq_biomol, const CBioseq &seq, const CSeqdesc &desc)
CValidError_descr m_DescrValidator
CValidError_annot m_AnnotValidator
bool x_IsArtificial(const CBioseq &seq) const
const CCacheImpl::TFeatValue * m_AllFeatIt
vector< CMappedFeat > TMappedFeatVec
CValidError_feat m_FeatValidator
multimap< string, const CSeq_feat *, PNocase > TStrFeatMap
const CCacheImpl::TFeatValue * m_GeneIt
std::vector< CMappedFeat > TFeatValue
Definition: cache_impl.hpp:164
Definition: map.hpp:338
Include a standard set of the NCBI C++ Toolkit most basic headers.
CS_CONTEXT * ctx
Definition: t0006.c:12
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_VALIDATOR_EXPORT
Definition: ncbi_export.h:913
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
EGIBB_mol
type of molecule represented
Definition: GIBB_mol_.hpp:64
int len
const CharType(& source)[N]
Definition: pointer.h:1149
const char * tag
const CSeq_feat::TDbxref TDbtags
Definition: utilities.hpp:199
static bool IsWGSAccession(const string &acc, const CTextseq_id &id, TAllowSeqType allow_seq_type)
Definition: type.c:6
CValidator::CCache CCache
map< const CSeq_feat *, CRef< CMrnaMatchInfo > > TmRNAList
Modified on Thu Feb 29 12:21:08 2024 by modify_doxy.py rev. 669887